Series Directory
MNN createFromBuffer (1)
MNN createRuntime (2)
MNN createSession: Schedule (3)
MNN createSession: Creating the Pipeline Backends (4)
MNN Session: Shape Computation (5)
MNN Session: Geometry Computation (6)
MNN Session: CPU Operators (7)
MNN Session: Vulkan Operators (8)
Contents
- Series Directory
- 1. createSession
- 1.1 createMultiPathSession
- 1.1.1 Session::resize
- 1.1.1.1 Pipeline::encode
- 1.1.1.1.1 GeometryComputerUtils::shapeComputeAndGeometryTransform
- 1.1.1.1.1.1 CPUBackend::onCreate
- 1.1.1.1.1.1.1 CPUBackend::Creator::onCreate
- 1.1.1.1.1.1.2 Backend
- 1.1.1.1.1.1.3 Execution
- 1.1.1.1.1.1.4 CPU Operator Execution Instance Registration
- Adding a CPU Implementation
- 1.1.1.1.1.2 Backend::onAcquireBuffer
- 1.1.1.1.1.3 Backend::onResizeBegin
- 1.1.1.1.1.4 Execution::onResize
- 1.1.1.1.1.5 Backend::onResizeEnd
- 1.1.1.1.1.6 Execution::onExecute
1. createSession
Creates a session from a ScheduleConfig and a RuntimeInfo.
// source/core/Interpreter.cpp
Session* Interpreter::createSession(const ScheduleConfig& config, const RuntimeInfo& runtime) {
return createMultiPathSession({config}, runtime);
}
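For orientation, here is a minimal caller-side sketch of how user code reaches this point, built on the public MNN API (a sketch only; the model path and config values are illustrative):
// Minimal usage sketch.
#include <MNN/Interpreter.hpp>

int main() {
    std::shared_ptr<MNN::Interpreter> net(MNN::Interpreter::createFromFile("model.mnn"));
    MNN::ScheduleConfig config;
    config.type      = MNN_FORWARD_CPU; // primary backend; CPU also serves as the fallback
    config.numThread = 4;
    // Internally forwards to createMultiPathSession({config}, runtime), shown below.
    MNN::Session* session = net->createSession(config);
    net->runSession(session);
    return 0;
}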
1.1 createMultiPathSession
Full code: createMultiPathSession
// source/core/Interpreter.cpp
Session* Interpreter::createMultiPathSession(const std::vector<ScheduleConfig>& configs, const RuntimeInfo& runtime) {
// ...
auto result = newSession.get();
auto validForResize = info.validForResize;
if (validForResize && mNet->modes.inputMode == Session_Input_Inside && mNet->modes.resizeMode == Session_Resize_Direct) {
result->resize();
}
// ...
return result;
}
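The resize() above runs immediately only under the default modes (Session_Input_Inside and Session_Resize_Direct). A caller can defer it; a sketch, assuming the public setSessionMode/resizeSession API and the net/config objects from the earlier sketch:
net->setSessionMode(MNN::Interpreter::Session_Resize_Defer); // skip resize inside createSession
MNN::Session* session = net->createSession(config);          // the validForResize branch is not taken
net->resizeSession(session);                                 // Session::resize runs here instead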
1.1.1 Session::resize
Full code: Session::resize
// source/core/Session.cpp
ErrorCode Session::resize() {
// ...
if (mNeedResize) {
bool debug = mCallBackMode == Interpreter::Session_Debug;
// mPipelines has type std::vector<std::shared_ptr<Pipeline>>
for (auto& iter : mPipelines) {
auto error = iter->encode(debug, permitCodegen);
if (NO_ERROR != error) {
return error;
}
}
mNeedResize = false;
mNeedMalloc = true;
firstMalloc = true;
}
// ...
}
1.1.1.1 Pipeline::encode
Full code: Pipeline::encode
Related structs: BackendCache, OpCacheInfo
// source/core/Pipeline.cpp
// typedef std::pair<BackendCache, std::vector<OpCacheInfo>> PipelineInfo;
//
// struct BackendCache {
// Backend::Info info;
// BackendConfig config;
// std::pair<std::shared_ptr<Backend>, std::shared_ptr<Backend>> cache;
// bool needComputeShape = true;
// bool needComputeGeometry = true;
// bool reportError = true;
// std::map<Tensor*, TENSORCACHE> inputTensorCopyCache;
// };
//
// /** pipeline info */
// struct OpCacheInfo {
// /** op */
// const Op* op;
// /** input tensors */
// std::vector<Tensor*> inputs;
// /** output tensors */
// std::vector<Tensor*> outputs;
// /** schedule type*/
// Schedule::Type type = Schedule::Type::SEPARATE;
//
// /**Command buffer for cache*/
// CommandBuffer cacheBuffer;
//
// /**Command buffer for execute*/
// CommandBuffer executeBuffer;
//
// std::map<const Op*, std::shared_ptr<Execution>> executionCache;
// };
//
ErrorCode Pipeline::encode(bool supportDebug, bool permitCodegen) {
// mInfo.first.cache has type std::pair<std::shared_ptr<Backend>, std::shared_ptr<Backend>>
// mBackend is the created primary backend, e.g. VulkanBackend
auto& mBackend = mInfo.first.cache.first;
// mBackupBackend is the created fallback (default) backend, e.g. CPUBackend
auto& mBackupBackend = mInfo.first.cache.second;
// Static Model just copy info to command buffer
// mInfo.first has type BackendCache
if (!mInfo.first.needComputeGeometry) {
// ...
} else {
#ifndef MNN_BUILD_MINI
// mContext has type GeometryComputer::Context
mContext.clear();
/** Size Compute and compute Const Begin */
auto res = GeometryComputerUtils::shapeComputeAndGeometryTransform(mInfo.second, mContext, mInfo.first.cache.second, mUseGeometry, false, permitCodegen);
if (res != NO_ERROR) {
return res;
}
#endif
}
// ...
return NO_ERROR;
}
1.1.1.1.1 GeometryComputerUtils::shapeComputeAndGeometryTransform
Full code: GeometryComputerUtils::shapeComputeAndGeometryTransform
Related struct: OpCacheInfo
// source/geometry/GeometryComputerUtils.cpp
// /** pipeline info */
// struct OpCacheInfo {
// /** op */
// const Op* op;
// /** input tensors */
// std::vector<Tensor*> inputs;
// /** output tensors */
// std::vector<Tensor*> outputs;
// /** schedule type*/
// Schedule::Type type = Schedule::Type::SEPARATE;
//
// /**Command buffer for cache*/
// CommandBuffer cacheBuffer;
//
// /**Command buffer for execute*/
// CommandBuffer executeBuffer;
//
// std::map<const Op*, std::shared_ptr<Execution>> executionCache;
// };
//
ErrorCode GeometryComputerUtils::shapeComputeAndGeometryTransform(
std::vector<Schedule::OpCacheInfo>& infos,
GeometryComputer::Context& geoContext,
std::shared_ptr<Backend> backupBackend,
Runtime::CompilerType compileType,
bool skipShapeCompute,
bool permitCodegen) {
/** Size Compute and compute Const Begin */
GeometryComputer::Context ctx(backupBackend);
// Size Compute and compute Const
// infos is the op cache (its size is 171 in the traced run)
for (int i=0; i<infos.size(); ++i) {
// info has type OpCacheInfo
auto& info = infos[i];
auto& cmdBufferVir = info.executeBuffer;
auto& tempBuffer = info.cacheBuffer;
// ...
if (info.type == Schedule::CONSTANT) {
// ...
for (auto& cp : cmdBufferVir.command) {
auto& c = *cp;
if (nullptr == c.execution) {
c.execution.reset(backupBackend->onCreate(c.inputs, c.outputs, c.op));
}
auto exe = c.execution;
if (nullptr == exe.get()) {
MNN_ERROR("Const Folder Error for %s\n", info.op->name()->c_str());
return NO_EXECUTION;
}
for (auto t : c.outputs) {
auto des = TensorUtils::getDescribe(t);
TensorUtils::setLinearLayout(t);
auto res = backupBackend->onAcquireBuffer(t, Backend::STATIC);
if (!res) {
return OUT_OF_MEMORY;
}
des->setBackend(backupBackend.get());
}
backupBackend->onResizeBegin();
auto code = exe->onResize(c.inputs, c.outputs);
if (NO_ERROR != code) {
return NOT_SUPPORT;
}
code = backupBackend->onResizeEnd();
if (NO_ERROR != code) {
return NOT_SUPPORT;
}
code = exe->onExecute(c.inputs, c.outputs);
if (NO_ERROR != code) {
return NOT_SUPPORT;
}
}
// Clear const command
ctx.pushCache(cmdBufferVir);
cmdBufferVir.command.clear();
cmdBufferVir.extras.clear();
}
}
/** Size Compute and compute Const End */
// ...
return NO_ERROR;
}
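Condensing the CONSTANT branch above, each command drives one complete Execution lifecycle; the numbered comments map to the subsections that follow:
// The const-folding lifecycle driven by the loop above (error handling elided).
Execution* exe = backupBackend->onCreate(c.inputs, c.outputs, c.op); // 1.1.1.1.1.1
for (auto t : c.outputs) {
    backupBackend->onAcquireBuffer(t, Backend::STATIC);              // 1.1.1.1.1.2
}
backupBackend->onResizeBegin();                                      // 1.1.1.1.1.3
exe->onResize(c.inputs, c.outputs);                                  // 1.1.1.1.1.4
backupBackend->onResizeEnd();                                        // 1.1.1.1.1.5
exe->onExecute(c.inputs, c.outputs);                                 // 1.1.1.1.1.6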
1.1.1.1.1.1 CPUBackend::onCreate
In GeometryComputerUtils::shapeComputeAndGeometryTransform, CPUBackend::onCreate is called as follows:
for (auto& cp : cmdBufferVir.command) {
auto& c = *cp;
if (nullptr == c.execution) {
c.execution.reset(backupBackend->onCreate(c.inputs, c.outputs, c.op));
}
Since the backupBackend passed in is a CPUBackend (which inherits Backend), the call dispatches to CPUBackend::onCreate, implemented as follows:
// source/backend/cpu/CPUBackend.cpp
/// get execution
Execution* CPUBackend::onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
const MNN::Op* op) {
/**
BatchNorm it will be converted to scale
for model convert, don't print error log
*/
if (op->type() == OpType_BatchNorm) {
return nullptr;
}
auto opType = op->type();
if (outputs.size() > 0) {
if (TensorUtils::getDescribe(outputs[0])->quantAttr != nullptr && TensorUtils::getDescribe(outputs[0])->type == DataType_DT_INT8) {
opType = _getRealOpType(opType);
}
}
// TODO: rm this convert when merge diff datatyoe of op
auto map = gCreator;
auto iter = map->find(opType);
if (iter == map->end()) {
MNN_PRINT("Don't support type [%s], %s\n", MNN::EnumNameOpType(op->type()), op->name()->c_str());
return nullptr;
}
Execution* exe = nullptr;
bool needCast = false;
if (exe == nullptr) {
exe = iter->second->onCreate(inputs, outputs, op, this);
}
return exe;
}
1.1.1.1.1.1.1 CPUBackend::Creator::onCreate
In CPUBackend::onCreate, CPUBackend::Creator::onCreate is called as follows:
auto map = gCreator;
auto iter = map->find(opType);
// ...
Execution* exe = nullptr;
bool needCast = false;
if (exe == nullptr) {
// create the operator execution instance according to opType
exe = iter->second->onCreate(inputs, outputs, op, this);
}
Note: iter->second->onCreate is a polymorphic call; at runtime the concrete subclass is selected by the operator type opType. Its base class is CPUBackend::Creator.
One such implementation is CPURasterFactory:
// source/backend/cpu/CPURaster.cpp
class CPURasterFactory : public CPUBackend::Creator {
public:
virtual Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
const MNN::Op* op, Backend* backend) const {
if (op->type() == OpType_While) {
if (op->main_type() != OpParameter_LoopParam) {
return nullptr;
}
return new CPULoop(backend, op->main_as_LoopParam());
}
return new CPURaster(backend);
}
};
REGISTER_CPU_OP_CREATOR(CPURasterFactory, OpType_Raster);
REGISTER_CPU_OP_CREATOR(CPURasterFactory, OpType_While);
1.1.1.1.1.1.2 Backend
// source/core/Backend.hpp
/** abstract backend */
class Backend : public NonCopyable {
public:
/** info used to create backend */
struct Info {
/** forward type. */
MNNForwardType type = MNN_FORWARD_CPU;
/** numThread for CPU . number of threads. gpuMode for GPU only. tuning/memory Mode setting. */
union {
int numThread = 4;
int gpuMode;
};
/** user data. */
BackendConfig* user = NULL;
enum Mode {
// The Op will be run in execution->onExecute
DIRECT = 0,
// The Op will be recorded. Run in onExecuteBegin and Wait in onExecuteEnd
INDIRECT = 1
};
Mode mode = DIRECT;
enum Allocator {
DEFER = 0,
EAGER = 1
};
Allocator allocator = DEFER;
};
/** backend buffer storage type */
enum StorageType {
/**
use NOT reusable memory.
- allocates memory when `onAcquireBuffer` is called.
- releases memory when `onReleaseBuffer` is called or when the backend is deleted.
- do NOTHING when `onClearBuffer` is called.
*/
STATIC,
/**
use reusable memory.
- allocates or reuses memory when `onAcquireBuffer` is called. prefers reusing.
- collects memory for reuse when `onReleaseBuffer` is called.
- releases memory when `onClearBuffer` is called or when the backend is deleted.
*/
DYNAMIC,
/**
use NOT reusable memory.
- allocates memory when `onAcquireBuffer` is called.
- do NOTHING when `onReleaseBuffer` is called.
- releases memory when `onClearBuffer` is called or when the backend is deleted.
*/
DYNAMIC_SEPERATE
};
public:
/**
* @brief initializer.
* @param type forward type.
*/
Backend(MNNForwardType type) : mType(type) {
// nothing to do
}
/**
* @brief deinitializer.
*/
virtual ~Backend() = default;
public:
/**
* @brief create execution for op with input and output tensors.
* @param inputs input tensors.
* @param outputs output tensors.
* @param op given op.
* @return created execution if op is supported, nullptr otherwise.
*/
virtual Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
const MNN::Op* op) = 0;
/**
* @brief callback before resize ops.
*/
virtual void onResizeBegin() {
// nothing to do
}
/**
* @brief callback after resize ops.
*/
virtual ErrorCode onResizeEnd() = 0;
/**
* @brief callback before executing ops.
*/
virtual void onExecuteBegin() const = 0;
/**
* @brief callback after executing ops.
*/
virtual void onExecuteEnd() const = 0;
virtual const Runtime* getRuntime() {
return nullptr;
}
const std::string externalFile();
public:
/**
* @brief allocate buffer of tensor for given storage type.
* @param tensor buffer provider.
* @param storageType buffer storage type.
* @return success or not.
*/
MNN_PUBLIC bool onAcquireBuffer(const Tensor* tensor, StorageType storageType);
/**
* @brief release buffer of tensor for given storage type.
* @param tensor buffer provider.
* @param storageType buffer storage type.
* @return success or not.
*/
MNN_PUBLIC bool onReleaseBuffer(const Tensor* tensor, StorageType storageType);
class MemObj {
public:
MemObj() {}
virtual ~ MemObj() {}
virtual MemChunk chunk() { return MemChunk(); }
};
/**
* @brief allocate buffer of tensor for given storage type.
* @param tensor buffer provider.
* @param storageType buffer storage type.
* @return MemObj for release, if failed, return nullptr.
*/
virtual MemObj* onAcquire(const Tensor* tensor, StorageType storageType) = 0;
/**
* @brief get buffer from tensor directly
* @param tensor buffer provider.
* @return support or not
*/
virtual bool onGetTensorInfo(const Tensor* tensor, void* dstInfo) {
return false;
}
/**
* @brief clear all dynamic buffers.
* @return success or not.
*/
virtual bool onClearBuffer() = 0;
/**
* @brief copy buffer from tensor to tensor.
* @param srcTensor source buffer provider.
* @param dstTensor dest buffer provider.
*/
virtual void onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor) const = 0;
public:
/**
* @brief get forward type.
* @return forward type.
*/
inline MNNForwardType type() const {
return mType;
}
public:
/**
* @brief get Gpu Tensor map host ptr/ unmap
*/
virtual void* onMapTensor(Tensor::MapType mtype, Tensor::DimensionType dtype, const Tensor* srcTensor) {
return nullptr;
}
virtual bool onUnmapTensor(Tensor::MapType mtype, Tensor::DimensionType dtype, const Tensor* dstTensor, void* mapPtr) {
return false;
}
virtual int onSync(Tensor::MapType mtype, bool toCpu, const Tensor* dstTensor) {
return 0;
}
private:
const MNNForwardType mType;
};
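To make the pure-virtual surface concrete, here is a minimal sketch of a subclass. MyBackend and its trivial bodies are hypothetical, not part of MNN:
// Hypothetical backend showing which hooks a concrete Backend must provide.
class MyBackend : public Backend {
public:
    MyBackend() : Backend(MNN_FORWARD_USER_0) {}
    Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
                        const MNN::Op* op) override {
        return nullptr; // look up a creator for op; nullptr means "op not supported here"
    }
    ErrorCode onResizeEnd() override { return NO_ERROR; } // commit planned allocations
    void onExecuteBegin() const override {}               // e.g. begin command recording
    void onExecuteEnd() const override {}                 // e.g. submit and wait
    MemObj* onAcquire(const Tensor* tensor, StorageType storageType) override {
        return new MemObj; // real backends allocate device memory and wrap it in a MemObj
    }
    bool onClearBuffer() override { return true; }        // drop all DYNAMIC memory
    void onCopyBuffer(const Tensor* src, const Tensor* dst) const override {}
};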
1.1.1.1.1.1.3 Execution
// source/core/Execution.hpp
/** abstract execution */
class Execution : public NonCopyable {
public:
/**
* @brief initializer.
* @param backend backend that exection will running on.
*/
Execution() = delete;
Execution(Backend *backend) : mBackEnd(backend) {
// nothing to do
}
/**
* @brief deinitializer.
*/
virtual ~Execution() = default;
/**
* @brief response shape change of input or output tensors.
* @param inputs input tensors
* @param outputs output tensors
* @return resize result
*/
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) {
return NO_ERROR;
}
/**
* @brief perform execution.
* @param inputs input tensors
* @param outputs output tensors
* @return execution result
*/
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) = 0;
/**
* @brief clone execution, new execution will share weight from this execution
* @param bn the cloned' execution's backend
* @param dst if dst = nullptr, just return whether execution can clone, otherwise clone the execution into dst
* @return execution result
*/
virtual bool onClone(Backend* bn, const Op* op, Execution** dst) {
return false;
}
public:
/**
* @brief designed for plugin system. not ready yet.
*/
class Creator : public NonCopyable {
public:
/**
* @brief deinitializer.
*/
virtual ~Creator() = default;
/**
* @brief create execution for given op on given backend.
* @param backend given backend.
* @param op given op.
* @return execution.
*/
virtual Execution *onCreate(Backend *backend, const Op *op) const = 0;
};
// Search for extra creator, if not found, return nullptr
MNN_PUBLIC static const Creator *searchExtraCreator(const std::string &key, MNNForwardType type);
/**
* @brief register creator for given key and backend type.
* @param creator registering creator.
* @param key given key.
* @param type given backend type.
* @return false if registered creator for same key and type exists, true otherwise.
*/
MNN_PUBLIC static bool insertExtraCreator(std::shared_ptr<Creator> creator, const std::string &key,
MNNForwardType type);
/**
* @brief unregister creator for given key and backend type.
* @param key given key.
* @param type given backend type.
* @return true if registered creator for given key and type exists, false otherwise.
*/
MNN_PUBLIC static bool removeExtraCreator(const std::string &key, MNNForwardType type);
public:
/**
* @brief check if execution is valid.
* @return valid or not.
*/
inline bool valid() const {
return mValid;
}
/**
* @brief get backend.
* @return backend.
*/
Backend *backend() const {
return mBackEnd;
}
protected:
bool mValid = true;
private:
Backend *mBackEnd;
};
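A note on onClone: it lets a new execution share weights with an existing one, e.g. when a session is cloned. A minimal sketch of the two-phase contract documented above, with MyExecution as a hypothetical subclass:
// Hypothetical override of the onClone contract.
bool MyExecution::onClone(Backend* bn, const Op* op, Execution** dst) {
    if (nullptr == dst) {
        return true; // phase 1: caller only asks whether cloning is supported
    }
    // phase 2: build a clone on backend bn that shares this execution's weights
    *dst = new MyExecution(bn /*, weights shared from *this */);
    return true;
}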
1.1.1.1.1.1.4 CPU Operator Execution Instance Registration
CPUBackend::onCreate looks ops up in gCreator, a static map that caches every CPU operator creator instance (CPUBackend::Creator). The map is initialized and populated via registerCPURuntimeCreator, which is called from registerBackend.
// source/core/BackendRegister.cpp
static std::once_flag s_flag;
void registerBackend() {
std::call_once(s_flag, [&]() {
// ...
registerCPURuntimeCreator();
// ...
});
}
registerCPURuntimeCreator() is implemented as follows:
// source/backend/cpu/CPUBackend.cpp
void registerCPURuntimeCreator() {
CPUBackend::initCreatorMap();
registerCPUOps();
#ifdef MNN_SUPPORT_BF16
registerBF16Backend();
#endif
#ifdef MNN_USE_ARMV82
registerArm82RuntimeCreator();
#endif
// TODO: Merge _initCoreFunction MNNFunctionInit and cpuinfo_arm_init
MNNCoreFunctionInit();
MNNInsertExtraRuntimeCreator(MNN_FORWARD_CPU, new CPURuntimeCreator);
};
registerCPUOps registers the Execution creator instances for all CPU operators:
// source/backend/cpu/CPUOPRegister.cpp
void registerCPUOps() {
___CPUCropAndResizeCreator__OpType_CropAndResize__();
___CPUArgMaxCreator__OpType_ArgMax__();
___CPUArgMaxCreator__OpType_ArgMin__();
// ...
}
Functions such as ___CPUArgMaxCreator__OpType_ArgMax__ are defined by the REGISTER_CPU_OP_CREATOR macro:
// source/backend/cpu/CPUBackend.hpp
#define REGISTER_CPU_OP_CREATOR(name, opType) \
void ___##name##__##opType##__() { \
static name _temp;\
CPUBackend::addCreator(opType, &_temp); \
}
// source/backend/cpu/CPUArgMax.cpp
class CPUArgMaxCreator : public CPUBackend::Creator {
public:
virtual Execution *onCreate(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs,
const MNN::Op *op, Backend *backend) const {
auto argMax = op->main_as_ArgMax();
if (op->type() == OpType_ArgMin) {
return new CPUArgMax(backend, CPUArgMax::ArgMinOrMax::ARGMIN,
argMax->topK(), argMax->outMaxVal(), argMax->softmaxThreshold(), argMax->axis());
} else {
return new CPUArgMax(backend, CPUArgMax::ArgMinOrMax::ARGMAX,
argMax->topK(), argMax->outMaxVal(), argMax->softmaxThreshold(), argMax->axis());
}
}
};
REGISTER_CPU_OP_CREATOR(CPUArgMaxCreator, OpType_ArgMax);
REGISTER_CPU_OP_CREATOR(CPUArgMaxCreator, OpType_ArgMin);
REGISTER_CPU_OP_CREATOR(CPUArgMaxCreator, OpType_ArgMax) expands to:
// REGISTER_CPU_OP_CREATOR(CPUArgMaxCreator, OpType_ArgMax)
void ___CPUArgMaxCreator__OpType_ArgMax__() {
static CPUArgMaxCreator _temp;
CPUBackend::addCreator(OpType_ArgMax, &_temp);
}
Registration goes through CPUBackend::addCreator, implemented as follows:
// source/backend/cpu/CPUBackend.cpp
bool CPUBackend::addCreator(OpType t, Creator* c) {
auto map = gCreator;
if (map->find(t) != map->end()) {
MNN_PRINT("Error: %d type has be added\n", t);
return false;
}
map->insert(std::make_pair(t, c));
return true;
}
As the code shows, creators are ultimately registered into gCreator.
In summary, the expanded macro is exactly a function: its name ___CPUArgMaxCreator__OpType_ArgMax__ matches the call in registerCPUOps, and gCreator is the same map that CPUBackend::onCreate later looks up.
Adding a CPU Implementation
Add CPUMyCustomOp.hpp and CPUMyCustomOp.cpp under the source/backend/cpu directory.
- Declare the implementation class
class CPUMyCustomOp : public Execution {
public:
// If onExecute needs scratch memory, acquire it in this function; omit the override if none is needed
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs,
const std::vector<Tensor *> &outputs) override;
// The actual op execution function
virtual ErrorCode onExecute(const std::vector<Tensor *> &inputs,
const std::vector<Tensor *> &outputs) override;
};
- Implement onResize and onExecute. In onResize, call backend()->onAcquireBuffer(&mCache, Backend::DYNAMIC) to request scratch memory, and backend()->onReleaseBuffer(&mCache, Backend::DYNAMIC) to return it; memory released this way can be reused by later ops. In onExecute, perform the necessary input checks to catch problems early, and return NO_ERROR when execution completes correctly. (A sketch of both overrides follows this list.)
- Register the implementation class
class CPUMyCustomOpCreator : public CPUBackend::Creator {
public:
virtual Execution *onCreate(const std::vector<Tensor *> &inputs,
const std::vector<Tensor *> &outputs,
const MNN::Op *op,
Backend *backend) const override {
return new CPUMyCustomOp(backend);
}
};
REGISTER_CPU_OP_CREATOR(CPUMyCustomOpCreator, OpType_MyCustomOp);
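A minimal sketch of the two overrides described above, assuming CPUMyCustomOp performs a float element-wise computation and holds a hypothetical Tensor member mCache (the member, shapes, and computation are illustrative):
// Sketch only: mCache is an assumed Tensor member of CPUMyCustomOp.
ErrorCode CPUMyCustomOp::onResize(const std::vector<Tensor *> &inputs,
                                  const std::vector<Tensor *> &outputs) {
    // Plan a scratch tensor shaped like the input, in reusable DYNAMIC memory.
    mCache.buffer().type = halide_type_of<float>();
    TensorUtils::copyShape(inputs[0], &mCache, true);
    if (!backend()->onAcquireBuffer(&mCache, Backend::DYNAMIC)) {
        return OUT_OF_MEMORY;
    }
    // Release right away: later ops may reuse the memory, but the address
    // stays valid for this op's onExecute until the next memory-planning pass.
    backend()->onReleaseBuffer(&mCache, Backend::DYNAMIC);
    return NO_ERROR;
}

ErrorCode CPUMyCustomOp::onExecute(const std::vector<Tensor *> &inputs,
                                   const std::vector<Tensor *> &outputs) {
    if (inputs.empty() || outputs.empty()) {
        return INPUT_DATA_ERROR; // check inputs to fail early
    }
    // ... compute from inputs[0]->host<float>() into outputs[0]->host<float>() ...
    return NO_ERROR;
}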
1.1.1.1.1.2 Backend::onAcquireBuffer
In GeometryComputerUtils::shapeComputeAndGeometryTransform, Backend::onAcquireBuffer is called as follows:
auto res = backupBackend->onAcquireBuffer(t, Backend::STATIC);
onAcquireBuffer exists only on the Backend base class; it allocates memory for a tensor:
bool Backend::onAcquireBuffer(const Tensor* tensor, StorageType storageType) {
    auto mem = this->onAcquire(tensor, storageType); // backend-specific allocation
    if (nullptr == mem) {
        return false;
    }
    if (mem == TensorUtils::getDescribe(tensor)->mem.get()) {
        return true; // tensor already owns exactly this memory
    }
    // the tensor's describe takes ownership; any previous MemObj is released
    TensorUtils::getDescribe(tensor)->mem.reset(mem);
    return true;
}
onAcquireBuffer calls onAcquire, a virtual function. Since the backupBackend passed in is a CPUBackend (which inherits Backend), the call dispatches to CPUBackend::onAcquire, implemented as follows:
// source/backend/cpu/CPUBackend.cpp
Backend::MemObj* CPUBackend::onAcquire(const MNN::Tensor* nativeTensorConst, StorageType storageType) {
if (nativeTensorConst == nullptr) {
return nullptr;
}
//FUNC_PRINT_ALL(nativeTensorConst, p);
auto nativeTensor = (Tensor*)nativeTensorConst;
auto size = getTensorSize(nativeTensor, true);
return allocBuffer(size, nativeTensor, storageType);
}
1.1.1.1.1.3 Backend::onResizeBegin
In GeometryComputerUtils::shapeComputeAndGeometryTransform, Backend::onResizeBegin is called as follows:
backupBackend->onResizeBegin();
auto code = exe->onResize(c.inputs, c.outputs);
// ...
code = backupBackend->onResizeEnd();
// ...
code = exe->onExecute(c.inputs, c.outputs);
// ...
onResizeBegin is a virtual function. Since the backupBackend passed in is a CPUBackend (which inherits Backend), the call dispatches to CPUBackend::onResizeBegin, implemented as follows:
// source/backend/cpu/CPUBackend.cpp
void CPUBackend::onResizeBegin() {
mDynamicAllocator->reset();
}
1.1.1.1.1.4 Execution::onResize
In GeometryComputerUtils::shapeComputeAndGeometryTransform, Execution::onResize is called as follows:
backupBackend->onResizeBegin();
auto code = exe->onResize(c.inputs, c.outputs);
// ...
code = backupBackend->onResizeEnd();
// ...
code = exe->onExecute(c.inputs, c.outputs);
// ...
onResize is a virtual function; exe is created as described under CPUBackend::onCreate, so exe->onResize is a polymorphic call whose base class is Execution. We pick one implementation, CPULoop, for analysis:
// source/backend/cpu/CPURaster.cpp
class CPULoop : public Execution {
virtual ErrorCode onResize(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) override {
int inputIndexSize = mLoop->inputIndexes()->size();
MNN_ASSERT(inputIndexSize == inputs.size());
for (int i=0; i<inputIndexSize; ++i) {
mStack[mLoop->inputIndexes()->data()[i]] = inputs[i];
}
int outputIndexSize = mLoop->outputIndexes()->size();
MNN_ASSERT(outputIndexSize == outputs.size());
for (int i=0; i<outputIndexSize; ++i) {
mStack[mLoop->outputIndexes()->data()[i]] = outputs[i];
}
int numberThread = mLoop->parallel() ? static_cast<CPUBackend*>(backend())->threadNumber() : 1;
mMaxCacheSize = 0;
auto bytes = static_cast<CPUBackend*>(backend())->functions()->bytes;
mMaxFuseBufferSize = 0;
for (int i=0; i<mLoop->commands()->size(); ++i) {
auto cmd = mLoop->commands()->GetAs<RegionCommand>(i);
auto op = cmd->op();
if (cmd->fuse() >= 0) {
// Make Temp output buffer
auto size = cmd->size()->data();
if (cmd->op()->type() == OpType_MatMul) {
mMaxFuseBufferSize = std::max(mMaxFuseBufferSize, bytes * size[0] * size[2]);
} else {
mMaxFuseBufferSize = std::max(mMaxFuseBufferSize, bytes * size[0] * size[1] * size[2]);
}
}
if (OpType_UnaryOp == op->type()) {
if (nullptr != op->main_as_UnaryOp()) {
auto view0 = cmd->view()->GetAs<View>(0);
auto view1 = cmd->view()->GetAs<View>(1);
MNN_ASSERT(view0->stride()->data()[2] == 1 || cmd->fuse() >= 0);
if (view1->stride()->data()[2] != 1) {
mMaxCacheSize = std::max(mMaxCacheSize, cmd->size()->data()[2] * bytes);
}
}
continue;
}
if (OpType_BinaryOp == op->type()) {
auto view0 = cmd->view()->GetAs<View>(0);
auto view1 = cmd->view()->GetAs<View>(1);
auto view2 = cmd->view()->GetAs<View>(2);
MNN_ASSERT(view0->stride()->data()[2] == 1 || cmd->fuse() >= 0);
if (view1->stride()->data()[2] != 1 || view2->stride()->data()[2] != 1) {
mMaxCacheSize = std::max(mMaxCacheSize, 2 * cmd->size()->data()[2] * bytes);
}
continue;
}
if (OpType_MatMul == op->type()) {
bool transposeC = true;
int e = cmd->size()->data()[0];
int l = cmd->size()->data()[1];
int h = cmd->size()->data()[2];
std::shared_ptr<Tensor> A, B, C, Bias;
C.reset(Tensor::createDevice<float>({e, h}));
if (op->main_as_MatMul()->transposeA()) {
A.reset(Tensor::createDevice<float>({l, e}));
} else {
A.reset(Tensor::createDevice<float>({e, l}));
}
if (op->main_as_MatMul()->transposeB()) {
B.reset(Tensor::createDevice<float>({h, l}));
} else {
B.reset(Tensor::createDevice<float>({l, h}));
}
auto view = cmd->view()->GetAs<View>(0);
if (view->stride()->data()[0] == 1) {
transposeC = false;
}
std::vector<Tensor*> inputs, outputs;
if (cmd->indexes()->size() > 3) {
Bias.reset(Tensor::createDevice<float>({h}));
inputs = {A.get(), B.get(), Bias.get()};
} else {
inputs = {A.get(), B.get()};
}
outputs = {C.get()};
auto bufferPool = static_cast<CPUBackend*>(backend())->getBufferAllocator();
auto code = NO_ERROR;
if (numberThread > 1) {
bufferPool->barrierBegin();
}
for (int v=0; v<numberThread; ++v) {
if (numberThread > 1) {
bufferPool->beginGroup();
}
do {
// If not loop parallel, parallel inside
bool needParallel = numberThread == 1;
mContainer[v].exe[i].reset(new CPUMatMul(backend(), op->main_as_MatMul()->transposeA(), op->main_as_MatMul()->transposeB(), transposeC, needParallel));
if (nullptr == mContainer[v].exe[i]) {
code = OUT_OF_MEMORY;
break;
}
code = mContainer[v].exe[i]->onResize(inputs, outputs);
} while (false);
if (numberThread > 1) {
bufferPool->endGroup();
}
if (NO_ERROR != code) {
break;
}
}
if (numberThread > 1) {
bufferPool->barrierEnd();
}
if (NO_ERROR != code) {
return code;
}
continue;
}
}
auto threadNumber = static_cast<CPUBackend*>(backend())->threadNumber();
if (mMaxCacheSize > 0 || mMaxFuseBufferSize > 0) {
mCacheBuffer = static_cast<CPUBackend*>(backend())->getBufferAllocator()->alloc(threadNumber * (mMaxCacheSize + mMaxFuseBufferSize));
if (mCacheBuffer.invalid()) {
return OUT_OF_MEMORY;
}
mFuseBuffer = mCacheBuffer + threadNumber * mMaxCacheSize;
static_cast<CPUBackend*>(backend())->getBufferAllocator()->free(mCacheBuffer);
}
return NO_ERROR;
}
}
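Note the alloc-then-free pattern at the end of onResize: mCacheBuffer is allocated and then immediately returned to the buffer allocator, so later ops may reuse the memory, while the chunk's address stays valid for this op until the next planning pass. This is the same Backend::DYNAMIC contract described in the custom-op section above.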
1.1.1.1.1.5 Backend::onResizeEnd
In GeometryComputerUtils::shapeComputeAndGeometryTransform, Backend::onResizeEnd is called as follows:
backupBackend->onResizeBegin();
auto code = exe->onResize(c.inputs, c.outputs);
// ...
code = backupBackend->onResizeEnd();
// ...
code = exe->onExecute(c.inputs, c.outputs);
// ...
onResizeEnd is a virtual function. Since the backupBackend passed in is a CPUBackend (which inherits Backend), the call dispatches to CPUBackend::onResizeEnd, implemented as follows:
// source/backend/cpu/CPUBackend.cpp
ErrorCode CPUBackend::onResizeEnd() {
getCache()->release();
return mDynamicAllocator->compute();
}
1.1.1.1.1.6 Execution::onExecute
In GeometryComputerUtils::shapeComputeAndGeometryTransform, Execution::onExecute is called as follows:
backupBackend->onResizeBegin();
auto code = exe->onResize(c.inputs, c.outputs);
// ...
code = backupBackend->onResizeEnd();
// ...
code = exe->onExecute(c.inputs, c.outputs);
// ...
onExecute is a virtual function; exe is created as described under CPUBackend::onCreate, so exe->onExecute is a polymorphic call whose base class is Execution. Again we analyze the CPULoop implementation:
// source/backend/cpu/CPURaster.cpp
class CPULoop : public Execution {
virtual ErrorCode onExecute(const std::vector<Tensor *> &originInputs, const std::vector<Tensor *> &originOutputs) override {
auto cpubackend = static_cast<CPUBackend*>(backend());
auto precision = cpubackend->precisionMode();
auto threadNumber = cpubackend->threadNumber();
if (mLoop->initCommand() != nullptr) {
for (int i=0; i<mLoop->initCommand()->size(); ++i) {
auto cmd = mLoop->initCommand()->GetAs<RegionCommand>(i);
if (cmd->op() == nullptr) {
auto output = mStack[cmd->indexes()->data()[0]];
::memset(output->host<void>(), 0, cpubackend->getTensorSize(output) * cpubackend->functions()->bytes);
} else {
Tensor::InsideDescribe::Region reg;
auto srcView = cmd->view()->GetAs<View>(1);
auto dstView = cmd->view()->GetAs<View>(0);
::memcpy(reg.size, cmd->size()->data(), 3 * sizeof(int32_t));
::memcpy(reg.src.stride, srcView->stride()->data(), 3 * sizeof(int32_t));
::memcpy(reg.dst.stride, dstView->stride()->data(), 3 * sizeof(int32_t));
auto input = mStack[cmd->indexes()->data()[1]];
auto inputSize = input->elementSize();
auto output = mStack[cmd->indexes()->data()[0]];
auto bytes = input->getType().bytes();
if (halide_type_float == input->getType().code) {
bytes = cpubackend->functions()->bytes;
}
_blit(reg, bytes, input->host<uint8_t>(), output->host<uint8_t>());
}
}
}
if (1 == mLoop->commands()->size()) {
auto cmd = mLoop->commands()->GetAs<RegionCommand>(0);
auto op = cmd->op();
if (OpType_UnaryOp == op->type() && nullptr == op->main() && cmd->fuse() < 0) {
// For Gather / Single Unary
auto index0 = cmd->iterIndexes()->data()[0];
auto index1 = cmd->iterIndexes()->data()[1];
int32_t iter = 0;
int32_t* iter0 = &iter;
int32_t* iter1 = &iter;
int32_t iter0Stride = 0;
int32_t iter1Stride = 0;
if (index0 >= 0) {
iter0 = originInputs[index0]->host<int32_t>();
iter0Stride = 1;
}
if (index1 >= 0) {
iter1 = originInputs[index1]->host<int32_t>();
iter1Stride = 1;
}
Tensor::InsideDescribe::Region reg;
auto srcView = cmd->view()->GetAs<View>(1);
auto dstView = cmd->view()->GetAs<View>(0);
::memcpy(reg.size, cmd->size()->data(), 3 * sizeof(int32_t));
::memcpy(reg.src.stride, srcView->stride()->data(), 3 * sizeof(int32_t));
::memcpy(reg.dst.stride, dstView->stride()->data(), 3 * sizeof(int32_t));
auto input = mStack[cmd->indexes()->data()[1]];
auto inputSize = input->elementSize();
auto output = mStack[cmd->indexes()->data()[0]];
auto bytes = input->getType().bytes();
if (halide_type_float == input->getType().code) {
bytes = static_cast<CPUBackend*>(backend())->functions()->bytes;
}
auto step0 = cmd->steps()->data()[0];
auto step1 = cmd->steps()->data()[1];
auto loopNumber = mLoop->loopNumber();
for (; iter<loopNumber; ++iter) {
auto srcIter = *(iter1 + iter1Stride * iter);
auto dstIter = *(iter0 + iter0Stride * iter);
auto srcOffset = srcIter * step1 + srcView->offset();
auto dstOffset = dstIter * step0 + dstView->offset();
if (dstOffset >= 0) {
if (srcOffset >= 0 && srcOffset < inputSize) {
_blit(reg, bytes, input->host<uint8_t>() + bytes * srcOffset, output->host<uint8_t>() + bytes * dstOffset);
} else {
_zero(reg, bytes, output->host<uint8_t>() + bytes * dstOffset);
}
}
}
return NO_ERROR;
}
}
auto bytes = static_cast<CPUBackend*>(backend())->functions()->bytes;
auto func = [&](int iter, int tId) {
int fuseOutputStride[3];
const int32_t* outputStride = nullptr;
auto fuseBuffer = mFuseBuffer + mMaxFuseBufferSize * tId;
for (int index=0; index<mLoop->commands()->size(); ++index) {
auto cmd = mLoop->commands()->GetAs<RegionCommand>(index);
auto blit = _selectUnitProc(bytes, cmd->view()->GetAs<View>(1)->stride()->data()[2], 1);
auto op = cmd->op();
int iterIndexsize = cmd->iterIndexes()->size();
if (cmd->fuse() >= 0) {
outputStride = fuseOutputStride;
auto cmdSize = cmd->size()->data();
fuseOutputStride[0] = cmdSize[1] * cmdSize[2];
fuseOutputStride[1] = cmdSize[2];
fuseOutputStride[2] = 1;
} else {
// Loop Op's command's first index must be output
outputStride = cmd->view()->GetAs<View>(0)->stride()->data();
}
halide_type_t inputType;
for (int v=0; v<iterIndexsize; ++v) {
auto tensorIndex = cmd->indexes()->data()[v];
auto tensor = mStack[tensorIndex];
auto iterIndex = cmd->iterIndexes()->data()[v];
auto offset = iter;
if (1 == v) {
inputType = tensor->getType();
}
if (iterIndex >= 0) {
offset = mStack[iterIndex]->host<int32_t>()[iter];
}
auto view = cmd->view()->GetAs<View>(v);
offset = offset * cmd->steps()->data()[v] + view->offset();
mContainer[tId].stackPtr[tensorIndex] = tensor->host<uint8_t>() + offset * bytes;
MNN_ASSERT(nullptr != tensor->host<uint8_t>());
}
auto dstOrigin = (uint8_t*)mContainer[tId].stackPtr[cmd->indexes()->data()[0]];
auto dst = dstOrigin;
if (cmd->fuse() >= 0) {
dst = fuseBuffer.ptr();
}
do {
if (OpType_UnaryOp == op->type()) {
auto src = (uint8_t*)mContainer[tId].stackPtr[cmd->indexes()->data()[1]];
if (nullptr == op->main()) {
// Copy
Tensor::InsideDescribe::Region reg;
auto srcView = cmd->view()->GetAs<View>(1);
auto dstView = cmd->view()->GetAs<View>(0);
::memcpy(reg.size, cmd->size()->data(), 3 * sizeof(int32_t));
::memcpy(reg.src.stride, srcView->stride()->data(), 3 * sizeof(int32_t));
::memcpy(reg.dst.stride, outputStride, 3 * sizeof(int32_t));
auto step0 = cmd->steps()->data()[0];
auto step1 = cmd->steps()->data()[1];
auto loopNumber = mLoop->loopNumber();
_blit(reg, bytes, (const uint8_t*)src, (uint8_t*)dst);
break;
}
auto proc = static_cast<CPUBackend*>(backend())->functions()->MNNSelectUnaryFunctionForFloat(op->main_as_UnaryOp()->opType(), static_cast<CPUBackend*>(backend())->precisionMode());
auto lastS = cmd->size()->data()[2];
if (lastS == 1 || cmd->view()->GetAs<View>(1)->stride()->data()[2] == 1) {
for (int z=0; z<cmd->size()->data()[0]; ++z) {
auto srcZ = src + z * cmd->view()->GetAs<View>(1)->stride()->data()[0] * bytes;
auto dstZ = dst + z * outputStride[0] * bytes;
for (int y=0; y<cmd->size()->data()[1]; ++y) {
auto srcY = srcZ + y * cmd->view()->GetAs<View>(1)->stride()->data()[1] * bytes;
auto dstY = dstZ + y * outputStride[1] * bytes;
proc(dstY, srcY, lastS);
}
}
} else {
// Blit to cache
auto srcCache = mCacheBuffer.ptr() + mMaxCacheSize * tId;
for (int z=0; z<cmd->size()->data()[0]; ++z) {
auto srcZ = src + z * cmd->view()->GetAs<View>(1)->stride()->data()[0] * bytes;
auto dstZ = dst + z * outputStride[0] * bytes;
for (int y=0; y<cmd->size()->data()[1]; ++y) {
auto srcY = srcZ + y * cmd->view()->GetAs<View>(1)->stride()->data()[1] * bytes;
auto dstY = dstZ + y * outputStride[1] * bytes;
blit(srcCache, srcY, lastS, cmd->view()->GetAs<View>(1)->stride()->data()[2], 1);
proc(dstY, srcCache, lastS);
}
}
}
continue;
}
if (OpType_MatMul == op->type()) {
// TODO: Don't support fuse for matmul currently
const float* APtr = nullptr;
const float* BPtr = nullptr;
const float* BiasPtr = nullptr;
float* CPtr = (float*)dst;
auto exe = static_cast<CPUMatMul*>(mContainer[tId].exe[index].get());
APtr = (const float*)mContainer[tId].stackPtr[cmd->indexes()->data()[1]];
BPtr = (const float*)mContainer[tId].stackPtr[cmd->indexes()->data()[2]];
if (iterIndexsize > 3) {
BiasPtr = (const float*)mContainer[tId].stackPtr[cmd->indexes()->data()[3]];
}
exe->execute(APtr, BPtr, CPtr, BiasPtr);
break;
}
if (OpType_BinaryOp == op->type()) {
auto src0 = mContainer[tId].stackPtr[cmd->indexes()->data()[1]];
MNNBinaryExecute proc;
if (inputType.code == halide_type_float) {
proc = static_cast<CPUBackend*>(backend())->functions()->MNNSelectBinaryFunctionForFloat(op->main_as_BinaryOp()->opType());
} else {
MNN_ASSERT(inputType.code == halide_type_int);
proc = CPUBinary::selectForInt(op->main_as_BinaryOp()->opType());
}
auto lastS = cmd->size()->data()[2];
auto stride0 = outputStride;
auto stride1 = cmd->view()->GetAs<View>(1)->stride()->data();
MNN_ASSERT(stride0[2] == 1);
auto src1 = mContainer[tId].stackPtr[cmd->indexes()->data()[2]];
auto stride2 = cmd->view()->GetAs<View>(2)->stride()->data();
auto blit1 = _selectUnitProc(bytes, stride1[2], 1);
auto blit2 = _selectUnitProc(bytes, stride2[2], 1);
if (cmd->size()->data()[2] == 1 || (stride1[2] == 1 && stride2[2] == 1)) {
for (int z=0; z<cmd->size()->data()[0]; ++z) {
auto src0Z = src0 + z * stride1[0] * bytes;
auto src1Z = src1 + z * stride2[0] * bytes;
auto dstZ = dst + z * stride0[0] * bytes;
for (int y=0; y<cmd->size()->data()[1]; ++y) {
auto src0Y = src0Z + y * stride1[1] * bytes;
auto src1Y = src1Z + y * stride2[1] * bytes;
auto dstY = dstZ + y * stride0[1] * bytes;
proc(dstY, src0Y, src1Y, cmd->size()->data()[2], -1);
}
}
} else {
auto cache0 = mCacheBuffer.ptr() + mMaxCacheSize * tId;
auto cache1 = cache0 + cmd->size()->data()[2] * bytes;
for (int z=0; z<cmd->size()->data()[0]; ++z) {
auto src0Z = src0 + z * stride1[0] * bytes;
auto src1Z = src1 + z * stride2[0] * bytes;
auto dstZ = dst + z * stride0[0] * bytes;
for (int y=0; y<cmd->size()->data()[1]; ++y) {
auto src0Y = src0Z + y * stride1[1] * bytes;
auto src1Y = src1Z + y * stride2[1] * bytes;
auto dstY = dstZ + y * stride0[1] * bytes;
blit1(cache0, src0Y, cmd->size()->data()[2], stride1[2], 1);
blit2(cache1, src1Y, cmd->size()->data()[2], stride2[2], 1);
proc(dstY, cache0, cache1, cmd->size()->data()[2], -1);
}
}
}
break;
}
} while(false);
if (dst != dstOrigin) {
MNN_ASSERT(bytes == 4);
// Currently only support add and float32
auto dstStride = cmd->view()->GetAs<View>(0)->stride()->data();
auto srcF = (const float*)dst;
auto dstF = (float*)dstOrigin;
int sizeZ = cmd->size()->data()[0];
int sizeY = cmd->size()->data()[1];
int sizeX = cmd->size()->data()[2];
if (cmd->op()->type() == OpType_MatMul) {
auto proc = static_cast<CPUBackend*>(backend())->functions()->MNNSelectBinaryFunctionForFloat(cmd->fuse());
proc(dstF, dstF, srcF, sizeZ * sizeX, -1);
continue;
}
switch (cmd->fuse()) {
case BinaryOpOperation_ADD:
for (int z=0; z<sizeZ; ++z) {
auto srcZ = srcF + z * outputStride[0];
auto dstZ = dstF + z * dstStride[0];
for (int y=0; y<sizeY; ++y) {
auto srcY = srcZ + y * outputStride[1];
auto dstY = dstZ + y * dstStride[1];
for (int x=0; x<sizeX; ++x) {
auto dstOffset = x * dstStride[2];
dstY[dstOffset] = dstY[dstOffset] + srcY[x];
}
}
}
break;
case BinaryOpOperation_MUL:
for (int z=0; z<sizeZ; ++z) {
auto srcZ = srcF + z * dstStride[0];
auto dstZ = dstF + z * outputStride[0];
for (int y=0; y<sizeY; ++y) {
auto srcY = srcZ + y * dstStride[1];
auto dstY = dstZ + y * outputStride[1];
for (int x=0; x<sizeX; ++x) {
auto dstOffset = x * dstStride[2];
dstY[dstOffset] = dstY[dstOffset] * srcY[x];
}
}
}
break;
case BinaryOpOperation_SUB:
for (int z=0; z<sizeZ; ++z) {
auto srcZ = srcF + z * dstStride[0];
auto dstZ = dstF + z * outputStride[0];
for (int y=0; y<sizeY; ++y) {
auto srcY = srcZ + y * dstStride[1];
auto dstY = dstZ + y * outputStride[1];
for (int x=0; x<sizeX; ++x) {
auto dstOffset = x * dstStride[2];
auto D = dstY[dstOffset];
auto S = srcY[x];
dstY[dstOffset] = D - S;
}
}
}
break;
default:
break;
}
}
}
};
if (mLoop->parallel()) {
MNN_CONCURRENCY_BEGIN(tId, threadNumber) {
for (int iter=tId; iter < mLoop->loopNumber(); iter+=threadNumber) {
func(iter, tId);
}
}
MNN_CONCURRENCY_END();
} else {
for (int iter=0; iter < mLoop->loopNumber(); ++iter) {
func(iter, 0);
}
}
return NO_ERROR;
}
}