Deformable Convolution: Principle, Implementation, and Production Deployment


Introduction

This article uses the object detection network CenterNet as a running example to introduce the principle of deformable convolution, its Python implementation, and how to deploy it in production.

The Principle of Deformable Convolution

Deformable convolution currently exists in two versions: DCN V1 and DCN V2, the latter being an improvement of V1. For the details of the algorithm, see the original papers or the following reference:

https://cloud.tencent.com/developer/article/1679389
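
In short, using the notation of the DCN V2 paper, for each output location $p_0$ the convolution no longer samples the input on the fixed kernel grid; every kernel tap gets a learned offset, and every sample is weighted by a learned modulation scalar:

$$
y(p_0) = \sum_{k=1}^{K} w_k \cdot x\big(p_0 + p_k + \Delta p_k\big) \cdot \Delta m_k
$$

where $p_k$ enumerates the $K$ regular kernel positions (for example the 9 taps of a 3×3 kernel), $\Delta p_k$ are the learned fractional offsets, and $\Delta m_k \in [0,1]$ are the learned modulation masks; DCN V1 is the special case $\Delta m_k \equiv 1$. Both the offsets and the masks are predicted by an extra convolution branch applied to the same input feature map, and sampling at fractional positions is done with bilinear interpolation.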

Implementing Deformable Convolution

This part mainly follows the walkthrough in: https://cloud.tencent.com/developer/article/1638363
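
To make the sampling rule from the previous section concrete, here is a minimal single-channel C++ sketch of one output position of a 3×3 modulated deformable convolution (stride 1, padding 1, dilation 1). It is purely illustrative and is not the CUDA kernel used later; the helper names `bilinear` and `dcn_v2_output_at` are made up for this example.

#include <cmath>
#include <vector>

// Bilinear sampling of a single-channel H x W feature map at a fractional (y, x).
// Positions outside the map contribute zero, the usual DCN border handling.
static float bilinear(const std::vector<float>& feat, int H, int W, float y, float x) {
    if (y <= -1.f || y >= static_cast<float>(H) || x <= -1.f || x >= static_cast<float>(W))
        return 0.f;
    const int y0 = static_cast<int>(std::floor(y));
    const int x0 = static_cast<int>(std::floor(x));
    const float ly = y - y0, lx = x - x0;
    auto at = [&](int yy, int xx) -> float {
        return (yy >= 0 && yy < H && xx >= 0 && xx < W) ? feat[yy * W + xx] : 0.f;
    };
    return at(y0,     x0) * (1 - ly) * (1 - lx) + at(y0,     x0 + 1) * (1 - ly) * lx +
           at(y0 + 1, x0) * ly       * (1 - lx) + at(y0 + 1, x0 + 1) * ly       * lx;
}

// One output value of a 3x3 modulated deformable convolution.
// `offset` holds (dy, dx) per kernel tap and `mask` the modulation scalar per tap --
// exactly the two tensors predicted by the extra conv branch in DCN V2.
float dcn_v2_output_at(const std::vector<float>& feat, int H, int W,
                       const float weight[9], const float offset[18],
                       const float mask[9], int out_y, int out_x) {
    float acc = 0.f;
    int k = 0;
    for (int ky = -1; ky <= 1; ++ky) {
        for (int kx = -1; kx <= 1; ++kx, ++k) {
            const float sy = out_y + ky + offset[2 * k];      // p0 + p_k + delta_p_k (y)
            const float sx = out_x + kx + offset[2 * k + 1];  // p0 + p_k + delta_p_k (x)
            acc += weight[k] * mask[k] * bilinear(feat, H, W, sy, sx);
        }
    }
    return acc;
}

The CUDA implementation in dcn_v2_im2col_cuda.cu performs the same arithmetic, but writes each mask-weighted, bilinearly sampled value into an im2col "columns" buffer so that the convolution itself can be finished with a single GEMM, as the plugin code below does.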

Production Deployment of DCN V2 in CenterNet

The reference implementation is: https://github.com/CaoWGG/TensorRT-CenterNet

This section also draws on the following topics: using TensorRT to accelerate deep learning; speeding up a neural network with TensorRT (loading and running an ONNX model); and writing custom TensorRT plugins.

A few points need to be made clear up front:

To be able to quantize the model later on, CenterNet first has to be exported to ONNX; the officially released onnx-tensorrt library is then used to convert the ONNX model into a TensorRT engine.

TensorRT has no built-in support for the deformable convolution operation, so it has to be implemented as a custom plugin.

The DCN V2 plugin for CenterNet is defined in the onnx-tensorrt folder and consists of the following files:

dcn_v2_im2col_cuda.cu and dcn_v2_im2col_cuda.h, together with DCNv2.cpp and DCNv2.h. DCNv2.cpp and DCNv2.h define the custom TensorRT plugin itself; the code is as follows:

// Constructor: store the convolution hyper-parameters and copy the FP32 weights/bias to host buffers
DCNv2Plugin::DCNv2Plugin(int in_channel, int out_channel, int kernel_H, int kernel_W,
                         int deformable_group, int dilation, int groups, int padding, int stride,
                         nvinfer1::Weights const &weight, nvinfer1::Weights const &bias)
    : _in_channel(in_channel), _out_channel(out_channel), _kernel_H(kernel_H), _kernel_W(kernel_W),
      _deformable_group(deformable_group), _dilation(dilation), _groups(groups),
      _padding(padding), _stride(stride), _initialized(false) {
    if (weight.type == nvinfer1::DataType::kFLOAT) {
        _h_weight.assign((float*)weight.values, (float*)weight.values + weight.count);
    } else {
        throw std::runtime_error("Unsupported weight dtype");
    }
    if (bias.type == nvinfer1::DataType::kFLOAT) {
        _h_bias.assign((float*)bias.values, (float*)bias.values + bias.count);
    } else {
        throw std::runtime_error("Unsupported bias dtype");
    }
}

// Initialization: allocate device memory for the parameters in advance
int DCNv2Plugin::initialize() {
    if (_initialized) return 0;
    auto _output_dims = this->getOutputDimensions(0, &this->getInputDims(0), 3);
    assert(is_CHW(this->getInputDims(0)));
    assert(is_CHW(_output_dims));
    size_t ones_size   = _output_dims.d[1] * _output_dims.d[2] * sizeof(float);
    size_t weight_size = _h_weight.size() * sizeof(float);
    size_t bias_size   = _h_bias.size() * sizeof(float);
    float *ones_cpu = new float[ones_size / sizeof(float)];
    for (int i = 0; i < ones_size / sizeof(float); i++) {
        ones_cpu[i] = 1.0;
    }
    CHECK_CUDA(cudaMalloc((void**)&_d_columns, _in_channel * _kernel_H * _kernel_W * ones_size));
    CHECK_CUDA(cudaMalloc((void**)&_d_ones, ones_size));
    CHECK_CUDA(cudaMalloc((void**)&_d_weight, weight_size));
    CHECK_CUDA(cudaMalloc((void**)&_d_bias, bias_size));
    CHECK_CUDA(cudaMemcpy(_d_ones, ones_cpu, ones_size, cudaMemcpyHostToDevice));
    CHECK_CUDA(cudaMemcpy(_d_weight, _h_weight.data(), weight_size, cudaMemcpyHostToDevice));
    CHECK_CUDA(cudaMemcpy(_d_bias, _h_bias.data(), bias_size, cudaMemcpyHostToDevice));
    delete[] ones_cpu;
    _initialized = true;
    return 0;
}

// Free the device memory allocated above
void DCNv2Plugin::terminate() {
    if (!_initialized) {
        return;
    }
    cudaFree(_d_columns);
    cudaFree(_d_bias);
    cudaFree(_d_weight);
    cudaFree(_d_ones);
    _initialized = false;
}

DCNv2Plugin::~DCNv2Plugin() {
    terminate();
}

// Check that the data type is supported (FP32 only)
bool DCNv2Plugin::supportsFormat(nvinfer1::DataType type, nvinfer1::PluginFormat format) const {
    return (type == nvinfer1::DataType::kFLOAT);
}

// TensorRT handles the batch dimension itself; the remaining output dimensions are defined here
nvinfer1::Dims DCNv2Plugin::getOutputDimensions(int index, const nvinfer1::Dims *inputDims, int nbInputs) {
    assert(index == 0);
    assert(inputDims);
    assert(nbInputs == 3);
    nvinfer1::Dims const& input = inputDims[0];
    assert(is_CHW(input));
    nvinfer1::Dims output;
    output.nbDims = input.nbDims;
    for (int d = 0; d < input.nbDims; ++d) {
        output.d[d] = input.d[d];
    }
    output.d[0] = _out_channel;
    output.d[1] = (input.d[1] + 2 * _padding - (_dilation * (_kernel_H - 1) + 1)) / _stride + 1;
    output.d[2] = (input.d[2] + 2 * _padding - (_dilation * (_kernel_W - 1) + 1)) / _stride + 1;
    return output;
}

// Forward pass: broadcast the bias, run modulated deformable im2col, then one GEMM
int DCNv2Plugin::enqueue(int batchSize, const void *const *inputs, void **outputs,
                         void *workspace, cudaStream_t stream) {
    float alpha, beta;
    int m, n, k;
    cublasHandle_t handle = _cublas_handle;  // cuBLAS handle held by the plugin (member name assumed)
    const float *input  = static_cast<const float *>(inputs[0]);
    const float *offset = static_cast<const float *>(inputs[1]);
    const float *mask   = static_cast<const float *>(inputs[2]);
    float *output = static_cast<float *>(outputs[0]);
    nvinfer1::Dims input_dims = this->getInputDims(0);
    assert(batchSize == 1);
    int h = input_dims.d[1];
    int w = input_dims.d[2];
    int height_out = (h + 2 * _padding - (_dilation * (_kernel_H - 1) + 1)) / _stride + 1;
    int width_out  = (w + 2 * _padding - (_dilation * (_kernel_W - 1) + 1)) / _stride + 1;
    m = _out_channel;
    n = height_out * width_out;
    k = 1;
    alpha = 1.0;
    beta = 0.0;
    /// output  n x m
    /// ones    1 x n, transposed -> n x 1
    /// bias    1 x m
    /// ones x bias = n x m
    // add bias
    cublasSgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N, n, m, k, &alpha,
                _d_ones, k, _d_bias, k, &beta, output, n);
    // im2col (with offset and mask)
    modulated_deformable_im2col_cuda(stream, input, offset, mask,
                                     1, _in_channel, h, w, height_out, width_out,
                                     _kernel_H, _kernel_W, _padding, _padding,
                                     _stride, _stride, _dilation, _dilation,
                                     _deformable_group, _d_columns);
    m = _out_channel;
    n = height_out * width_out;
    k = _in_channel * _kernel_H * _kernel_W;
    alpha = 1.0;
    beta = 1.0;
    // im2col convolution as a GEMM
    cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k, &alpha,
                _d_columns, n, _d_weight, k, &beta, output, n);
    return 0;
}
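
The structure of enqueue is the standard im2col formulation of convolution. Writing $X_{col} \in \mathbb{R}^{(C_{in} k_H k_W) \times (H_{out} W_{out})}$ for the columns buffer filled by modulated_deformable_im2col_cuda (each column holds the mask-weighted, bilinearly sampled values for one output position), $W \in \mathbb{R}^{C_{out} \times (C_{in} k_H k_W)}$ for the weights and $b \in \mathbb{R}^{C_{out}}$ for the bias, the two cublasSgemm calls together compute

$$
Y = W \, X_{col} + b \, \mathbf{1}^{\top}, \qquad Y \in \mathbb{R}^{C_{out} \times (H_{out} W_{out})}
$$

The first GEMM (with beta = 0) writes the broadcast bias term $b\,\mathbf{1}^{\top}$ into the output buffer, and the second (with beta = 1) accumulates $W X_{col}$ on top of it.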

The op implemented by this plugin then has to be registered as an importer in the builtin_op_importers.cpp file of onnx-tensorrt:

DEFINE_BUILTIN_OP_IMPORTER(DCNv2) {
    ASSERT(inputs.at(0).is_tensor(),  ErrorCode::kUNSUPPORTED_NODE);  // input
    ASSERT(inputs.at(1).is_tensor(),  ErrorCode::kUNSUPPORTED_NODE);  // offset
    ASSERT(inputs.at(2).is_tensor(),  ErrorCode::kUNSUPPORTED_NODE);  // mask
    ASSERT(inputs.at(3).is_weights(), ErrorCode::kUNSUPPORTED_NODE);  // weight
    auto kernel_weights = inputs.at(3).weights();
    nvinfer1::Weights bias_weights;
    if (inputs.size() == 5) {
        ASSERT(inputs.at(4).is_weights(), ErrorCode::kUNSUPPORTED_NODE);
        auto shaped_bias_weights = inputs.at(4).weights();
        ASSERT(shaped_bias_weights.shape.nbDims == 1, ErrorCode::kINVALID_NODE);
        ASSERT(shaped_bias_weights.shape.d[0] == kernel_weights.shape.d[0], ErrorCode::kINVALID_NODE);
        bias_weights = shaped_bias_weights;
    } else {
        bias_weights = ShapedWeights::empty(kernel_weights.type);
    }
    int out_channel, in_channel, kernel_H, kernel_W, deformable_group, dilation, groups, padding, stride;
    out_channel = kernel_weights.shape.d[0];
    in_channel  = kernel_weights.shape.d[1];
    kernel_H    = kernel_weights.shape.d[2];
    kernel_W    = kernel_weights.shape.d[3];
    OnnxAttrs attrs(node);
    deformable_group = attrs.get<int>("deformable_group", 1);
    dilation = attrs.get<int>("dilation", 1);
    groups   = attrs.get<int>("groups", 1);
    padding  = attrs.get<int>("padding", 1);
    stride   = attrs.get<int>("stride", 1);
    RETURN_FIRST_OUTPUT(
        ctx->addPlugin(
            new DCNv2Plugin(in_channel, out_channel, kernel_H, kernel_W, deformable_group,
                            dilation, groups, padding, stride, kernel_weights, bias_weights),
            {&inputs.at(0).tensor(), &inputs.at(1).tensor(), &inputs.at(2).tensor()}));
}

It is also registered in builtin_plugins.cpp:

REGISTER_BUILTIN_PLUGIN("DCNv2", DCNv2Plugin);

Then the plugin sources have to be added to the onnx-tensorrt CMakeLists.txt so that they are compiled and linked into the parser library:

# Plugin sources
set(PLUGIN_SOURCES
  FancyActivation.cu
  ResizeNearest.cu
  Split.cu
  dcn_v2_im2col_cuda.cu
  InstanceNormalization.cpp
  DCNv2.cpp
  plugin.cpp
)
# Build the plugin library and link it
list(APPEND CUDA_NVCC_FLAGS "-Xcompiler -fPIC --expt-extended-lambda")
if(${CMAKE_VERSION} VERSION_LESS ${CMAKE_VERSION_THRESHOLD})
  CUDA_INCLUDE_DIRECTORIES(${CUDNN_INCLUDE_DIR} ${TENSORRT_INCLUDE_DIR})
  CUDA_ADD_LIBRARY(nvonnxparser_plugin STATIC ${PLUGIN_SOURCES})
else()
  include_directories(${CUDNN_INCLUDE_DIR} ${TENSORRT_INCLUDE_DIR})
  add_library(nvonnxparser_plugin STATIC ${PLUGIN_SOURCES})
endif()
target_include_directories(nvonnxparser_plugin PUBLIC
  ${CUDA_INCLUDE_DIRS} ${ONNX_INCLUDE_DIRS} ${TENSORRT_INCLUDE_DIR} ${CUDNN_INCLUDE_DIR})
target_link_libraries(nvonnxparser_plugin ${TENSORRT_LIBRARY} cuda cudart cublas)

Finally, link against the libraries built above and use the standard TensorRT C++ API to convert the model and run forward inference.
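
As an illustration, a minimal sketch of that last step might look like the following. It assumes a TensorRT 7-style API and that the patched onnx-tensorrt build above provides the ONNX parser, so that the DCNv2 nodes resolve to the plugin; the file name centernet_dcnv2.onnx and the buffer handling are placeholders, and exact API names differ between TensorRT versions.

#include <iostream>
#include <vector>
#include <NvInfer.h>
#include <NvOnnxParser.h>
#include <cuda_runtime_api.h>

// Minimal logger required by the TensorRT builder and runtime
class Logger : public nvinfer1::ILogger {
    void log(Severity severity, const char* msg) noexcept override {
        if (severity <= Severity::kWARNING) std::cout << msg << std::endl;
    }
} gLogger;

int main() {
    // 1. Create a network definition and parse the ONNX model; the DCNv2 nodes are
    //    resolved by the plugin registered in the patched onnx-tensorrt built above.
    auto builder = nvinfer1::createInferBuilder(gLogger);
    const auto flags = 1U << static_cast<uint32_t>(
        nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);
    auto network = builder->createNetworkV2(flags);
    auto parser  = nvonnxparser::createParser(*network, gLogger);
    if (!parser->parseFromFile("centernet_dcnv2.onnx",
                               static_cast<int>(nvinfer1::ILogger::Severity::kWARNING))) {
        std::cerr << "failed to parse the ONNX model" << std::endl;
        return -1;
    }

    // 2. Build the engine (FP32 here; FP16/INT8 would be configured on `config`).
    auto config = builder->createBuilderConfig();
    config->setMaxWorkspaceSize(1 << 28);  // 256 MB of builder scratch space
    auto engine = builder->buildEngineWithConfig(*network, *config);

    // 3. Run inference: one device buffer per binding, then enqueue the forward pass.
    auto context = engine->createExecutionContext();
    std::vector<void*> buffers(engine->getNbBindings());
    for (int i = 0; i < engine->getNbBindings(); ++i) {
        auto dims = engine->getBindingDimensions(i);
        size_t count = 1;
        for (int d = 0; d < dims.nbDims; ++d) count *= dims.d[d];
        cudaMalloc(&buffers[i], count * sizeof(float));
    }
    // ... copy the preprocessed input image into the input binding here ...
    cudaStream_t stream;
    cudaStreamCreate(&stream);
    context->enqueueV2(buffers.data(), stream, nullptr);
    cudaStreamSynchronize(stream);
    // ... copy the CenterNet heads back to the host and decode the detections ...
    return 0;
}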


