// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#include "../precomp.hpp"
#include "./layers_rvp052.hpp"

#if CV_RVP052

namespace cv {
namespace dnn {
namespace opt_RVP052 {

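// Quantized convolution inner loop: processes two output channels per pass
// over a block of output pixels, accumulating int8 dot products into 32-bit
// lanes with the Andes DSP SMAQA intrinsic (four int8 products per lane).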
void fastConv(const int8_t *weights, size_t wstep, const int *bias,
              const int8_t *rowbuf, int *output, const int *outShape,
              int blockSize, int vecsize, int vecsize_aligned, int outZp,
              const float *multiplier, bool initOutput, bool finalOutput)
{
    int outCn = outShape[1];
    size_t outPlaneSize = outShape[2] * outShape[3];
    for (int i = 0; i < outCn; i += 2)
    {
        const int8_t *wptr0 = weights + i * wstep;
        const int8_t *wptr1 = wptr0 + wstep;
        int *outptr0 = output + i * outPlaneSize;
        int *outptr1 = outptr0 + outPlaneSize;
        int bias0 = bias[i], bias1 = bias[i + 1];
        float mult0 = multiplier[i], mult1 = multiplier[i + 1];

        if (i + 1 >= outCn)
        {
            wptr1 = wptr0;
            outptr1 = outptr0;
            bias1 = bias0;
            mult1 = mult0;
        }
        int j = 0;
        for (; j < blockSize; j++)
        {
            const int8_t *rptr = rowbuf + j * vecsize_aligned;
            int s00 = initOutput ? bias0 : outptr0[j];
            int s10 = initOutput ? bias1 : outptr1[j];

            int32x2_t vsx0 = {s00, s10};

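            // Duplicate the same 4 input bytes into both halves of an 8-byte
            // vector and pair them with the two channels' weights, so a single
            // v_smaqa adds four int8 products into each 32-bit accumulator lane.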
            for (int k = 0; k < vecsize; k += 4)
            {
                int8x4_t vrptr[2] = {*(int8x4_t*)(rptr + k), *(int8x4_t*)(rptr + k)};
                int8x4_t vwptr[2] = {*(int8x4_t*)(wptr0 + k), *(int8x4_t*)(wptr1 + k)};
                vsx0 = __nds__v_smaqa(vsx0, *(int8x8_t*)vwptr, *(int8x8_t*)vrptr);
            }

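            // Requantize the accumulators: apply the per-channel multiplier,
            // add the output zero point, then saturate to the int8 range
            // (sclip32 with bound 7 presumably clips to [-128, 127]).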
            if (finalOutput)
            {
                vsx0[0] = outZp + (int)std::round(vsx0[0] * mult0);
                vsx0[1] = outZp + (int)std::round(vsx0[1] * mult1);
                vsx0 = __nds__v_sclip32(vsx0, 7);
            }

            outptr0[j] = vsx0[0];
            outptr1[j] = vsx0[1];
        }
    }
}

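// Quantized 3x3 depthwise convolution for one output channel. Borders are
// handled by zeroing the out-of-image kernel taps and folding the input
// zero point into the bias, since padded inputs equal the input zero point.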
void fastDepthwiseConv(const int8_t *wptr,
                       int kernel_h, int kernel_w,
                       int stride_h, int stride_w,
                       int dilation_h, int dilation_w,
                       int pad_t, int pad_l,
                       const int *biasptr, const float *multptr,
                       const int8_t *inptr_,
                       int height, int width,
                       int *outptr_,
                       int out_d, int outH, int outW,
                       int inpZp, int outZp)
{
    const int8_t w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2],
                 w10 = wptr[3], w11 = wptr[4], w12 = wptr[5],
                 w20_ = wptr[6], w21_ = wptr[7], w22_ = wptr[8];
    int outW1 = min(outW, (width - dilation_w * (kernel_w - 1) + pad_l) / stride_w);
    int bias = biasptr[out_d], biasCopy;
    float mult = multptr[out_d];

    for (int out_i = 0; out_i < outH; out_i++)
    {
        int in_i = out_i * stride_h - pad_t, out_j = 0;
        const int8_t *imgptr0 = inptr_ + in_i * width;
        const int8_t *imgptr1 = imgptr0 + dilation_h * width;
        const int8_t *imgptr2 = imgptr0 + (dilation_h * 2) * width;
        int8_t w00 = w00_, w01 = w01_, w02 = w02_;
        int8_t w20 = w20_, w21 = w21_, w22 = w22_;
        int out;
        biasCopy = bias;

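        // Row falls into top/bottom padding: drop that kernel row by zeroing
        // its weights, and compensate the accumulator with inpZp times the
        // dropped weights.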
        if (in_i < 0)
        {
            biasCopy += inpZp * (w00 + w01 + w02);
            w00 = w01 = w02 = 0;
            imgptr0 = imgptr1;
        }
        else if (in_i + dilation_h * (kernel_h - 1) >= height)
        {
            biasCopy += inpZp * (w20 + w21 + w22);
            w20 = w21 = w22 = 0;
            imgptr2 = imgptr1;
        }
        int *outptr = outptr_ + out_i * outW;
        if (pad_l > 0)
        {
            out = (int)imgptr0[0] * w01 + (int)imgptr0[dilation_w] * w02 +
                  (int)imgptr1[0] * w11 + (int)imgptr1[dilation_w] * w12 +
                  (int)imgptr2[0] * w21 + (int)imgptr2[dilation_w] * w22 +
                  biasCopy + inpZp * (w00 + w10 + w20);
            outptr[0] = __nds__sclip32(outZp + (int)std::round(out * mult), 7);
            out_j = 1;
        }

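        // Main vectorized span: each 8-byte vector packs one kernel column's
        // taps for two adjacent output pixels, so each v_smaqa advances two
        // outputs at once.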
        int8x8_t vwx0 = (int8x8_t){w00, w10, w20, 0, w00, w10, w20, 0};
        int8x8_t vwx1 = (int8x8_t){w01, w11, w21, 0, w01, w11, w21, 0};
        int8x8_t vwx2 = (int8x8_t){w02, w12, w22, 0, w02, w12, w22, 0};
        int8x8_t vimgx0, vimgx1, vimgx2;
        int32x2_t vout = {0, 0};
        for (; out_j < outW1; out_j += 2)
        {
            int in_j = out_j * stride_w - pad_l;
            vimgx0 = (int8x8_t){imgptr0[in_j], imgptr1[in_j], imgptr2[in_j], 0,
                                imgptr0[in_j + stride_w], imgptr1[in_j + stride_w], imgptr2[in_j + stride_w], 0};
            vimgx1 = (int8x8_t){imgptr0[in_j + dilation_w], imgptr1[in_j + dilation_w], imgptr2[in_j + dilation_w], 0,
                                imgptr0[in_j + dilation_w + stride_w], imgptr1[in_j + dilation_w + stride_w], imgptr2[in_j + dilation_w + stride_w], 0};
            vimgx2 = (int8x8_t){imgptr0[in_j + dilation_w * 2], imgptr1[in_j + dilation_w * 2], imgptr2[in_j + dilation_w * 2], 0,
                                imgptr0[in_j + dilation_w * 2 + stride_w], imgptr1[in_j + dilation_w * 2 + stride_w], imgptr2[in_j + dilation_w * 2 + stride_w], 0};

            vout = (int32x2_t){biasCopy, biasCopy};
            vout = __nds__v_smaqa(vout, vwx0, vimgx0);
            vout = __nds__v_smaqa(vout, vwx1, vimgx1);
            vout = __nds__v_smaqa(vout, vwx2, vimgx2);

            outptr[out_j] = __nds__sclip32(outZp + (int)std::round(vout[0] * mult), 7);
            outptr[out_j + 1] = __nds__sclip32(outZp + (int)std::round(vout[1] * mult), 7);
        }

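        // The loop above steps by 2 and can overshoot outW1; rewind so the
        // scalar tail below recomputes any overstepped columns.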
        while (out_j > outW1) out_j--;

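        // Scalar tail for the right border: taps falling past the image are
        // masked out via s0/s1/s2 and compensated through biasCopy, mirroring
        // the row-padding logic above.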
        for (; out_j < outW; out_j++)
        {
            int in_j0 = out_j * stride_w - pad_l, in_j1 = in_j0 + dilation_w, in_j2 = in_j0 + dilation_w * 2;
            int s0 = 1, s1 = 1, s2 = 1;
            if (in_j0 >= width)
            {
                in_j0 = 0;
                s0 = 0;
                biasCopy += inpZp * (w00 + w10 + w20);
            }
            if (in_j1 >= width)
            {
                in_j1 = 0;
                s1 = 0;
                biasCopy += inpZp * (w01 + w11 + w21);
            }
            if (in_j2 >= width)
            {
                in_j2 = 0;
                s2 = 0;
                biasCopy += inpZp * (w02 + w12 + w22);
            }
            out = (int)imgptr0[in_j0] * w00 * s0 + (int)imgptr0[in_j1] * w01 * s1 + (int)imgptr0[in_j2] * w02 * s2 +
                  (int)imgptr1[in_j0] * w10 * s0 + (int)imgptr1[in_j1] * w11 * s1 + (int)imgptr1[in_j2] * w12 * s2 +
                  (int)imgptr2[in_j0] * w20 * s0 + (int)imgptr2[in_j1] * w21 * s1 + (int)imgptr2[in_j2] * w22 * s2 + biasCopy;
            outptr[out_j] = __nds__sclip32(outZp + (int)std::round(out * mult), 7);
        }
    }
}

// dst = vec * weights^T + bias
void fastGEMM1T(const int8_t* vec, const int8_t* weights,
                size_t wstep, const int* bias, const float* multiplier,
                int* dst, int nvecs, int vecsize, int outZp)
{
    int i = 0;

    for (; i <= nvecs - 2; i += 2)
    {
        const int8_t* wptr0 = weights + i * wstep;
        const int8_t* wptr1 = weights + (i + 1) * wstep;

        int32x2_t vs0 = *(int32x2_t*)(bias + i);

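        // Same two-lane SMAQA scheme as fastConv: duplicate the input bytes
        // and pair them with weights from two consecutive rows.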
        for (int k = 0; k < vecsize; k += 4)
        {
            int8x4_t vvec[2] = {*(int8x4_t*)(vec + k), *(int8x4_t*)(vec + k)};
            int8x4_t vwptr[2] = {*(int8x4_t*)(wptr0 + k), *(int8x4_t*)(wptr1 + k)};
            vs0 = __nds__v_smaqa(vs0, *(int8x8_t*)vwptr, *(int8x8_t*)vvec);
        }

        int32x2_t vdst = {(int)std::round(vs0[0] * multiplier[i]), (int)std::round(vs0[1] * multiplier[i + 1])};

        vdst = __nds__v_sclip32(vdst + outZp, 7);

        *(int32x2_t*)(dst + i) = vdst;
    }

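    // Tail for odd nvecs: process the last row alone, zeroing the unused
    // second lane so the scalar smaqa accumulates only the real products.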
    for (; i < nvecs; i++)
    {
        const int8_t* wptr = weights + i * wstep;
        int s0 = bias[i];

        for (int k = 0; k < vecsize; k += 4)
        {
            int8x4_t vvec[2] = {*(int8x4_t*)(vec + k), 0};
            int8x4_t vwptr[2] = {*(int8x4_t*)(wptr + k), 0};
            s0 = __nds__smaqa(s0, *(unsigned long*)vwptr, *(unsigned long*)vvec);
        }

        dst[i] = __nds__sclip32(outZp + (int)std::round(s0 * multiplier[i]), 7);
    }
}

}}} // namespace cv::dnn::opt_RVP052

#endif