Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 99c86bb

Browse files
authored
Merge pull request #24556 from plctlab:rvp
Optimization based on RISC-V P Packed SIMD Extension v0.5.2
2 parents 68dc02e + a30c987 commit 99c86bb

File tree

6 files changed

+287
-0
lines changed

6 files changed

+287
-0
lines changed

‎modules/dnn/src/int8layers/convolution_layer.cpp‎

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -969,6 +969,13 @@ class ConvolutionLayerInt8Impl CV_FINAL : public BaseConvolutionLayerInt8Impl
969969
stride_h, stride_w, dilation_h, dilation_w,pad_t, pad_l,
970970
biasptr, multptr, inptr_, height, width, outptr_, out_d, outH, outW, inpZp, outZp);
971971
else
972+
#endif
973+
#if CV_RVP052
974+
if(isConv2D)
975+
opt_RVP052::fastDepthwiseConv(wptr, kernel_h, kernel_w,
976+
stride_h, stride_w, dilation_h, dilation_w,pad_t, pad_l,
977+
biasptr, multptr, inptr_, height, width, outptr_, out_d, outH, outW, inpZp, outZp);
978+
else
972979
#endif
973980
{
974981
constint8_t w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2],
@@ -1348,6 +1355,12 @@ class ConvolutionLayerInt8Impl CV_FINAL : public BaseConvolutionLayerInt8Impl
13481355
opt_LASX::fastConv(wptr, wstep, biasptr, rowbuf0, data_out0 + ofs0,
13491356
outShape, bsz, vsz, vsz_a, outZp, multptr, cn0 ==0, cn1 == inpCn);
13501357
else
1358+
#endif
1359+
#if CV_RVP052
1360+
if(isConv2D)
1361+
opt_RVP052::fastConv(wptr, wstep, biasptr, rowbuf0, data_out0 + ofs0,
1362+
outShape, bsz, vsz, vsz_a, outZp, multptr, cn0 ==0, cn1 == inpCn);
1363+
else
13511364
#endif
13521365
for(int i =0; i < outCn; i +=2 )
13531366
{

‎modules/dnn/src/int8layers/fully_connected_layer.cpp‎

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -302,6 +302,11 @@ class FullyConnectedLayerInt8Impl CV_FINAL : public InnerProductLayerInt8
302302
if( useLASX )
303303
opt_LASX::fastGEMM1T( sptr, wptr, wstep, biasptr, multptr, dptr, nw, vecsize, outZp );
304304
else
305+
#endif
306+
#if CV_RVP052
307+
if(1 )
308+
opt_RVP052::fastGEMM1T( sptr, wptr, wstep, biasptr, multptr, dptr, nw, vecsize, outZp );
309+
else
305310
#endif
306311
{
307312
int i =0;

‎modules/dnn/src/int8layers/layers_common.hpp‎

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@
1313
#include"int8layers/layers_common.simd_declarations.hpp"
1414
#undef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
1515

16+
#include"./layers_rvp052.hpp"
17+
1618
#ifdef HAVE_OPENCL
1719
#include"../ocl4dnn/include/ocl4dnn.hpp"
1820
#endif
Lines changed: 221 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,221 @@
1+
// This file is part of OpenCV project.
2+
// It is subject to the license terms in the LICENSE file found in the top-level directory
3+
// of this distribution and at http://opencv.org/license.html.
4+
5+
#include"../precomp.hpp"
6+
#include"./layers_rvp052.hpp"
7+
8+
#if CV_RVP052
9+
10+
namespacecv {
11+
namespacednn {
12+
namespaceopt_RVP052 {
13+
14+
voidfastConv(constint8_t *weights,size_t wstep,constint *bias,
15+
constint8_t *rowbuf,int *output,constint *outShape,
16+
int blockSize,int vecsize,int vecsize_aligned,int outZp,
17+
constfloat *multiplier,bool initOutput,bool finalOutput)
18+
{
19+
int outCn = outShape[1];
20+
size_t outPlaneSize = outShape[2] * outShape[3];
21+
for (int i =0; i < outCn; i +=2)
22+
{
23+
constint8_t *wptr0 = weights + i * wstep;
24+
constint8_t *wptr1 = wptr0 + wstep;
25+
int *outptr0 = output + i * outPlaneSize;
26+
int *outptr1 = outptr0 + outPlaneSize;
27+
int bias0 = bias[i], bias1 = bias[i +1];
28+
float mult0 = multiplier[i], mult1 = multiplier[i +1];
29+
30+
if (i +1 >= outCn)
31+
{
32+
wptr1 = wptr0;
33+
outptr1 = outptr0;
34+
bias1 = bias0;
35+
mult1 = mult0;
36+
}
37+
int j =0;
38+
for (; j < blockSize; j++)
39+
{
40+
constint8_t *rptr = rowbuf + j * vecsize_aligned;
41+
int s00 = initOutput ? bias0 : outptr0[j];
42+
int s10 = initOutput ? bias1 : outptr1[j];
43+
44+
int32x2_t vsx0 = {s00, s10};
45+
46+
for (int k =0; k < vecsize; k +=4)
47+
{
48+
int8x4_t vrptr[2] = {*(int8x4_t*)(rptr + k), *(int8x4_t*)(rptr + k)};
49+
int8x4_t vwptr[2] = {*(int8x4_t*)(wptr0 + k), *(int8x4_t*)(wptr1 + k)};
50+
vsx0 =__nds__v_smaqa(vsx0, *(int8x8_t*)vwptr, *(int8x8_t*)vrptr);
51+
}
52+
53+
if (finalOutput)
54+
{
55+
vsx0[0] = outZp + (int)std::round(vsx0[0] * mult0);
56+
vsx0[1] = outZp + (int)std::round(vsx0[1] * mult1);
57+
vsx0 =__nds__v_sclip32(vsx0,7);
58+
}
59+
60+
outptr0[j] = vsx0[0];
61+
outptr1[j] = vsx0[1];
62+
}
63+
}
64+
}
65+
66+
voidfastDepthwiseConv(constint8_t *wptr,
67+
int kernel_h,int kernel_w,
68+
int stride_h,int stride_w,
69+
int dilation_h,int dilation_w,
70+
intpad_t,int pad_l,
71+
constint *biasptr,constfloat *multptr,
72+
constint8_t *inptr_,
73+
int height,int width,
74+
int *outptr_,
75+
int out_d,int outH,int outW,
76+
int inpZp,int outZp)
77+
{
78+
constint8_t w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2],
79+
w10 = wptr[3], w11 = wptr[4], w12 = wptr[5],
80+
w20_ = wptr[6], w21_ = wptr[7], w22_ = wptr[8];
81+
int outW1 =min(outW, (width - dilation_w * (kernel_w -1) + pad_l) / stride_w);
82+
int bias = biasptr[out_d], biasCopy;
83+
float mult = multptr[out_d];
84+
85+
for (int out_i =0; out_i < outH; out_i++)
86+
{
87+
int in_i = out_i * stride_h -pad_t, out_j =0;
88+
constint8_t *imgptr0 = inptr_ + in_i * width;
89+
constint8_t *imgptr1 = imgptr0 + dilation_h * width;
90+
constint8_t *imgptr2 = imgptr0 + (dilation_h *2) * width;
91+
int8_t w00 = w00_, w01 = w01_, w02 = w02_;
92+
int8_t w20 = w20_, w21 = w21_, w22 = w22_;
93+
int out;
94+
biasCopy = bias;
95+
96+
if (in_i <0)
97+
{
98+
biasCopy += inpZp * (w00 + w01 + w02);
99+
w00 = w01 = w02 =0;
100+
imgptr0 = imgptr1;
101+
}
102+
elseif (in_i + dilation_h * (kernel_h -1) >= height)
103+
{
104+
biasCopy += inpZp * (w20 + w21 + w22);
105+
w20 = w21 = w22 =0;
106+
imgptr2 = imgptr1;
107+
}
108+
int *outptr = outptr_ + out_i * outW;
109+
if (pad_l >0)
110+
{
111+
out = (int)imgptr0[0] * w01 + (int)imgptr0[dilation_w] * w02 +
112+
(int)imgptr1[0] * w11 + (int)imgptr1[dilation_w] * w12 +
113+
(int)imgptr2[0] * w21 + (int)imgptr2[dilation_w] * w22 +
114+
biasCopy + inpZp * (w00 + w10 + w20);
115+
outptr[0] =__nds__sclip32(outZp + (int)std::round(out * mult),7);
116+
out_j =1;
117+
}
118+
119+
int8x8_t vwx0 = (int8x8_t){w00, w10, w20,0, w00, w10, w20,0};
120+
int8x8_t vwx1 = (int8x8_t){w01, w11, w21,0, w01, w11, w21,0};
121+
int8x8_t vwx2 = (int8x8_t){w02, w12, w22,0, w02, w12, w22,0};
122+
int8x8_t vimgx0, vimgx1, vimgx2;
123+
int32x2_t vout = {0,0};
124+
for (; out_j < outW1; out_j+=2)
125+
{
126+
int in_j = out_j * stride_w - pad_l;
127+
vimgx0 = (int8x8_t){imgptr0[in_j], imgptr1[in_j], imgptr2[in_j],0,
128+
imgptr0[in_j + stride_w], imgptr1[in_j + stride_w], imgptr2[in_j + stride_w],0};
129+
vimgx1 = (int8x8_t){imgptr0[in_j + dilation_w], imgptr1[in_j + dilation_w], imgptr2[in_j + dilation_w],0,
130+
imgptr0[in_j + dilation_w + stride_w], imgptr1[in_j + dilation_w + stride_w], imgptr2[in_j + dilation_w + stride_w],0};
131+
vimgx2 = (int8x8_t){imgptr0[in_j + dilation_w *2], imgptr1[in_j + dilation_w *2], imgptr2[in_j + dilation_w *2],0,
132+
imgptr0[in_j + dilation_w *2 + stride_w], imgptr1[in_j + dilation_w *2 + stride_w], imgptr2[in_j + dilation_w *2 + stride_w],0};
133+
134+
vout = (int32x2_t){biasCopy, biasCopy};
135+
vout =__nds__v_smaqa(vout, vwx0, vimgx0);
136+
vout =__nds__v_smaqa(vout, vwx1, vimgx1);
137+
vout =__nds__v_smaqa(vout, vwx2, vimgx2);
138+
139+
outptr[out_j] =__nds__sclip32(outZp + (int)std::round(vout[0] * mult),7);
140+
outptr[out_j +1] =__nds__sclip32(outZp + (int)std::round(vout[1] * mult),7);
141+
}
142+
143+
while (out_j > outW1) out_j--;
144+
145+
for (; out_j < outW; out_j++)
146+
{
147+
int in_j0 = out_j * stride_w - pad_l, in_j1 = in_j0 + dilation_w, in_j2 = in_j0 + dilation_w *2;
148+
int s0 =1, s1 =1, s2 =1;
149+
if (in_j0 >= width)
150+
{
151+
in_j0 =0;
152+
s0 =0;
153+
biasCopy += inpZp * (w00 + w10 + w20);
154+
}
155+
if (in_j1 >= width)
156+
{
157+
in_j1 =0;
158+
s1 =0;
159+
biasCopy += inpZp * (w01 + w11 + w21);
160+
}
161+
if (in_j2 >= width)
162+
{
163+
in_j2 =0;
164+
s2 =0;
165+
biasCopy += inpZp * (w02 + w12 + w22);
166+
}
167+
out = (int)imgptr0[in_j0] * w00 * s0 + (int)imgptr0[in_j1] * w01 * s1 + (int)imgptr0[in_j2] * w02 * s2 +
168+
(int)imgptr1[in_j0] * w10 * s0 + (int)imgptr1[in_j1] * w11 * s1 + (int)imgptr1[in_j2] * w12 * s2 +
169+
(int)imgptr2[in_j0] * w20 * s0 + (int)imgptr2[in_j1] * w21 * s1 + (int)imgptr2[in_j2] * w22 * s2 + biasCopy;
170+
outptr[out_j] =__nds__sclip32(outZp + (int)std::round(out * mult),7);
171+
}
172+
}
173+
}
174+
175+
// dst = vec * weights^t + bias
176+
voidfastGEMM1T(constint8_t* vec,constint8_t* weights,
177+
size_t wstep,constint* bias,constfloat* multiplier,
178+
int* dst,int nvecs,int vecsize,int outZp )
179+
{
180+
int i =0;
181+
182+
for( ; i <= nvecs -2; i +=2 )
183+
{
184+
constint8_t* wptr0 = weights + i * wstep;
185+
constint8_t* wptr1 = weights + (i +1) * wstep;
186+
187+
int32x2_t vs0 = *(int32x2_t*)(bias + i);
188+
189+
for(int k =0; k < vecsize; k +=4 )
190+
{
191+
int8x4_t vvec[2] = {*(int8x4_t*)(vec + k), *(int8x4_t*)(vec + k)};
192+
int8x4_t vwptr[2] = {*(int8x4_t*)(wptr0 + k), *(int8x4_t*)(wptr1 + k)};
193+
vs0 =__nds__v_smaqa(vs0, *(int8x8_t*)vwptr, *(int8x8_t*)vvec);
194+
}
195+
196+
int32x2_t vdst = {(int)std::round(vs0[0] * multiplier[i]), (int)std::round(vs0[1] * multiplier[i +1])};
197+
198+
vdst =__nds__v_sclip32(vdst + outZp,7);
199+
200+
*(int32x2_t*)(dst + i) = vdst;
201+
}
202+
203+
for( ; i < nvecs; i++ )
204+
{
205+
constint8_t* wptr = weights + i * wstep;
206+
int s0 = bias[i];
207+
208+
for(int k =0; k < vecsize; k +=4 )
209+
{
210+
int8x4_t vvec[2] = {*(int8x4_t*)(vec + k),0};
211+
int8x4_t vwptr[2] = {*(int8x4_t*)(wptr + k),0};
212+
s0 =__nds__smaqa(s0, *(unsignedlong*)vwptr, *(unsignedlong*)vvec);
213+
}
214+
215+
dst[i] =__nds__sclip32(outZp + (int)std::round(s0 * multiplier[i]),7);
216+
}
217+
}
218+
219+
}}}// namespace
220+
221+
#endif
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
// This file is part of OpenCV project.
2+
// It is subject to the license terms in the LICENSE file found in the top-level directory
3+
// of this distribution and at http://opencv.org/license.html.
4+
5+
#ifndef OPENCV_DNN_INT8LAYERS_LAYERS_RVP052_HPP
#define OPENCV_DNN_INT8LAYERS_LAYERS_RVP052_HPP

// Int8 DNN kernels for the RISC-V P (packed SIMD) extension. They are declared
// only when building for RISC-V with the DSP extension on an Andes toolchain.
// CV_RVP052 is always defined (1 or 0) so callers can test it with #if.
#if defined(__riscv) && defined(__riscv_dsp) && defined(__ANDES)
#include <stddef.h>
#include <stdint.h>
#include <nds_intrinsic.h>
#define CV_RVP052 1

namespace cv {
namespace dnn {
namespace opt_RVP052 {

// Convolution inner kernel over pairs of output channels.
void fastConv(const int8_t* weights, size_t wstep, const int* bias,
              const int8_t* rowbuf, int* output, const int* outShape,
              int blockSize, int vecsize, int vecsize_aligned, int outZp,
              const float* multiplier, bool initOutput, bool finalOutput);

// Depthwise convolution of one channel plane.
void fastDepthwiseConv(const int8_t* wptr,
                       int kernel_h, int kernel_w,
                       int stride_h, int stride_w,
                       int dilation_h, int dilation_w,
                       int pad_t, int pad_l,
                       const int* biasptr, const float* multptr,
                       const int8_t* inptr_,
                       int height, int width,
                       int* outptr_,
                       int out_d, int outH, int outW,
                       int inpZp, int outZp);

// dst = vec * weights^t + bias, requantized.
void fastGEMM1T(const int8_t* vec, const int8_t* weights,
                size_t wstep, const int* bias, const float* multiplier,
                int* dst, int nvecs, int vecsize, int outZp);

}}}

#else
#define CV_RVP052 0
#endif

#endif // OPENCV_DNN_INT8LAYERS_LAYERS_RVP052_HPP
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
# Toolchain file for cross-compiling to 64-bit RISC-V Linux with the
# -mext-dsp compiler option enabled.
set(CMAKE_SYSTEM_NAME Linux)
set(CMAKE_SYSTEM_PROCESSOR riscv64)

# Cross toolchain root; the cache default is taken from the RISCV environment variable.
set(RISCV_GCC_INSTALL_ROOT $ENV{RISCV} CACHE PATH "Path to GCC for RISC-V cross compiler installation directory")

set(CMAKE_C_COMPILER ${RISCV_GCC_INSTALL_ROOT}/bin/riscv64-linux-gcc)
set(CMAKE_CXX_COMPILER ${RISCV_GCC_INSTALL_ROOT}/bin/riscv64-linux-g++)

# Append the same architecture flags to the C and C++ flag strings.
foreach(lang C CXX)
  string(APPEND CMAKE_${lang}_FLAGS " -march=rv64gc -mext-dsp")
endforeach()

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp