// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#include "../precomp.hpp"
#include "./layers_rvp052.hpp"

#if CV_RVP052

namespace cv {
namespace dnn {
namespace opt_RVP052 {

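// Quantized convolution inner loop: processes two output channels per pass
// over a block of output pixels, accumulating int8 dot products into 32-bit
// lanes with the Andes DSP SMAQA intrinsic (four int8 products per lane).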
void fastConv(const int8_t *weights, size_t wstep, const int *bias,
              const int8_t *rowbuf, int *output, const int *outShape,
              int blockSize, int vecsize, int vecsize_aligned, int outZp,
              const float *multiplier, bool initOutput, bool finalOutput)
{
    int outCn = outShape[1];
    size_t outPlaneSize = outShape[2] * outShape[3];
    for (int i = 0; i < outCn; i += 2)
    {
        const int8_t *wptr0 = weights + i * wstep;
        const int8_t *wptr1 = wptr0 + wstep;
        int *outptr0 = output + i * outPlaneSize;
        int *outptr1 = outptr0 + outPlaneSize;
        int bias0 = bias[i], bias1 = bias[i + 1];
        float mult0 = multiplier[i], mult1 = multiplier[i + 1];

        if (i + 1 >= outCn)
        {
            wptr1 = wptr0;
            outptr1 = outptr0;
            bias1 = bias0;
            mult1 = mult0;
        }
        int j = 0;
        for (; j < blockSize; j++)
        {
            const int8_t *rptr = rowbuf + j * vecsize_aligned;
            int s00 = initOutput ? bias0 : outptr0[j];
            int s10 = initOutput ? bias1 : outptr1[j];

            int32x2_t vsx0 = {s00, s10};

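            // Duplicate the same 4 input bytes into both halves of an 8-byte
            // vector and pair them with the two channels' weights, so a single
            // v_smaqa adds four int8 products into each 32-bit accumulator lane.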
            for (int k = 0; k < vecsize; k += 4)
            {
                int8x4_t vrptr[2] = {*(int8x4_t*)(rptr + k), *(int8x4_t*)(rptr + k)};
                int8x4_t vwptr[2] = {*(int8x4_t*)(wptr0 + k), *(int8x4_t*)(wptr1 + k)};
                vsx0 = __nds__v_smaqa(vsx0, *(int8x8_t*)vwptr, *(int8x8_t*)vrptr);
            }

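            // Requantize the accumulators: apply the per-channel multiplier,
            // add the output zero point, then saturate to the int8 range
            // (sclip32 with bound 7 presumably clips to [-128, 127]).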
            if (finalOutput)
            {
                vsx0[0] = outZp + (int)std::round(vsx0[0] * mult0);
                vsx0[1] = outZp + (int)std::round(vsx0[1] * mult1);
                vsx0 = __nds__v_sclip32(vsx0, 7);
            }

            outptr0[j] = vsx0[0];
            outptr1[j] = vsx0[1];
        }
    }
}

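// Quantized 3x3 depthwise convolution for one output channel. Borders are
// handled by zeroing the out-of-image kernel taps and folding the input
// zero point into the bias, since padded inputs equal the input zero point.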
void fastDepthwiseConv(const int8_t *wptr,
                       int kernel_h, int kernel_w,
                       int stride_h, int stride_w,
                       int dilation_h, int dilation_w,
                       int pad_t, int pad_l,
                       const int *biasptr, const float *multptr,
                       const int8_t *inptr_,
                       int height, int width,
                       int *outptr_,
                       int out_d, int outH, int outW,
                       int inpZp, int outZp)
{
    const int8_t w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2],
                 w10 = wptr[3], w11 = wptr[4], w12 = wptr[5],
                 w20_ = wptr[6], w21_ = wptr[7], w22_ = wptr[8];
    int outW1 = min(outW, (width - dilation_w * (kernel_w - 1) + pad_l) / stride_w);
    int bias = biasptr[out_d], biasCopy;
    float mult = multptr[out_d];

    for (int out_i = 0; out_i < outH; out_i++)
    {
        int in_i = out_i * stride_h - pad_t, out_j = 0;
        const int8_t *imgptr0 = inptr_ + in_i * width;
        const int8_t *imgptr1 = imgptr0 + dilation_h * width;
        const int8_t *imgptr2 = imgptr0 + (dilation_h * 2) * width;
        int8_t w00 = w00_, w01 = w01_, w02 = w02_;
        int8_t w20 = w20_, w21 = w21_, w22 = w22_;
        int out;
        biasCopy = bias;

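        // Row falls into top/bottom padding: drop that kernel row by zeroing
        // its weights, and compensate the accumulator with inpZp times the
        // dropped weights.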
        if (in_i < 0)
        {
            biasCopy += inpZp * (w00 + w01 + w02);
            w00 = w01 = w02 = 0;
            imgptr0 = imgptr1;
        }
        else if (in_i + dilation_h * (kernel_h - 1) >= height)
        {
            biasCopy += inpZp * (w20 + w21 + w22);
            w20 = w21 = w22 = 0;
            imgptr2 = imgptr1;
        }
        int *outptr = outptr_ + out_i * outW;
        if (pad_l > 0)
        {
            out = (int)imgptr0[0] * w01 + (int)imgptr0[dilation_w] * w02 +
                  (int)imgptr1[0] * w11 + (int)imgptr1[dilation_w] * w12 +
                  (int)imgptr2[0] * w21 + (int)imgptr2[dilation_w] * w22 +
                  biasCopy + inpZp * (w00 + w10 + w20);
            outptr[0] = __nds__sclip32(outZp + (int)std::round(out * mult), 7);
            out_j = 1;
        }

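        // Main vectorized span: each 8-byte vector packs one kernel column's
        // taps for two adjacent output pixels, so each v_smaqa advances two
        // outputs at once.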
        int8x8_t vwx0 = (int8x8_t){w00, w10, w20, 0, w00, w10, w20, 0};
        int8x8_t vwx1 = (int8x8_t){w01, w11, w21, 0, w01, w11, w21, 0};
        int8x8_t vwx2 = (int8x8_t){w02, w12, w22, 0, w02, w12, w22, 0};
        int8x8_t vimgx0, vimgx1, vimgx2;
        int32x2_t vout = {0, 0};
        for (; out_j < outW1; out_j += 2)
        {
            int in_j = out_j * stride_w - pad_l;
            vimgx0 = (int8x8_t){imgptr0[in_j], imgptr1[in_j], imgptr2[in_j], 0,
                                imgptr0[in_j + stride_w], imgptr1[in_j + stride_w], imgptr2[in_j + stride_w], 0};
            vimgx1 = (int8x8_t){imgptr0[in_j + dilation_w], imgptr1[in_j + dilation_w], imgptr2[in_j + dilation_w], 0,
                                imgptr0[in_j + dilation_w + stride_w], imgptr1[in_j + dilation_w + stride_w], imgptr2[in_j + dilation_w + stride_w], 0};
            vimgx2 = (int8x8_t){imgptr0[in_j + dilation_w * 2], imgptr1[in_j + dilation_w * 2], imgptr2[in_j + dilation_w * 2], 0,
                                imgptr0[in_j + dilation_w * 2 + stride_w], imgptr1[in_j + dilation_w * 2 + stride_w], imgptr2[in_j + dilation_w * 2 + stride_w], 0};

            vout = (int32x2_t){biasCopy, biasCopy};
            vout = __nds__v_smaqa(vout, vwx0, vimgx0);
            vout = __nds__v_smaqa(vout, vwx1, vimgx1);
            vout = __nds__v_smaqa(vout, vwx2, vimgx2);

            outptr[out_j] = __nds__sclip32(outZp + (int)std::round(vout[0] * mult), 7);
            outptr[out_j + 1] = __nds__sclip32(outZp + (int)std::round(vout[1] * mult), 7);
        }

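        // The loop above steps by 2 and can overshoot outW1; rewind so the
        // scalar tail below recomputes any overstepped columns.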
        while (out_j > outW1) out_j--;

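        // Scalar tail for the right border: taps falling past the image are
        // masked out via s0/s1/s2 and compensated through biasCopy, mirroring
        // the row-padding logic above.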
        for (; out_j < outW; out_j++)
        {
            int in_j0 = out_j * stride_w - pad_l, in_j1 = in_j0 + dilation_w, in_j2 = in_j0 + dilation_w * 2;
            int s0 = 1, s1 = 1, s2 = 1;
            if (in_j0 >= width)
            {
                in_j0 = 0;
                s0 = 0;
                biasCopy += inpZp * (w00 + w10 + w20);
            }
            if (in_j1 >= width)
            {
                in_j1 = 0;
                s1 = 0;
                biasCopy += inpZp * (w01 + w11 + w21);
            }
            if (in_j2 >= width)
            {
                in_j2 = 0;
                s2 = 0;
                biasCopy += inpZp * (w02 + w12 + w22);
            }
            out = (int)imgptr0[in_j0] * w00 * s0 + (int)imgptr0[in_j1] * w01 * s1 + (int)imgptr0[in_j2] * w02 * s2 +
                  (int)imgptr1[in_j0] * w10 * s0 + (int)imgptr1[in_j1] * w11 * s1 + (int)imgptr1[in_j2] * w12 * s2 +
                  (int)imgptr2[in_j0] * w20 * s0 + (int)imgptr2[in_j1] * w21 * s1 + (int)imgptr2[in_j2] * w22 * s2 + biasCopy;
            outptr[out_j] = __nds__sclip32(outZp + (int)std::round(out * mult), 7);
        }
    }
}

// dst = vec * weights^T + bias
void fastGEMM1T(const int8_t* vec, const int8_t* weights,
                size_t wstep, const int* bias, const float* multiplier,
                int* dst, int nvecs, int vecsize, int outZp)
{
    int i = 0;

    for (; i <= nvecs - 2; i += 2)
    {
        const int8_t* wptr0 = weights + i * wstep;
        const int8_t* wptr1 = weights + (i + 1) * wstep;

        int32x2_t vs0 = *(int32x2_t*)(bias + i);

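        // Same two-lane SMAQA scheme as fastConv: duplicate the input bytes
        // and pair them with weights from two consecutive rows.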
        for (int k = 0; k < vecsize; k += 4)
        {
            int8x4_t vvec[2] = {*(int8x4_t*)(vec + k), *(int8x4_t*)(vec + k)};
            int8x4_t vwptr[2] = {*(int8x4_t*)(wptr0 + k), *(int8x4_t*)(wptr1 + k)};
            vs0 = __nds__v_smaqa(vs0, *(int8x8_t*)vwptr, *(int8x8_t*)vvec);
        }

        int32x2_t vdst = {(int)std::round(vs0[0] * multiplier[i]), (int)std::round(vs0[1] * multiplier[i + 1])};

        vdst = __nds__v_sclip32(vdst + outZp, 7);

        *(int32x2_t*)(dst + i) = vdst;
    }

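    // Tail for odd nvecs: process the last row alone, zeroing the unused
    // second lane so the scalar smaqa accumulates only the real products.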
    for (; i < nvecs; i++)
    {
        const int8_t* wptr = weights + i * wstep;
        int s0 = bias[i];

        for (int k = 0; k < vecsize; k += 4)
        {
            int8x4_t vvec[2] = {*(int8x4_t*)(vec + k), 0};
            int8x4_t vwptr[2] = {*(int8x4_t*)(wptr + k), 0};
            s0 = __nds__smaqa(s0, *(unsigned long*)vwptr, *(unsigned long*)vvec);
        }

        dst[i] = __nds__sclip32(outZp + (int)std::round(s0 * multiplier[i]), 7);
    }
}

}}} // namespace cv::dnn::opt_RVP052

#endif