Notifications: You must be signed in to change notification settings
Fork 56.4k
Star 85.3k

Commit c96f48e

authored

Merge pull request #24412 from vrabaud:inter_area1

Speed up line merging in INTER_AREA #24412

This provides a 10 to 20% speed-up.
Related perf test fix: #24417
This is a split of #23525 that will be updated to only deal with column merging.

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [x] There is a reference to the original bug report and related work
- [x] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [x] The feature is well documented and sample code can be built with the project CMake

1 parent a9664ab, commit c96f48e (Copy full SHA for c96f48e)

File tree

1 file changed

+109

-14

lines changed

modules/imgproc/src
- resize.cpp

1 file changed

+109

-14

lines changed

`‎modules/imgproc/src/resize.cpp‎`

Lines changed: 109 additions & 14 deletions

Original file line number	Diff line number	Diff line change
`@@ -3019,6 +3019,111 @@ struct DecimateAlpha`
`3019`	`3019`	`};`
`3020`	`3020`
`3021`	`3021`
	`3022`	`+namespaceinter_area {`
	`3023`	`+#if (CV_SIMD \|\| CV_SIMD_SCALABLE)`
	`3024`	`+inlinevoidsaturate_store(constfloat* src, uchar* dst) {`
	`3025`	`+const v_int32 tmp0 =v_round(vx_load(src +0 * VTraits<v_float32>::vlanes()));`
	`3026`	`+const v_int32 tmp1 =v_round(vx_load(src +1 * VTraits<v_float32>::vlanes()));`
	`3027`	`+const v_int32 tmp2 =v_round(vx_load(src +2 * VTraits<v_float32>::vlanes()));`
	`3028`	`+const v_int32 tmp3 =v_round(vx_load(src +3 * VTraits<v_float32>::vlanes()));`
	`3029`	`+v_store(dst,v_pack(v_pack_u(tmp0, tmp1),v_pack_u(tmp2, tmp3)));`
	`3030`	`+}`
	`3031`	`+`
	`3032`	`+inlinevoidsaturate_store(constfloat* src, ushort* dst) {`
	`3033`	`+const v_int32 tmp0 =v_round(vx_load(src +0 * VTraits<v_float32>::vlanes()));`
	`3034`	`+const v_int32 tmp1 =v_round(vx_load(src +1 * VTraits<v_float32>::vlanes()));`
	`3035`	`+v_store(dst,v_pack_u(tmp0, tmp1));`
	`3036`	`+}`
	`3037`	`+`
	`3038`	`+inlinevoidsaturate_store(constfloat* src,short* dst) {`
	`3039`	`+const v_int32 tmp0 =v_round(vx_load(src +0 * VTraits<v_float32>::vlanes()));`
	`3040`	`+const v_int32 tmp1 =v_round(vx_load(src +1 * VTraits<v_float32>::vlanes()));`
	`3041`	`+v_store(dst,v_pack(tmp0, tmp1));`
	`3042`	`+}`
	`3043`	`+`
	`3044`	`+staticinline v_float32vx_setall(float coeff) {returnvx_setall_f32(coeff); }`
	`3045`	`+`
	`3046`	`+template<typename T>`
	`3047`	`+structVArea {};`
	`3048`	`+`
	`3049`	`+template<>`
	`3050`	`+structVArea<float> {`
	`3051`	`+typedef v_float32 vWT;`
	`3052`	`+};`
	`3053`	`+#endif`
	`3054`	`+`
	`3055`	`+#if (CV_SIMD128_64F \|\| CV_SIMD_SCALABLE_64F)`
	`3056`	`+staticinline v_float64vx_setall(double coeff) {returnvx_setall_f64(coeff); }`
	`3057`	`+`
	`3058`	`+template<>`
	`3059`	`+structVArea<double> {`
	`3060`	`+typedef v_float64 vWT;`
	`3061`	`+};`
	`3062`	`+`
	`3063`	`+#else`
	`3064`	`+inlinevoidmul(constdouble* buf,int width,double beta,double* sum) {`
	`3065`	`+for (int dx =0; dx < width; ++dx) {`
	`3066`	`+ sum[dx] = beta * buf[dx];`
	`3067`	`+ }`
	`3068`	`+}`
	`3069`	`+`
	`3070`	`+inlinevoidmuladd(constdouble* buf,int width,double beta,double* sum) {`
	`3071`	`+for (int dx =0; dx < width; ++dx) {`
	`3072`	`+ sum[dx] += beta * buf[dx];`
	`3073`	`+ }`
	`3074`	`+}`
	`3075`	`+#endif`
	`3076`	`+`
	`3077`	`+template<typename T,typename WT>`
	`3078`	`+inlinevoidsaturate_store(const WT* sum,int width, T* D) {`
	`3079`	`+int dx =0;`
	`3080`	`+#if (CV_SIMD \|\| CV_SIMD_SCALABLE)`
	`3081`	`+constint step = VTraits<typename VArea<WT>::vWT>::vlanes() *sizeof(WT) /sizeof(T);`
	`3082`	`+for (; dx + step < width; dx += step) {`
	`3083`	`+saturate_store(sum + dx, D + dx);`
	`3084`	`+ }`
	`3085`	`+#endif`
	`3086`	`+for (; dx < width; ++dx) {`
	`3087`	`+ D[dx] = saturate_cast<T>(sum[dx]);`
	`3088`	`+ }`
	`3089`	`+}`
	`3090`	`+`
	`3091`	`+// Optimization when T == WT.`
	`3092`	`+template<typename WT>`
	`3093`	`+inlinevoidsaturate_store(const WT* sum,int width, WT* D) {`
	`3094`	`+std::copy(sum, sum + width, D);`
	`3095`	`+}`
	`3096`	`+`
	`3097`	`+template<typename WT>`
	`3098`	`+inlinevoidmul(const WT* buf,int width, WT beta, WT* sum) {`
	`3099`	`+int dx =0;`
	`3100`	`+#if (CV_SIMD \|\| CV_SIMD_SCALABLE)`
	`3101`	`+constint step = VTraits<typename VArea<WT>::vWT>::vlanes();`
	`3102`	`+for (; dx + step < width; dx += step) {`
	`3103`	`+vx_store(sum + dx,v_mul(vx_setall(beta),vx_load(buf + dx)));`
	`3104`	`+ }`
	`3105`	`+#endif`
	`3106`	`+for (; dx < width; ++dx) {`
	`3107`	`+ sum[dx] = beta * buf[dx];`
	`3108`	`+ }`
	`3109`	`+}`
	`3110`	`+`
	`3111`	`+template<typename WT>`
	`3112`	`+inlinevoidmuladd(const WT* buf,int width, WT beta, WT* sum) {`
	`3113`	`+int dx =0;`
	`3114`	`+#if (CV_SIMD \|\| CV_SIMD_SCALABLE)`
	`3115`	`+constint step = VTraits<typename VArea<WT>::vWT>::vlanes();`
	`3116`	`+for (; dx + step < width; dx += step) {`
	`3117`	`+vx_store(sum + dx,v_add(vx_load(sum + dx),v_mul(vx_setall(beta),vx_load(buf + dx))));`
	`3118`	`+ }`
	`3119`	`+#endif`
	`3120`	`+for (; dx < width; ++dx) {`
	`3121`	`+ sum[dx] += beta * buf[dx];`
	`3122`	`+ }`
	`3123`	`+}`
	`3124`	`+`
	`3125`	`+}// namespace inter_area`
	`3126`	`+`
`3022`	`3127`	`template<typename T,typename WT>classResizeArea_Invoker :`
`3023`	`3128`	`public ParallelLoopBody`
`3024`	`3129`	`{`
`@@ -3120,27 +3225,17 @@ template<typename T, typename WT> class ResizeArea_Invoker :`
`3120`	`3225`
`3121`	`3226`	`if( dy != prev_dy )`
`3122`	`3227`	`{`
`3123`		`- T* D = dst->templateptr<T>(prev_dy);`
`3124`		`-`
`3125`		`-for( dx =0; dx < dsize.width; dx++ )`
`3126`		`- {`
`3127`		`- D[dx] = saturate_cast<T>(sum[dx]);`
`3128`		`- sum[dx] = beta*buf[dx];`
`3129`		`- }`
	`3228`	`+inter_area::saturate_store(sum, dsize.width, dst->templateptr<T>(prev_dy));`
	`3229`	`+inter_area::mul(buf, dsize.width, beta, sum);`
`3130`	`3230`	`prev_dy = dy;`
`3131`	`3231`	`}`
`3132`	`3232`	`else`
`3133`	`3233`	`{`
`3134`		`-for( dx =0; dx < dsize.width; dx++ )`
`3135`		`- sum[dx] += beta*buf[dx];`
	`3234`	`+inter_area::muladd(buf, dsize.width, beta, sum);`
`3136`	`3235`	`}`
`3137`	`3236`	`}`
`3138`	`3237`
`3139`		`- {`
`3140`		`- T* D = dst->templateptr<T>(prev_dy);`
`3141`		`-for( dx =0; dx < dsize.width; dx++ )`
`3142`		`- D[dx] = saturate_cast<T>(sum[dx]);`
`3143`		`- }`
	`3238`	`+inter_area::saturate_store(sum, dsize.width, dst->templateptr<T>(prev_dy));`
`3144`	`3239`	`}`
`3145`	`3240`
`3146`	`3241`	`private:`

0 commit comments

Comments

(0)

Movatterモバイル変換

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Uh oh!

Commit c96f48e

File tree

1 file changed

1 file changed

`‎modules/imgproc/src/resize.cpp‎`

0 commit comments