Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit c96f48e

Browse files
authored
Merge pull request #24412 from vrabaud:inter_area1

Speed up line merging in INTER_AREA #24412

This provides a 10 to 20% speed-up.
Related perf test fix: #24417

This is a split of #23525 that will be updated to only deal with column merging.

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [x] The PR is proposed to the proper branch
- [x] There is a reference to the original bug report and related work
- [x] There is accuracy test, performance test and test data in opencv_extra repository, if applicable. Patch to opencv_extra has the same branch name.
- [x] The feature is well documented and sample code can be built with the project CMake
1 parent a9664ab commit c96f48e

File tree

1 file changed

+109
-14
lines changed

1 file changed

+109
-14
lines changed

‎modules/imgproc/src/resize.cpp‎

Lines changed: 109 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -3019,6 +3019,111 @@ struct DecimateAlpha
30193019
};
30203020

30213021

3022+
namespaceinter_area {
3023+
#if (CV_SIMD || CV_SIMD_SCALABLE)
3024+
inlinevoidsaturate_store(constfloat* src, uchar* dst) {
3025+
const v_int32 tmp0 =v_round(vx_load(src +0 * VTraits<v_float32>::vlanes()));
3026+
const v_int32 tmp1 =v_round(vx_load(src +1 * VTraits<v_float32>::vlanes()));
3027+
const v_int32 tmp2 =v_round(vx_load(src +2 * VTraits<v_float32>::vlanes()));
3028+
const v_int32 tmp3 =v_round(vx_load(src +3 * VTraits<v_float32>::vlanes()));
3029+
v_store(dst,v_pack(v_pack_u(tmp0, tmp1),v_pack_u(tmp2, tmp3)));
3030+
}
3031+
3032+
inlinevoidsaturate_store(constfloat* src, ushort* dst) {
3033+
const v_int32 tmp0 =v_round(vx_load(src +0 * VTraits<v_float32>::vlanes()));
3034+
const v_int32 tmp1 =v_round(vx_load(src +1 * VTraits<v_float32>::vlanes()));
3035+
v_store(dst,v_pack_u(tmp0, tmp1));
3036+
}
3037+
3038+
inlinevoidsaturate_store(constfloat* src,short* dst) {
3039+
const v_int32 tmp0 =v_round(vx_load(src +0 * VTraits<v_float32>::vlanes()));
3040+
const v_int32 tmp1 =v_round(vx_load(src +1 * VTraits<v_float32>::vlanes()));
3041+
v_store(dst,v_pack(tmp0, tmp1));
3042+
}
3043+
3044+
staticinline v_float32vx_setall(float coeff) {returnvx_setall_f32(coeff); }
3045+
3046+
template<typename T>
3047+
structVArea {};
3048+
3049+
template<>
3050+
structVArea<float> {
3051+
typedef v_float32 vWT;
3052+
};
3053+
#endif
3054+
3055+
#if (CV_SIMD128_64F || CV_SIMD_SCALABLE_64F)
3056+
staticinline v_float64vx_setall(double coeff) {returnvx_setall_f64(coeff); }
3057+
3058+
template<>
3059+
structVArea<double> {
3060+
typedef v_float64 vWT;
3061+
};
3062+
3063+
#else
3064+
// Scalar double-precision overload: writes sum[i] = beta * buf[i] for every
// i in [0, width). Nothing is written when width <= 0.
inline void mul(const double* buf, int width, double beta, double* sum) {
    int i = 0;
    while (i < width) {
        sum[i] = beta * buf[i];
        ++i;
    }
}
3069+
3070+
// Scalar double-precision overload: accumulates sum[i] += beta * buf[i] for
// every i in [0, width). Nothing is written when width <= 0.
inline void muladd(const double* buf, int width, double beta, double* sum) {
    int i = 0;
    while (i < width) {
        sum[i] += beta * buf[i];
        ++i;
    }
}
3075+
#endif
3076+
3077+
template<typename T,typename WT>
3078+
inlinevoidsaturate_store(const WT* sum,int width, T* D) {
3079+
int dx =0;
3080+
#if (CV_SIMD || CV_SIMD_SCALABLE)
3081+
constint step = VTraits<typename VArea<WT>::vWT>::vlanes() *sizeof(WT) /sizeof(T);
3082+
for (; dx + step < width; dx += step) {
3083+
saturate_store(sum + dx, D + dx);
3084+
}
3085+
#endif
3086+
for (; dx < width; ++dx) {
3087+
D[dx] = saturate_cast<T>(sum[dx]);
3088+
}
3089+
}
3090+
3091+
// Overload selected when the destination type equals the working type:
// no conversion is needed, so the first `width` elements are copied verbatim.
template <typename WT>
inline void saturate_store(const WT* sum, int width, WT* D) {
    const WT* const sum_end = sum + width;
    std::copy(sum, sum_end, D);
}
3096+
3097+
// Writes sum[i] = beta * buf[i] for every i in [0, width).
//
// When SIMD is enabled, full vectors are processed with v_mul and the
// remainder with a scalar loop. The coefficient is broadcast once, outside the
// loop. The bound `dx + step <= width` lets a final, completely full vector
// take the SIMD path as well; it touches only [dx, dx + step), which is in
// bounds.
template <typename WT>
inline void mul(const WT* buf, int width, WT beta, WT* sum) {
    int dx = 0;
#if (CV_SIMD || CV_SIMD_SCALABLE)
    typedef typename VArea<WT>::vWT vec_t;
    const int step = VTraits<vec_t>::vlanes();
    const vec_t vbeta = vx_setall(beta);
    for (; dx + step <= width; dx += step) {
        vx_store(sum + dx, v_mul(vbeta, vx_load(buf + dx)));
    }
#endif
    // Scalar tail (or the whole row when SIMD is disabled).
    for (; dx < width; ++dx) {
        sum[dx] = beta * buf[dx];
    }
}
3110+
3111+
// Accumulates sum[i] += beta * buf[i] for every i in [0, width).
//
// When SIMD is enabled, full vectors are processed with v_mul + v_add and the
// remainder with a scalar loop. The coefficient is broadcast once, outside the
// loop. The bound `dx + step <= width` lets a final, completely full vector
// take the SIMD path as well; it touches only [dx, dx + step), which is in
// bounds.
template <typename WT>
inline void muladd(const WT* buf, int width, WT beta, WT* sum) {
    int dx = 0;
#if (CV_SIMD || CV_SIMD_SCALABLE)
    typedef typename VArea<WT>::vWT vec_t;
    const int step = VTraits<vec_t>::vlanes();
    const vec_t vbeta = vx_setall(beta);
    for (; dx + step <= width; dx += step) {
        vx_store(sum + dx, v_add(vx_load(sum + dx), v_mul(vbeta, vx_load(buf + dx))));
    }
#endif
    // Scalar tail (or the whole row when SIMD is disabled).
    for (; dx < width; ++dx) {
        sum[dx] += beta * buf[dx];
    }
}
3124+
3125+
}// namespace inter_area
3126+
30223127
template<typename T,typename WT>classResizeArea_Invoker :
30233128
public ParallelLoopBody
30243129
{
@@ -3120,27 +3225,17 @@ template<typename T, typename WT> class ResizeArea_Invoker :
31203225

31213226
if( dy != prev_dy )
31223227
{
3123-
T* D = dst->templateptr<T>(prev_dy);
3124-
3125-
for( dx =0; dx < dsize.width; dx++ )
3126-
{
3127-
D[dx] = saturate_cast<T>(sum[dx]);
3128-
sum[dx] = beta*buf[dx];
3129-
}
3228+
inter_area::saturate_store(sum, dsize.width, dst->templateptr<T>(prev_dy));
3229+
inter_area::mul(buf, dsize.width, beta, sum);
31303230
prev_dy = dy;
31313231
}
31323232
else
31333233
{
3134-
for( dx =0; dx < dsize.width; dx++ )
3135-
sum[dx] += beta*buf[dx];
3234+
inter_area::muladd(buf, dsize.width, beta, sum);
31363235
}
31373236
}
31383237

3139-
{
3140-
T* D = dst->templateptr<T>(prev_dy);
3141-
for( dx =0; dx < dsize.width; dx++ )
3142-
D[dx] = saturate_cast<T>(sum[dx]);
3143-
}
3238+
inter_area::saturate_store(sum, dsize.width, dst->templateptr<T>(prev_dy));
31443239
}
31453240

31463241
private:

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp