@@ -3034,89 +3034,60 @@ static inline void vx_load_as(const ushort* ptr, v_float32& a)
30343034static inline void vx_load_as (const short * ptr, v_float32& a)
30353035{ a =v_cvt_f32 (v_reinterpret_as_s32 (vx_load_expand (ptr))); }
30363036
3037- template < typename VT>
3038- VT vx_setall_local ( double coeff);
3039- template <>
3040- v_float32vx_setall_local (double coeff) {
3037+ static inline void vx_load_as ( const float * ptr, v_float32& a)
3038+ { a = v_load (ptr); }
3039+
3040+ v_float32vx_setall_local (float coeff) {
30413041return v_setall_f32 (coeff);
30423042}
3043- template <typename WT,typename VT>
3044- void v_inter_area_set_sum (int col_end,const WT *const buf,const VT &v_coeff,
3045- WT *sum,int &x) {
3043+ #if CV_SIMD128_64F
3044+ static inline void vx_load_as (const double * ptr, v_float64& a)
3045+ { a =v_load (ptr); }
3046+
3047+ v_float64vx_setall_local (double coeff) {
3048+ return v_setall_f64 (coeff);
3049+ }
3050+ #endif
3051+ template <typename T,typename WT,typename VT>
3052+ void v_inter_area_set_or_update_sum (const T *const src,int n,bool do_set,
3053+ WT coeff, WT *sum) {
30463054constexpr int step = VT::nlanes;
3047- for (x =0 ; x + step < col_end; x += step)
3055+ const VT v_coeff =vx_setall_local (coeff);
3056+ int x;
3057+ if (do_set)
30483058 {
3049- const VT line =vx_load (buf + x);
3050- v_store (sum + x, line * v_coeff);
3059+ for (x =0 ; x + step < n; x += step)
3060+ {
3061+ VT line;
3062+ vx_load_as (src + x, line);
3063+ v_store (sum + x, line * v_coeff);
3064+ }
3065+ for (; x < n; ++x) sum[x] = saturate_cast<WT>(src[x]) * coeff;
30513066 }
3052- }
3053- template <typename WT,typename VT>
3054- void v_inter_area_update_sum (int col_end,const WT *const buf,const VT &v_coeff,
3055- WT *sum,int &x) {
3056- constexpr int step = VT::nlanes;
3057- for (x =0 ; x + step < col_end; x += step)
3067+ else
30583068 {
3059- const VT line =vx_load (buf + x);
3060- const VT sum_x =vx_load (sum + x);
3061- v_store (sum + x, sum_x + line * v_coeff);
3069+ for (x =0 ; x + step < n; x += step)
3070+ {
3071+ VT line;
3072+ vx_load_as (src + x, line);
3073+ const VT sum_x =vx_load (sum + x);
3074+ v_store (sum + x, sum_x + line * v_coeff);
3075+ }
3076+ for (; x < n; ++x) sum[x] += saturate_cast<WT>(src[x]) * coeff;
30623077 }
30633078}
3064- template <typename S>
3065- void v_inter_area_copy_or_not (const S* s,int n,float *d,float const **buf)
3066- {
3067- static_assert (!std::is_same<S,float >::value," Do not specialize for float" );
3068- constexpr int step = v_float32::nlanes;
3069- int x =0 ;
3070- for (; x + step < n; x += step)
3079+ #if !CV_SIMD128_64F
3080+ void v_inter_area_set_or_update_sum (const double *const src,int n,bool do_set,
3081+ double coeff,double *sum) {
3082+ int x;
3083+ if (do_set)
30713084 {
3072- v_float32 a;
3073- vx_load_as (s + x, a);
3074- v_store (d + x, a);
3085+ for (x =0 ; x < n; ++x) sum[x] = src[x] * coeff;
3086+ }
3087+ else
3088+ {
3089+ for (x =0 ; x < n; ++x) sum[x] += src[x] * coeff;
30753090 }
3076- for (; x < n; ++x) d[x] = saturate_cast<float >(s[x]);
3077- *buf = d;
3078- }
3079- void v_inter_area_copy_or_not (const double * s,int n,double *d,const double **buf)
3080- {
3081- (void )n;
3082- (void )d;
3083- *buf = s;
3084- }
3085- void v_inter_area_copy_or_not (const float * s,int n,float *d,const float **buf)
3086- {
3087- (void )n;
3088- (void )d;
3089- *buf = s;
3090- }
3091-
3092- #if CV_SIMD128_64F
3093- template <>
3094- v_float64vx_setall_local (double coeff) {
3095- return v_setall_f64 (coeff);
3096- }
3097- #else
3098- template <>
3099- v_uint8vx_setall_local (double coeff) {
3100- (void )coeff;
3101- return v_setall_u8 (0 );
3102- }
3103- template <>
3104- void v_inter_area_set_sum (int col_end,const double *const buf,const v_uint8 &v_coeff,
3105- double *sum,int &x) {
3106- (void )col_end;
3107- (void )buf;
3108- (void )v_coeff;
3109- (void )sum;
3110- x =0 ;
3111- }
3112- template <>
3113- void v_inter_area_update_sum (int col_end,const double *const buf,const v_uint8 &v_coeff,
3114- double *sum,int &x) {
3115- (void )col_end;
3116- (void )buf;
3117- (void )v_coeff;
3118- (void )sum;
3119- x =0 ;
31203091}
31213092#endif
31223093}
@@ -3144,10 +3115,8 @@ class ResizeArea_Invoker : public ParallelLoopBody
31443115 Size dsize = dst->size ();
31453116const int cn = dst->channels ();
31463117 dsize.width *= cn;
3147- AutoBuffer<WT>_buffer (std::max (src->cols * cn, range.size () * cn));
31483118const DecimateAlpha* xtab = xtab0;
31493119const int xtab_size = xtab_size0;
3150- WT *buf = _buffer.data ();
31513120const int j_start = tabofs[range.start ], j_end = tabofs[range.end ];
31523121
31533122static_assert (
@@ -3183,7 +3152,6 @@ class ResizeArea_Invoker : public ParallelLoopBody
31833152 }
31843153int prev_di = -1 ;
31853154int di =0 ;
3186- const WT* buf_local;
31873155 WT* sum =nullptr ;
31883156for (int j = row_start; j < row_end; ++j)
31893157 {
@@ -3194,37 +3162,31 @@ class ResizeArea_Invoker : public ParallelLoopBody
31943162 coeff = ytab[j].alpha ;
31953163 di = ytab[j].di ;
31963164 si = ytab[j].si ;
3197- const T* S = src->template ptr <T>(si);
3198- // Convert the line to the proper float/double type.
3199- v_inter_area_copy_or_not (S, col_end, buf, &buf_local);
32003165 }
32013166else
32023167 {
32033168 coeff = xtab[j].alpha ;
32043169 di = xtab[j].di / cn;
32053170 si = xtab[j].si / cn;
3206- buf_local = tmp.template ptr <WT>(si);
32073171 }
3208- const VT v_coeff = vx_setall_local<VT>(coeff);
32093172
3210- if (di != prev_di)
3173+ if (di != prev_di) sum = tmp.template ptr <WT>(di - start_di);
3174+
3175+ if (iter ==0 )
32113176 {
3212- sum = tmp.template ptr <WT>(di - start_di);
3213- int x;
3214- v_inter_area_set_sum (col_end, buf_local, v_coeff, sum, x);
3215- for (; x < col_end; ++x) sum[x] = buf_local[x] * coeff;
3216- prev_di = di;
3177+ const T* s = src->template ptr <T>(si);
3178+ v_inter_area_set_or_update_sum<T, WT, VT>(s, col_end, di != prev_di,
3179+ coeff, sum);
32173180 }
32183181else
32193182 {
3220- int x ;
3221- v_inter_area_update_sum (col_end, buf_local, v_coeff, sum, x);
3222- for (; x < col_end; ++x) sum[x] += buf_local[x] * coeff;
3183+ const WT* s = tmp. template ptr <WT>(si) ;
3184+ v_inter_area_set_or_update_sum<WT, WT, VT>(s, col_end, di != prev_di,
3185+ coeff, sum) ;
32233186 }
3187+
3188+ if (di != prev_di) prev_di = di;
32243189 }
3225- // Deal with the last row.
3226- WT* D = tmp.template ptr <WT>(di - start_di);
3227- for (int x =0 ; x < col_end; ++x) D[x] = sum[x];
32283190
32293191 tmp =tmp (cv::Range (0 , di - start_di +1 ),cv::Range (0 , col_end / cn)).t ();
32303192 }