Commit ac1d06d

committed

ximgproc: optimize add_mul using NEON intrinsics for ARM64

1 parent ea9f108, commit ac1d06d (Copy full SHA for ac1d06d)

File tree

+27

-0

lines changed

+27

-0

lines changed

Lines changed: 27 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -60,6 +60,19 @@ inline bool CPU_SUPPORT_SSE1()`
`60`	`60`	`}// end`
`61`	`61`	`#endif`
`62`	`62`
	`63`	`+#if CV_NEON`
	`64`	`+namespace`
	`65`	`+{`
	`66`	`+`
	`67`	`+inline bool CPU_SUPPORT_NEON()`
	`68`	`+{`
	`69`	`+static const bool is_supported = cv::checkHardwareSupport(CV_CPU_NEON);`
	`70`	`+return is_supported;`
	`71`	`+}`
	`72`	`+`
	`73`	`+}// end`
	`74`	`+#endif`
	`75`	`+`
`63`	`76`	`namespace cv`
`64`	`77`	`{`
`65`	`78`	`namespace ximgproc`
`@@ -288,6 +301,20 @@ void add_mul(float *dst, float *src1, float *src2, int w)`
`288`	`301`	`_mm_storeu_ps(dst + j, c);`
`289`	`302`	`}`
`290`	`303`	`}`
	`304`	`+#elif CV_NEON`
	`305`	`+if (CPU_SUPPORT_NEON())`
	`306`	`+ {`
	`307`	`+float32x4_t a, b, c;`
	`308`	`+for (; j < w - 3; j += 4)`
	`309`	`+ {`
	`310`	`+ a = vld1q_f32(src1 + j);`
	`311`	`+ b = vld1q_f32(src2 + j);`
	`312`	`+ b = vmulq_f32(b, a);`
	`313`	`+ c = vld1q_f32(dst + j);`
	`314`	`+ c = vaddq_f32(c, b);`
	`315`	`+ vst1q_f32(dst + j, c);`
	`316`	`+ }`
	`317`	`+ }`
`291`	`318`	`#endif`
`292`	`319`	`for (; j < w; j++)`
`293`	`320`	`{`

Comments

(0)