Commit 9c3c5c5

j-naylor authored and pull[bot] committed
Abstract some more architecture-specific details away from SIMD functionality
Add a typedef to represent vectors containing four 32-bit integers, and add
functions operating on them. Also separate out saturating subtraction into
its own function. The motivation for this is to prepare for a future commit
to add ARM NEON support.

Nathan Bossart

Reviewed by John Naylor and Tom Lane

Discussion: https://www.postgresql.org/message-id/flat/CAFBsxsEyR9JkfbPcDXBRYEfdfC__OkwVGdwEAgY4Rv0cvw35EA%40mail.gmail.com#aba7a64b11503494ffd8dd27067626a9
1 parent 151b968 · commit 9c3c5c5
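Since the stated motivation is ARM NEON support, the sketch below shows one way the new Vector32 helpers could be backed by NEON intrinsics instead of SSE2. This is a hypothetical illustration, not the actual follow-up commit: it assumes <arm_neon.h> and the standard vld1q_u32 / vdupq_n_u32 / vceqq_u32 / vorrq_u32 intrinsics, and it uses the C standard uint32_t in place of PostgreSQL's uint32 typedef so it stands alone.

/*
 * Hypothetical sketch only -- not the PostgreSQL follow-up commit.  Shows
 * how the Vector32 helpers introduced in this commit could be implemented
 * with ARM NEON, assuming <arm_neon.h> is available.
 */
#include <arm_neon.h>
#include <stdint.h>

typedef uint32x4_t Vector32;        /* four 32-bit lanes, like __m128i */

static inline void
vector32_load(Vector32 *v, const uint32_t *s)
{
    *v = vld1q_u32(s);              /* load 4 x uint32, no alignment required */
}

static inline Vector32
vector32_broadcast(const uint32_t c)
{
    return vdupq_n_u32(c);          /* copy c into every lane */
}

static inline Vector32
vector32_eq(const Vector32 v1, const Vector32 v2)
{
    return vceqq_u32(v1, v2);       /* all-ones lanes where equal */
}

static inline Vector32
vector32_or(const Vector32 v1, const Vector32 v2)
{
    return vorrq_u32(v1, v2);       /* bitwise OR across the register */
}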

File tree

src/include/port/pg_lfind.h
src/include/port/simd.h

2 files changed: +121 -43 lines changed

‎src/include/port/pg_lfind.h

Lines changed: 39 additions & 26 deletions

@@ -91,16 +91,19 @@ pg_lfind32(uint32 key, uint32 *base, uint32 nelem)
 {
     uint32      i = 0;
 
-#ifdef USE_SSE2
+#ifndef USE_NO_SIMD
 
     /*
-     * A 16-byte register only has four 4-byte lanes. For better
-     * instruction-level parallelism, each loop iteration operates on a block
-     * of four registers. Testing has showed this is ~40% faster than using a
-     * block of two registers.
+     * For better instruction-level parallelism, each loop iteration operates
+     * on a block of four registers. Testing for SSE2 has showed this is ~40%
+     * faster than using a block of two registers.
      */
-    const __m128i keys = _mm_set1_epi32(key);   /* load 4 copies of key */
-    uint32      iterations = nelem & ~0xF;      /* round down to multiple of 16 */
+    const Vector32 keys = vector32_broadcast(key);  /* load copies of key */
+    const uint32 nelem_per_vector = sizeof(Vector32) / sizeof(uint32);
+    const uint32 nelem_per_iteration = 4 * nelem_per_vector;
+
+    /* round down to multiple of elements per iteration */
+    const uint32 tail_idx = nelem & ~(nelem_per_iteration - 1);
 
 #if defined(USE_ASSERT_CHECKING)
     bool        assert_result = false;

@@ -116,49 +119,59 @@ pg_lfind32(uint32 key, uint32 *base, uint32 nelem)
     }
 #endif
 
-    for (i = 0; i < iterations; i += 16)
+    for (i = 0; i < tail_idx; i += nelem_per_iteration)
     {
-        /* load the next block into 4 registers holding 4 values each */
-        const __m128i vals1 = _mm_loadu_si128((__m128i *) &base[i]);
-        const __m128i vals2 = _mm_loadu_si128((__m128i *) &base[i + 4]);
-        const __m128i vals3 = _mm_loadu_si128((__m128i *) &base[i + 8]);
-        const __m128i vals4 = _mm_loadu_si128((__m128i *) &base[i + 12]);
+        Vector32    vals1,
+                    vals2,
+                    vals3,
+                    vals4,
+                    result1,
+                    result2,
+                    result3,
+                    result4,
+                    tmp1,
+                    tmp2,
+                    result;
+
+        /* load the next block into 4 registers */
+        vector32_load(&vals1, &base[i]);
+        vector32_load(&vals2, &base[i + nelem_per_vector]);
+        vector32_load(&vals3, &base[i + nelem_per_vector * 2]);
+        vector32_load(&vals4, &base[i + nelem_per_vector * 3]);
 
         /* compare each value to the key */
-        const __m128i result1 = _mm_cmpeq_epi32(keys, vals1);
-        const __m128i result2 = _mm_cmpeq_epi32(keys, vals2);
-        const __m128i result3 = _mm_cmpeq_epi32(keys, vals3);
-        const __m128i result4 = _mm_cmpeq_epi32(keys, vals4);
+        result1 = vector32_eq(keys, vals1);
+        result2 = vector32_eq(keys, vals2);
+        result3 = vector32_eq(keys, vals3);
+        result4 = vector32_eq(keys, vals4);
 
         /* combine the results into a single variable */
-        const __m128i tmp1 = _mm_or_si128(result1, result2);
-        const __m128i tmp2 = _mm_or_si128(result3, result4);
-        const __m128i result = _mm_or_si128(tmp1, tmp2);
+        tmp1 = vector32_or(result1, result2);
+        tmp2 = vector32_or(result3, result4);
+        result = vector32_or(tmp1, tmp2);
 
         /* see if there was a match */
-        if (_mm_movemask_epi8(result) != 0)
+        if (vector8_is_highbit_set((Vector8) result))
         {
-#if defined(USE_ASSERT_CHECKING)
             Assert(assert_result == true);
-#endif
             return true;
         }
     }
-#endif                          /* USE_SSE2 */
+#endif                          /* ! USE_NO_SIMD */
 
     /* Process the remaining elements one at a time. */
     for (; i < nelem; i++)
     {
         if (key == base[i])
         {
-#if defined(USE_SSE2) && defined(USE_ASSERT_CHECKING)
+#ifndef USE_NO_SIMD
             Assert(assert_result == true);
 #endif
             return true;
         }
     }
 
-#if defined(USE_SSE2) && defined(USE_ASSERT_CHECKING)
+#ifndef USE_NO_SIMD
     Assert(assert_result == false);
 #endif
     return false;

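For readers outside the PostgreSQL tree, here is a minimal standalone sketch of the loop shape the new code uses: round the element count down to a multiple of the per-iteration block (tail_idx), scan whole blocks, then fall back to one-element-at-a-time processing for the remainder. The helper name, block size, and test data below are made up for illustration; the real pg_lfind32 compares each block with four Vector32 operations rather than a scalar inner loop.

/*
 * Illustrative sketch (not PostgreSQL code): the same block-then-tail loop
 * shape used by pg_lfind32 above, but with plain scalar loads so it can be
 * compiled standalone.  The block size of 16 mirrors four 4-lane vectors.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool
lfind32_blocked(uint32_t key, const uint32_t *base, uint32_t nelem)
{
    const uint32_t nelem_per_iteration = 16;    /* 4 vectors x 4 lanes */
    /* round down to a multiple of the elements handled per iteration */
    const uint32_t tail_idx = nelem & ~(nelem_per_iteration - 1);
    uint32_t    i = 0;

    for (; i < tail_idx; i += nelem_per_iteration)
    {
        bool        found = false;

        /* a vectorized build would cover these 16 elements with 4 SIMD compares */
        for (uint32_t j = 0; j < nelem_per_iteration; j++)
            found |= (base[i + j] == key);

        if (found)
            return true;
    }

    /* process the remaining elements one at a time */
    for (; i < nelem; i++)
        if (base[i] == key)
            return true;

    return false;
}

int
main(void)
{
    uint32_t    vals[37];

    for (uint32_t i = 0; i < 37; i++)
        vals[i] = i * 7;                /* arbitrary test data */

    printf("%d %d\n", lfind32_blocked(21, vals, 37),    /* 1: present */
           lfind32_blocked(22, vals, 37));              /* 0: absent */
    return 0;
}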
‎src/include/port/simd.h

Lines changed: 82 additions & 17 deletions

@@ -31,22 +31,32 @@
 #include <emmintrin.h>
 #define USE_SSE2
 typedef __m128i Vector8;
+typedef __m128i Vector32;
 
 #else
 /*
  * If no SIMD instructions are available, we can in some cases emulate vector
- * operations using bitwise operations on unsigned integers.
+ * operations using bitwise operations on unsigned integers.  Note that many
+ * of the functions in this file presently do not have non-SIMD
+ * implementations.  In particular, none of the functions involving Vector32
+ * are implemented without SIMD since it's likely not worthwhile to represent
+ * two 32-bit integers using a uint64.
  */
 #define USE_NO_SIMD
 typedef uint64 Vector8;
 #endif
 
-
 /* load/store operations */
 static inline void vector8_load(Vector8 *v, const uint8 *s);
+#ifndef USE_NO_SIMD
+static inline void vector32_load(Vector32 *v, const uint32 *s);
+#endif
 
 /* assignment operations */
 static inline Vector8 vector8_broadcast(const uint8 c);
+#ifndef USE_NO_SIMD
+static inline Vector32 vector32_broadcast(const uint32 c);
+#endif
 
 /* element-wise comparisons to a scalar */
 static inline bool vector8_has(const Vector8 v, const uint8 c);

@@ -56,14 +66,21 @@ static inline bool vector8_is_highbit_set(const Vector8 v);
 
 /* arithmetic operations */
 static inline Vector8 vector8_or(const Vector8 v1, const Vector8 v2);
-
-/* Different semantics for SIMD architectures. */
 #ifndef USE_NO_SIMD
+static inline Vector32 vector32_or(const Vector32 v1, const Vector32 v2);
+static inline Vector8 vector8_ssub(const Vector8 v1, const Vector8 v2);
+#endif
 
-/* comparisons between vectors */
+/*
+ * comparisons between vectors
+ *
+ * Note: These return a vector rather than boolean, which is why we don't
+ * have non-SIMD implementations.
+ */
+#ifndef USE_NO_SIMD
 static inline Vector8 vector8_eq(const Vector8 v1, const Vector8 v2);
-
-#endif                          /* ! USE_NO_SIMD */
+static inline Vector32 vector32_eq(const Vector32 v1, const Vector32 v2);
+#endif
 
 /*
  * Load a chunk of memory into the given vector.

@@ -78,6 +95,15 @@ vector8_load(Vector8 *v, const uint8 *s)
 #endif
 }
 
+#ifndef USE_NO_SIMD
+static inline void
+vector32_load(Vector32 *v, const uint32 *s)
+{
+#ifdef USE_SSE2
+    *v = _mm_loadu_si128((const __m128i *) s);
+#endif
+}
+#endif                          /* ! USE_NO_SIMD */
 
 /*
  * Create a vector with all elements set to the same value.

@@ -92,6 +118,16 @@ vector8_broadcast(const uint8 c)
 #endif
 }
 
+#ifndef USE_NO_SIMD
+static inline Vector32
+vector32_broadcast(const uint32 c)
+{
+#ifdef USE_SSE2
+    return _mm_set1_epi32(c);
+#endif
+}
+#endif                          /* ! USE_NO_SIMD */
+
 /*
  * Return true if any elements in the vector are equal to the given scalar.
  */

@@ -118,7 +154,7 @@ vector8_has(const Vector8 v, const uint8 c)
     /* any bytes in v equal to c will evaluate to zero via XOR */
     result = vector8_has_zero(v ^ vector8_broadcast(c));
 #elif defined(USE_SSE2)
-    result = _mm_movemask_epi8(_mm_cmpeq_epi8(v, vector8_broadcast(c)));
+    result = vector8_is_highbit_set(vector8_eq(v, vector8_broadcast(c)));
 #endif
 
     Assert(assert_result == result);

@@ -133,8 +169,8 @@ vector8_has_zero(const Vector8 v)
 {
 #if defined(USE_NO_SIMD)
     /*
-     * We cannot call vector8_has() here, because that would lead to a circular
-     * definition.
+     * We cannot call vector8_has() here, because that would lead to a
+     * circular definition.
      */
     return vector8_has_le(v, 0);
 #elif defined(USE_SSE2)

@@ -150,9 +186,6 @@ static inline bool
 vector8_has_le(const Vector8 v, const uint8 c)
 {
     bool        result = false;
-#if defined(USE_SSE2)
-    __m128i     sub;
-#endif
 
     /* pre-compute the result for assert checking */
 #ifdef USE_ASSERT_CHECKING

@@ -194,10 +227,10 @@ vector8_has_le(const Vector8 v, const uint8 c)
 
     /*
      * Use saturating subtraction to find bytes <= c, which will present as
-     * NUL bytes in 'sub'.
+     * NUL bytes.  This approach is a workaround for the lack of unsigned
+     * comparison instructions on some architectures.
      */
-    sub = _mm_subs_epu8(v, vector8_broadcast(c));
-    result = vector8_has_zero(sub);
+    result = vector8_has_zero(vector8_ssub(v, vector8_broadcast(c)));
 #endif
 
     Assert(assert_result == result);

@@ -230,22 +263,54 @@ vector8_or(const Vector8 v1, const Vector8 v2)
 #endif
 }
 
+#ifndef USE_NO_SIMD
+static inline Vector32
+vector32_or(const Vector32 v1, const Vector32 v2)
+{
+#ifdef USE_SSE2
+    return _mm_or_si128(v1, v2);
+#endif
+}
+#endif                          /* ! USE_NO_SIMD */
 
-/* Different semantics for SIMD architectures. */
+/*
+ * Return the result of subtracting the respective elements of the input
+ * vectors using saturation (i.e., if the operation would yield a value less
+ * than zero, zero is returned instead).  For more information on saturation
+ * arithmetic, see https://en.wikipedia.org/wiki/Saturation_arithmetic
+ */
 #ifndef USE_NO_SIMD
+static inline Vector8
+vector8_ssub(const Vector8 v1, const Vector8 v2)
+{
+#ifdef USE_SSE2
+    return _mm_subs_epu8(v1, v2);
+#endif
+}
+#endif                          /* ! USE_NO_SIMD */
 
 /*
  * Return a vector with all bits set in each lane where the corresponding
  * lanes in the inputs are equal.
  */
+#ifndef USE_NO_SIMD
 static inline Vector8
 vector8_eq(const Vector8 v1, const Vector8 v2)
 {
 #ifdef USE_SSE2
     return _mm_cmpeq_epi8(v1, v2);
 #endif
 }
+#endif                          /* ! USE_NO_SIMD */
 
+#ifndef USE_NO_SIMD
+static inline Vector32
+vector32_eq(const Vector32 v1, const Vector32 v2)
+{
+#ifdef USE_SSE2
+    return _mm_cmpeq_epi32(v1, v2);
+#endif
+}
 #endif                          /* ! USE_NO_SIMD */
 
 #endif                          /* SIMD_H */

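The new comment in vector8_has_le describes the trick that vector8_ssub now encapsulates: SSE2 lacks an unsigned byte comparison, so subtracting c from every byte with saturation turns every byte that was <= c into zero, which is then easy to detect. Below is a small standalone demonstration of that idea using the raw SSE2 intrinsics the new helpers wrap (_mm_subs_epu8, _mm_cmpeq_epi8, _mm_movemask_epi8); the function and test data are illustrative only, not PostgreSQL code.

/*
 * Standalone illustration (not PostgreSQL code) of the saturating-subtraction
 * trick wrapped by vector8_ssub/vector8_has_le: bytes <= c clamp to zero after
 * the subtraction, and zero bytes are then found with an equality compare
 * against an all-zero vector.  Requires an SSE2-capable x86/x86-64 compiler.
 */
#include <emmintrin.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool
any_byte_le(const uint8_t *bytes16, uint8_t c)
{
    const __m128i v = _mm_loadu_si128((const __m128i *) bytes16);
    /* saturating subtract: lanes <= c clamp to zero instead of wrapping */
    const __m128i sub = _mm_subs_epu8(v, _mm_set1_epi8((char) c));
    /* lanes that became zero compare equal to the all-zero vector */
    const __m128i zero_lanes = _mm_cmpeq_epi8(sub, _mm_setzero_si128());

    return _mm_movemask_epi8(zero_lanes) != 0;
}

int
main(void)
{
    uint8_t     data[16] = {200, 180, 90, 250, 130, 140, 150, 160,
                            170, 190, 210, 220, 230, 240, 99, 101};

    printf("%d %d\n", any_byte_le(data, 100),   /* 1: 90 and 99 are <= 100 */
           any_byte_le(data, 80));              /* 0: no byte is <= 80 */
    return 0;
}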