NotificationsYou must be signed in to change notification settings
Fork6
Star31

Commit7644a73

committed

Micro-optimize pg_lfind32().

This commit improves the performance of pg_lfind32() in many casesby modifying it to process the remaining "tail" of elements withSIMD instructions instead of processing them one-by-one. Since theSIMD code processes a large block of elements, this means that wewill process a subset of elements more than once, but that won'taffect the correctness of the result, and testing has shown thatthis helps more cases than it regresses. With this change, thestandard one-by-one linear search code is only used for smallarrays and for platforms without SIMD support.Suggested-by: John NaylorReviewed-by: John NaylorDiscussion:https://postgr.es/m/20231129171526.GA857928%40nathanxps13

1 parenta65724d commit7644a73Copy full SHA for 7644a73

File tree

1 file changed

+76

-38

lines changed

src/include/port
- pg_lfind.h

1 file changed

+76

-38

lines changed

`‎src/include/port/pg_lfind.h‎`

Lines changed: 76 additions & 38 deletions

Original file line number	Diff line number	Diff line change
`@@ -80,6 +80,51 @@ pg_lfind8_le(uint8 key, uint8 *base, uint32 nelem)`
`80`	`80`	`return false;`
`81`	`81`	`}`
`82`	`82`
	`83`	`+#ifndefUSE_NO_SIMD`
	`84`	`+/*`
	`85`	`+ * pg_lfind32_simd_helper`
	`86`	`+ *`
	`87`	`+ * Searches one 4-register-block of integers. The caller is responsible for`
	`88`	`+ * ensuring that there are at least 4-registers-worth of integers remaining.`
	`89`	`+ */`
	`90`	`+staticinlinebool`
	`91`	`+pg_lfind32_simd_helper(constVector32keys,uint32*base)`
	`92`	`+{`
	`93`	`+constuint32nelem_per_vector=sizeof(Vector32) /sizeof(uint32);`
	`94`	`+Vector32vals1,`
	`95`	`+vals2,`
	`96`	`+vals3,`
	`97`	`+vals4,`
	`98`	`+result1,`
	`99`	`+result2,`
	`100`	`+result3,`
	`101`	`+result4,`
	`102`	`+tmp1,`
	`103`	`+tmp2,`
	`104`	`+result;`
	`105`	`+`
	`106`	`+/* load the next block into 4 registers */`
	`107`	`+vector32_load(&vals1,base);`
	`108`	`+vector32_load(&vals2,&base[nelem_per_vector]);`
	`109`	`+vector32_load(&vals3,&base[nelem_per_vector*2]);`
	`110`	`+vector32_load(&vals4,&base[nelem_per_vector*3]);`
	`111`	`+`
	`112`	`+/* compare each value to the key */`
	`113`	`+result1=vector32_eq(keys,vals1);`
	`114`	`+result2=vector32_eq(keys,vals2);`
	`115`	`+result3=vector32_eq(keys,vals3);`
	`116`	`+result4=vector32_eq(keys,vals4);`
	`117`	`+`
	`118`	`+/* combine the results into a single variable */`
	`119`	`+tmp1=vector32_or(result1,result2);`
	`120`	`+tmp2=vector32_or(result3,result4);`
	`121`	`+result=vector32_or(tmp1,tmp2);`
	`122`	`+`
	`123`	`+/* return whether there was a match */`
	`124`	`+returnvector32_is_highbit_set(result);`
	`125`	`+}`
	`126`	`+#endif/* ! USE_NO_SIMD */`
	`127`	`+`
`83`	`128`	`/*`
`84`	`129`	`* pg_lfind32`
`85`	`130`	`*`
`@@ -95,8 +140,7 @@ pg_lfind32(uint32 key, uint32 *base, uint32 nelem)`
`95`	`140`
`96`	`141`	`/*`
`97`	`142`	`* For better instruction-level parallelism, each loop iteration operates`
`98`		`- * on a block of four registers. Testing for SSE2 has showed this is ~40%`
`99`		`- * faster than using a block of two registers.`
	`143`	`+ * on a block of four registers.`
`100`	`144`	`*/`
`101`	`145`	`constVector32keys=vector32_broadcast(key);/* load copies of key */`
`102`	`146`	`constuint32nelem_per_vector=sizeof(Vector32) /sizeof(uint32);`
`@@ -109,57 +153,51 @@ pg_lfind32(uint32 key, uint32 *base, uint32 nelem)`
`109`	`153`	`boolassert_result= false;`
`110`	`154`
`111`	`155`	`/* pre-compute the result for assert checking */`
`112`		`-for (i=0;i<nelem;i++)`
	`156`	`+for (intj=0;j<nelem;j++)`
`113`	`157`	`{`
`114`		`-if (key==base[i])`
	`158`	`+if (key==base[j])`
`115`	`159`	`{`
`116`	`160`	`assert_result= true;`
`117`	`161`	`break;`
`118`	`162`	`}`
`119`	`163`	`}`
`120`	`164`	`#endif`
`121`	`165`
`122`		`-for (i=0;i<tail_idx;i+=nelem_per_iteration)`
	`166`	`+/*`
	`167`	`+ * If there aren't enough elements for the SIMD code, jump to the standard`
	`168`	`+ * one-by-one linear search code.`
	`169`	`+ */`
	`170`	`+if (nelem<nelem_per_iteration)`
	`171`	`+gotoone_by_one;`
	`172`	`+`
	`173`	`+/*`
	`174`	`+ * Process as many elements as possible with a block of 4 registers.`
	`175`	`+ */`
	`176`	`+do`
`123`	`177`	`{`
`124`		`-Vector32vals1,`
`125`		`-vals2,`
`126`		`-vals3,`
`127`		`-vals4,`
`128`		`-result1,`
`129`		`-result2,`
`130`		`-result3,`
`131`		`-result4,`
`132`		`-tmp1,`
`133`		`-tmp2,`
`134`		`-result;`
`135`		`-`
`136`		`-/* load the next block into 4 registers */`
`137`		`-vector32_load(&vals1,&base[i]);`
`138`		`-vector32_load(&vals2,&base[i+nelem_per_vector]);`
`139`		`-vector32_load(&vals3,&base[i+nelem_per_vector*2]);`
`140`		`-vector32_load(&vals4,&base[i+nelem_per_vector*3]);`
`141`		`-`
`142`		`-/* compare each value to the key */`
`143`		`-result1=vector32_eq(keys,vals1);`
`144`		`-result2=vector32_eq(keys,vals2);`
`145`		`-result3=vector32_eq(keys,vals3);`
`146`		`-result4=vector32_eq(keys,vals4);`
`147`		`-`
`148`		`-/* combine the results into a single variable */`
`149`		`-tmp1=vector32_or(result1,result2);`
`150`		`-tmp2=vector32_or(result3,result4);`
`151`		`-result=vector32_or(tmp1,tmp2);`
`152`		`-`
`153`		`-/* see if there was a match */`
`154`		`-if (vector32_is_highbit_set(result))`
	`178`	`+if (pg_lfind32_simd_helper(keys,&base[i]))`
`155`	`179`	`{`
`156`	`180`	`Assert(assert_result== true);`
`157`	`181`	`return true;`
`158`	`182`	`}`
`159`		`-}`
	`183`	`+`
	`184`	`+i+=nelem_per_iteration;`
	`185`	`+`
	`186`	`+}while (i<tail_idx);`
	`187`	`+`
	`188`	`+/*`
	`189`	`+ * Process the last 'nelem_per_iteration' elements in the array with a`
	`190`	`+ * 4-register block. This will cause us to check a subset of the elements`
	`191`	`+ * more than once, but that won't affect correctness, and testing has`
	`192`	`+ * demonstrated that this helps more cases than it harms.`
	`193`	`+ */`
	`194`	`+Assert(assert_result==pg_lfind32_simd_helper(keys,&base[nelem-nelem_per_iteration]));`
	`195`	`+returnpg_lfind32_simd_helper(keys,&base[nelem-nelem_per_iteration]);`
	`196`	`+`
`160`	`197`	`#endif/* ! USE_NO_SIMD */`
`161`	`198`
`162`		`-/* Process the remaining elements one at a time. */`
	`199`	`+one_by_one:`
	`200`	`+/* Process the elements one at a time. */`
`163`	`201`	`for (;i<nelem;i++)`
`164`	`202`	`{`
`165`	`203`	`if (key==base[i])`

0 commit comments

Comments

(0)

Movatterモバイル変換

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit7644a73

File tree

1 file changed

1 file changed

`‎src/include/port/pg_lfind.h‎`

0 commit comments