Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit7644a73

Browse files
Micro-optimize pg_lfind32().
This commit improves the performance of pg_lfind32() in many casesby modifying it to process the remaining "tail" of elements withSIMD instructions instead of processing them one-by-one. Since theSIMD code processes a large block of elements, this means that wewill process a subset of elements more than once, but that won'taffect the correctness of the result, and testing has shown thatthis helps more cases than it regresses. With this change, thestandard one-by-one linear search code is only used for smallarrays and for platforms without SIMD support.Suggested-by: John NaylorReviewed-by: John NaylorDiscussion:https://postgr.es/m/20231129171526.GA857928%40nathanxps13
1 parenta65724d commit7644a73

File tree

1 file changed

+76
-38
lines changed

1 file changed

+76
-38
lines changed

‎src/include/port/pg_lfind.h

Lines changed: 76 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,51 @@ pg_lfind8_le(uint8 key, uint8 *base, uint32 nelem)
8080
return false;
8181
}
8282

83+
#ifndefUSE_NO_SIMD
84+
/*
85+
* pg_lfind32_simd_helper
86+
*
87+
* Searches one 4-register-block of integers. The caller is responsible for
88+
* ensuring that there are at least 4-registers-worth of integers remaining.
89+
*/
90+
staticinlinebool
91+
pg_lfind32_simd_helper(constVector32keys,uint32*base)
92+
{
93+
constuint32nelem_per_vector=sizeof(Vector32) /sizeof(uint32);
94+
Vector32vals1,
95+
vals2,
96+
vals3,
97+
vals4,
98+
result1,
99+
result2,
100+
result3,
101+
result4,
102+
tmp1,
103+
tmp2,
104+
result;
105+
106+
/* load the next block into 4 registers */
107+
vector32_load(&vals1,base);
108+
vector32_load(&vals2,&base[nelem_per_vector]);
109+
vector32_load(&vals3,&base[nelem_per_vector*2]);
110+
vector32_load(&vals4,&base[nelem_per_vector*3]);
111+
112+
/* compare each value to the key */
113+
result1=vector32_eq(keys,vals1);
114+
result2=vector32_eq(keys,vals2);
115+
result3=vector32_eq(keys,vals3);
116+
result4=vector32_eq(keys,vals4);
117+
118+
/* combine the results into a single variable */
119+
tmp1=vector32_or(result1,result2);
120+
tmp2=vector32_or(result3,result4);
121+
result=vector32_or(tmp1,tmp2);
122+
123+
/* return whether there was a match */
124+
returnvector32_is_highbit_set(result);
125+
}
126+
#endif/* ! USE_NO_SIMD */
127+
83128
/*
84129
* pg_lfind32
85130
*
@@ -95,8 +140,7 @@ pg_lfind32(uint32 key, uint32 *base, uint32 nelem)
95140

96141
/*
97142
* For better instruction-level parallelism, each loop iteration operates
98-
* on a block of four registers. Testing for SSE2 has showed this is ~40%
99-
* faster than using a block of two registers.
143+
* on a block of four registers.
100144
*/
101145
constVector32keys=vector32_broadcast(key);/* load copies of key */
102146
constuint32nelem_per_vector=sizeof(Vector32) /sizeof(uint32);
@@ -109,57 +153,51 @@ pg_lfind32(uint32 key, uint32 *base, uint32 nelem)
109153
boolassert_result= false;
110154

111155
/* pre-compute the result for assert checking */
112-
for (i=0;i<nelem;i++)
156+
for (intj=0;j<nelem;j++)
113157
{
114-
if (key==base[i])
158+
if (key==base[j])
115159
{
116160
assert_result= true;
117161
break;
118162
}
119163
}
120164
#endif
121165

122-
for (i=0;i<tail_idx;i+=nelem_per_iteration)
166+
/*
167+
* If there aren't enough elements for the SIMD code, jump to the standard
168+
* one-by-one linear search code.
169+
*/
170+
if (nelem<nelem_per_iteration)
171+
gotoone_by_one;
172+
173+
/*
174+
* Process as many elements as possible with a block of 4 registers.
175+
*/
176+
do
123177
{
124-
Vector32vals1,
125-
vals2,
126-
vals3,
127-
vals4,
128-
result1,
129-
result2,
130-
result3,
131-
result4,
132-
tmp1,
133-
tmp2,
134-
result;
135-
136-
/* load the next block into 4 registers */
137-
vector32_load(&vals1,&base[i]);
138-
vector32_load(&vals2,&base[i+nelem_per_vector]);
139-
vector32_load(&vals3,&base[i+nelem_per_vector*2]);
140-
vector32_load(&vals4,&base[i+nelem_per_vector*3]);
141-
142-
/* compare each value to the key */
143-
result1=vector32_eq(keys,vals1);
144-
result2=vector32_eq(keys,vals2);
145-
result3=vector32_eq(keys,vals3);
146-
result4=vector32_eq(keys,vals4);
147-
148-
/* combine the results into a single variable */
149-
tmp1=vector32_or(result1,result2);
150-
tmp2=vector32_or(result3,result4);
151-
result=vector32_or(tmp1,tmp2);
152-
153-
/* see if there was a match */
154-
if (vector32_is_highbit_set(result))
178+
if (pg_lfind32_simd_helper(keys,&base[i]))
155179
{
156180
Assert(assert_result== true);
157181
return true;
158182
}
159-
}
183+
184+
i+=nelem_per_iteration;
185+
186+
}while (i<tail_idx);
187+
188+
/*
189+
* Process the last 'nelem_per_iteration' elements in the array with a
190+
* 4-register block. This will cause us to check a subset of the elements
191+
* more than once, but that won't affect correctness, and testing has
192+
* demonstrated that this helps more cases than it harms.
193+
*/
194+
Assert(assert_result==pg_lfind32_simd_helper(keys,&base[nelem-nelem_per_iteration]));
195+
returnpg_lfind32_simd_helper(keys,&base[nelem-nelem_per_iteration]);
196+
160197
#endif/* ! USE_NO_SIMD */
161198

162-
/* Process the remaining elements one at a time. */
199+
one_by_one:
200+
/* Process the elements one at a time. */
163201
for (;i<nelem;i++)
164202
{
165203
if (key==base[i])

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp