Commit 9c3c5c5

j-naylor authored and pull[bot] committed
Abstract some more architecture-specific details away from SIMD functionality
Add a typedef to represent vectors containing four 32-bit integers, and add
functions operating on them. Also separate out saturating subtraction into
its own function. The motivation for this is to prepare for a future commit
to add ARM NEON support.

Nathan Bossart

Reviewed by John Naylor and Tom Lane

Discussion: https://www.postgresql.org/message-id/flat/CAFBsxsEyR9JkfbPcDXBRYEfdfC__OkwVGdwEAgY4Rv0cvw35EA%40mail.gmail.com#aba7a64b11503494ffd8dd27067626a9
1 parent 151b968 · commit 9c3c5c5
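Since the stated motivation is ARM NEON support, the sketch below shows one way the new Vector32 helpers could be backed by NEON intrinsics instead of SSE2. This is a hypothetical illustration, not the actual follow-up commit: it assumes <arm_neon.h> and the standard vld1q_u32 / vdupq_n_u32 / vceqq_u32 / vorrq_u32 intrinsics, and it uses the C standard uint32_t in place of PostgreSQL's uint32 typedef so it stands alone.

/*
 * Hypothetical sketch only -- not the PostgreSQL follow-up commit.  Shows
 * how the Vector32 helpers introduced in this commit could be implemented
 * with ARM NEON, assuming <arm_neon.h> is available.
 */
#include <arm_neon.h>
#include <stdint.h>

typedef uint32x4_t Vector32;        /* four 32-bit lanes, like __m128i */

static inline void
vector32_load(Vector32 *v, const uint32_t *s)
{
    *v = vld1q_u32(s);              /* load 4 x uint32, no alignment required */
}

static inline Vector32
vector32_broadcast(const uint32_t c)
{
    return vdupq_n_u32(c);          /* copy c into every lane */
}

static inline Vector32
vector32_eq(const Vector32 v1, const Vector32 v2)
{
    return vceqq_u32(v1, v2);       /* all-ones lanes where equal */
}

static inline Vector32
vector32_or(const Vector32 v1, const Vector32 v2)
{
    return vorrq_u32(v1, v2);       /* bitwise OR across the register */
}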

File tree

src/include/port/pg_lfind.h
src/include/port/simd.h

2 files changed: +121 -43 lines changed

‎src/include/port/pg_lfind.h

Lines changed: 39 additions & 26 deletions

@@ -91,16 +91,19 @@ pg_lfind32(uint32 key, uint32 *base, uint32 nelem)
 {
     uint32      i = 0;
 
-#ifdef USE_SSE2
+#ifndef USE_NO_SIMD
 
     /*
-     * A 16-byte register only has four 4-byte lanes. For better
-     * instruction-level parallelism, each loop iteration operates on a block
-     * of four registers. Testing has showed this is ~40% faster than using a
-     * block of two registers.
+     * For better instruction-level parallelism, each loop iteration operates
+     * on a block of four registers. Testing for SSE2 has showed this is ~40%
+     * faster than using a block of two registers.
      */
-    const __m128i keys = _mm_set1_epi32(key);   /* load 4 copies of key */
-    uint32      iterations = nelem & ~0xF;      /* round down to multiple of 16 */
+    const Vector32 keys = vector32_broadcast(key);  /* load copies of key */
+    const uint32 nelem_per_vector = sizeof(Vector32) / sizeof(uint32);
+    const uint32 nelem_per_iteration = 4 * nelem_per_vector;
+
+    /* round down to multiple of elements per iteration */
+    const uint32 tail_idx = nelem & ~(nelem_per_iteration - 1);
 
 #if defined(USE_ASSERT_CHECKING)
     bool        assert_result = false;

@@ -116,49 +119,59 @@ pg_lfind32(uint32 key, uint32 *base, uint32 nelem)
     }
 #endif
 
-    for (i = 0; i < iterations; i += 16)
+    for (i = 0; i < tail_idx; i += nelem_per_iteration)
     {
-        /* load the next block into 4 registers holding 4 values each */
-        const __m128i vals1 = _mm_loadu_si128((__m128i *) &base[i]);
-        const __m128i vals2 = _mm_loadu_si128((__m128i *) &base[i + 4]);
-        const __m128i vals3 = _mm_loadu_si128((__m128i *) &base[i + 8]);
-        const __m128i vals4 = _mm_loadu_si128((__m128i *) &base[i + 12]);
+        Vector32    vals1,
+                    vals2,
+                    vals3,
+                    vals4,
+                    result1,
+                    result2,
+                    result3,
+                    result4,
+                    tmp1,
+                    tmp2,
+                    result;
+
+        /* load the next block into 4 registers */
+        vector32_load(&vals1, &base[i]);
+        vector32_load(&vals2, &base[i + nelem_per_vector]);
+        vector32_load(&vals3, &base[i + nelem_per_vector * 2]);
+        vector32_load(&vals4, &base[i + nelem_per_vector * 3]);
 
         /* compare each value to the key */
-        const __m128i result1 = _mm_cmpeq_epi32(keys, vals1);
-        const __m128i result2 = _mm_cmpeq_epi32(keys, vals2);
-        const __m128i result3 = _mm_cmpeq_epi32(keys, vals3);
-        const __m128i result4 = _mm_cmpeq_epi32(keys, vals4);
+        result1 = vector32_eq(keys, vals1);
+        result2 = vector32_eq(keys, vals2);
+        result3 = vector32_eq(keys, vals3);
+        result4 = vector32_eq(keys, vals4);
 
         /* combine the results into a single variable */
-        const __m128i tmp1 = _mm_or_si128(result1, result2);
-        const __m128i tmp2 = _mm_or_si128(result3, result4);
-        const __m128i result = _mm_or_si128(tmp1, tmp2);
+        tmp1 = vector32_or(result1, result2);
+        tmp2 = vector32_or(result3, result4);
+        result = vector32_or(tmp1, tmp2);
 
         /* see if there was a match */
-        if (_mm_movemask_epi8(result) != 0)
+        if (vector8_is_highbit_set((Vector8) result))
         {
-#if defined(USE_ASSERT_CHECKING)
             Assert(assert_result == true);
-#endif
             return true;
         }
     }
-#endif                          /* USE_SSE2 */
+#endif                          /* ! USE_NO_SIMD */
 
     /* Process the remaining elements one at a time. */
     for (; i < nelem; i++)
     {
         if (key == base[i])
         {
-#if defined(USE_SSE2) && defined(USE_ASSERT_CHECKING)
+#ifndef USE_NO_SIMD
             Assert(assert_result == true);
 #endif
             return true;
         }
     }
 
-#if defined(USE_SSE2) && defined(USE_ASSERT_CHECKING)
+#ifndef USE_NO_SIMD
     Assert(assert_result == false);
 #endif
     return false;

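For readers outside the PostgreSQL tree, here is a minimal standalone sketch of the loop shape the new code uses: round the element count down to a multiple of the per-iteration block (tail_idx), scan whole blocks, then fall back to one-element-at-a-time processing for the remainder. The helper name, block size, and test data below are made up for illustration; the real pg_lfind32 compares each block with four Vector32 operations rather than a scalar inner loop.

/*
 * Illustrative sketch (not PostgreSQL code): the same block-then-tail loop
 * shape used by pg_lfind32 above, but with plain scalar loads so it can be
 * compiled standalone.  The block size of 16 mirrors four 4-lane vectors.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool
lfind32_blocked(uint32_t key, const uint32_t *base, uint32_t nelem)
{
    const uint32_t nelem_per_iteration = 16;    /* 4 vectors x 4 lanes */
    /* round down to a multiple of the elements handled per iteration */
    const uint32_t tail_idx = nelem & ~(nelem_per_iteration - 1);
    uint32_t    i = 0;

    for (; i < tail_idx; i += nelem_per_iteration)
    {
        bool        found = false;

        /* a vectorized build would cover these 16 elements with 4 SIMD compares */
        for (uint32_t j = 0; j < nelem_per_iteration; j++)
            found |= (base[i + j] == key);

        if (found)
            return true;
    }

    /* process the remaining elements one at a time */
    for (; i < nelem; i++)
        if (base[i] == key)
            return true;

    return false;
}

int
main(void)
{
    uint32_t    vals[37];

    for (uint32_t i = 0; i < 37; i++)
        vals[i] = i * 7;                /* arbitrary test data */

    printf("%d %d\n", lfind32_blocked(21, vals, 37),    /* 1: present */
           lfind32_blocked(22, vals, 37));              /* 0: absent */
    return 0;
}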
‎src/include/port/simd.h

Lines changed: 82 additions & 17 deletions

@@ -31,22 +31,32 @@
 #include <emmintrin.h>
 #define USE_SSE2
 typedef __m128i Vector8;
+typedef __m128i Vector32;
 
 #else
 /*
  * If no SIMD instructions are available, we can in some cases emulate vector
- * operations using bitwise operations on unsigned integers.
+ * operations using bitwise operations on unsigned integers.  Note that many
+ * of the functions in this file presently do not have non-SIMD
+ * implementations.  In particular, none of the functions involving Vector32
+ * are implemented without SIMD since it's likely not worthwhile to represent
+ * two 32-bit integers using a uint64.
  */
 #define USE_NO_SIMD
 typedef uint64 Vector8;
 #endif
 
-
 /* load/store operations */
 static inline void vector8_load(Vector8 *v, const uint8 *s);
+#ifndef USE_NO_SIMD
+static inline void vector32_load(Vector32 *v, const uint32 *s);
+#endif
 
 /* assignment operations */
 static inline Vector8 vector8_broadcast(const uint8 c);
+#ifndef USE_NO_SIMD
+static inline Vector32 vector32_broadcast(const uint32 c);
+#endif
 
 /* element-wise comparisons to a scalar */
 static inline bool vector8_has(const Vector8 v, const uint8 c);

@@ -56,14 +66,21 @@ static inline bool vector8_is_highbit_set(const Vector8 v);
 
 /* arithmetic operations */
 static inline Vector8 vector8_or(const Vector8 v1, const Vector8 v2);
-
-/* Different semantics for SIMD architectures. */
 #ifndef USE_NO_SIMD
+static inline Vector32 vector32_or(const Vector32 v1, const Vector32 v2);
+static inline Vector8 vector8_ssub(const Vector8 v1, const Vector8 v2);
+#endif
 
-/* comparisons between vectors */
+/*
+ * comparisons between vectors
+ *
+ * Note: These return a vector rather than boolean, which is why we don't
+ * have non-SIMD implementations.
+ */
+#ifndef USE_NO_SIMD
 static inline Vector8 vector8_eq(const Vector8 v1, const Vector8 v2);
-
-#endif                          /* ! USE_NO_SIMD */
+static inline Vector32 vector32_eq(const Vector32 v1, const Vector32 v2);
+#endif
 
 /*
  * Load a chunk of memory into the given vector.

@@ -78,6 +95,15 @@ vector8_load(Vector8 *v, const uint8 *s)
 #endif
 }
 
+#ifndef USE_NO_SIMD
+static inline void
+vector32_load(Vector32 *v, const uint32 *s)
+{
+#ifdef USE_SSE2
+    *v = _mm_loadu_si128((const __m128i *) s);
+#endif
+}
+#endif                          /* ! USE_NO_SIMD */
 
 /*
  * Create a vector with all elements set to the same value.

@@ -92,6 +118,16 @@ vector8_broadcast(const uint8 c)
 #endif
 }
 
+#ifndef USE_NO_SIMD
+static inline Vector32
+vector32_broadcast(const uint32 c)
+{
+#ifdef USE_SSE2
+    return _mm_set1_epi32(c);
+#endif
+}
+#endif                          /* ! USE_NO_SIMD */
+
 /*
  * Return true if any elements in the vector are equal to the given scalar.
  */

@@ -118,7 +154,7 @@ vector8_has(const Vector8 v, const uint8 c)
     /* any bytes in v equal to c will evaluate to zero via XOR */
     result = vector8_has_zero(v ^ vector8_broadcast(c));
 #elif defined(USE_SSE2)
-    result = _mm_movemask_epi8(_mm_cmpeq_epi8(v, vector8_broadcast(c)));
+    result = vector8_is_highbit_set(vector8_eq(v, vector8_broadcast(c)));
 #endif
 
     Assert(assert_result == result);

@@ -133,8 +169,8 @@ vector8_has_zero(const Vector8 v)
 {
 #if defined(USE_NO_SIMD)
     /*
-     * We cannot call vector8_has() here, because that would lead to a circular
-     * definition.
+     * We cannot call vector8_has() here, because that would lead to a
+     * circular definition.
      */
     return vector8_has_le(v, 0);
 #elif defined(USE_SSE2)

@@ -150,9 +186,6 @@ static inline bool
 vector8_has_le(const Vector8 v, const uint8 c)
 {
     bool        result = false;
-#if defined(USE_SSE2)
-    __m128i     sub;
-#endif
 
     /* pre-compute the result for assert checking */
 #ifdef USE_ASSERT_CHECKING

@@ -194,10 +227,10 @@ vector8_has_le(const Vector8 v, const uint8 c)
 
     /*
      * Use saturating subtraction to find bytes <= c, which will present as
-     * NUL bytes in 'sub'.
+     * NUL bytes.  This approach is a workaround for the lack of unsigned
+     * comparison instructions on some architectures.
      */
-    sub = _mm_subs_epu8(v, vector8_broadcast(c));
-    result = vector8_has_zero(sub);
+    result = vector8_has_zero(vector8_ssub(v, vector8_broadcast(c)));
 #endif
 
     Assert(assert_result == result);

@@ -230,22 +263,54 @@ vector8_or(const Vector8 v1, const Vector8 v2)
 #endif
 }
 
+#ifndef USE_NO_SIMD
+static inline Vector32
+vector32_or(const Vector32 v1, const Vector32 v2)
+{
+#ifdef USE_SSE2
+    return _mm_or_si128(v1, v2);
+#endif
+}
+#endif                          /* ! USE_NO_SIMD */
 
-/* Different semantics for SIMD architectures. */
+/*
+ * Return the result of subtracting the respective elements of the input
+ * vectors using saturation (i.e., if the operation would yield a value less
+ * than zero, zero is returned instead).  For more information on saturation
+ * arithmetic, see https://en.wikipedia.org/wiki/Saturation_arithmetic
+ */
 #ifndef USE_NO_SIMD
+static inline Vector8
+vector8_ssub(const Vector8 v1, const Vector8 v2)
+{
+#ifdef USE_SSE2
+    return _mm_subs_epu8(v1, v2);
+#endif
+}
+#endif                          /* ! USE_NO_SIMD */
 
 /*
  * Return a vector with all bits set in each lane where the corresponding
  * lanes in the inputs are equal.
  */
+#ifndef USE_NO_SIMD
 static inline Vector8
 vector8_eq(const Vector8 v1, const Vector8 v2)
 {
 #ifdef USE_SSE2
     return _mm_cmpeq_epi8(v1, v2);
 #endif
 }
+#endif                          /* ! USE_NO_SIMD */
 
+#ifndef USE_NO_SIMD
+static inline Vector32
+vector32_eq(const Vector32 v1, const Vector32 v2)
+{
+#ifdef USE_SSE2
+    return _mm_cmpeq_epi32(v1, v2);
+#endif
+}
 #endif                          /* ! USE_NO_SIMD */
 
 #endif                          /* SIMD_H */

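The new comment in vector8_has_le describes the trick that vector8_ssub now encapsulates: SSE2 lacks an unsigned byte comparison, so subtracting c from every byte with saturation turns every byte that was <= c into zero, which is then easy to detect. Below is a small standalone demonstration of that idea using the raw SSE2 intrinsics the new helpers wrap (_mm_subs_epu8, _mm_cmpeq_epi8, _mm_movemask_epi8); the function and test data are illustrative only, not PostgreSQL code.

/*
 * Standalone illustration (not PostgreSQL code) of the saturating-subtraction
 * trick wrapped by vector8_ssub/vector8_has_le: bytes <= c clamp to zero after
 * the subtraction, and zero bytes are then found with an equality compare
 * against an all-zero vector.  Requires an SSE2-capable x86/x86-64 compiler.
 */
#include <emmintrin.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool
any_byte_le(const uint8_t *bytes16, uint8_t c)
{
    const __m128i v = _mm_loadu_si128((const __m128i *) bytes16);
    /* saturating subtract: lanes <= c clamp to zero instead of wrapping */
    const __m128i sub = _mm_subs_epu8(v, _mm_set1_epi8((char) c));
    /* lanes that became zero compare equal to the all-zero vector */
    const __m128i zero_lanes = _mm_cmpeq_epi8(sub, _mm_setzero_si128());

    return _mm_movemask_epi8(zero_lanes) != 0;
}

int
main(void)
{
    uint8_t     data[16] = {200, 180, 90, 250, 130, 140, 150, 160,
                            170, 190, 210, 220, 230, 240, 99, 101};

    printf("%d %d\n", any_byte_le(data, 100),   /* 1: 90 and 99 are <= 100 */
           any_byte_le(data, 80));              /* 0: no byte is <= 80 */
    return 0;
}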