Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit82739d4

Browse files
committed
Use ARM Advanced SIMD (NEON) intrinsics where available
NEON support is required on the Aarch64 architecture for standardimplementations. Hardware designers for specialized markets can choosenot to support it, but that's true of floating point as well, whichwe assume is supported. As with x86, some SIMD support is availableon 32-bit platforms, but those are not interesting from a performancestandpoint and would require an inconvenient runtime check.Nathan BossartReviewed by John Naylor, Andres Freund, Thomas Munro, and Tom LaneDiscussion:https://www.postgresql.org/message-id/flat/CAFBsxsEyR9JkfbPcDXBRYEfdfC__OkwVGdwEAgY4Rv0cvw35EA%40mail.gmail.com#aba7a64b11503494ffd8dd27067626a9
1 parentf8f19f7 commit82739d4

File tree

1 file changed

+37
-3
lines changed

1 file changed

+37
-3
lines changed

‎src/include/port/simd.h

Lines changed: 37 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,20 @@
3333
typedef__m128iVector8;
3434
typedef__m128iVector32;
3535

36+
#elif defined(__aarch64__)&& defined(__ARM_NEON)
37+
/*
38+
* We use the Neon instructions if the compiler provides access to them (as
39+
* indicated by __ARM_NEON) and we are on aarch64. While Neon support is
40+
* technically optional for aarch64, it appears that all available 64-bit
41+
* hardware does have it. Neon exists in some 32-bit hardware too, but we
42+
* could not realistically use it there without a run-time check, which seems
43+
* not worth the trouble for now.
44+
*/
45+
#include<arm_neon.h>
46+
#defineUSE_NEON
47+
typedefuint8x16_tVector8;
48+
typedefuint32x4_tVector32;
49+
3650
#else
3751
/*
3852
* If no SIMD instructions are available, we can in some cases emulate vector
@@ -90,6 +104,8 @@ vector8_load(Vector8 *v, const uint8 *s)
90104
{
91105
#if defined(USE_SSE2)
92106
*v=_mm_loadu_si128((const__m128i*)s);
107+
#elif defined(USE_NEON)
108+
*v=vld1q_u8(s);
93109
#else
94110
memcpy(v,s,sizeof(Vector8));
95111
#endif
@@ -101,6 +117,8 @@ vector32_load(Vector32 *v, const uint32 *s)
101117
{
102118
#ifdefUSE_SSE2
103119
*v=_mm_loadu_si128((const__m128i*)s);
120+
#elif defined(USE_NEON)
121+
*v=vld1q_u32(s);
104122
#endif
105123
}
106124
#endif/* ! USE_NO_SIMD */
@@ -113,6 +131,8 @@ vector8_broadcast(const uint8 c)
113131
{
114132
#if defined(USE_SSE2)
115133
return_mm_set1_epi8(c);
134+
#elif defined(USE_NEON)
135+
returnvdupq_n_u8(c);
116136
#else
117137
return ~UINT64CONST(0) /0xFF*c;
118138
#endif
@@ -124,6 +144,8 @@ vector32_broadcast(const uint32 c)
124144
{
125145
#ifdefUSE_SSE2
126146
return_mm_set1_epi32(c);
147+
#elif defined(USE_NEON)
148+
returnvdupq_n_u32(c);
127149
#endif
128150
}
129151
#endif/* ! USE_NO_SIMD */
@@ -153,7 +175,7 @@ vector8_has(const Vector8 v, const uint8 c)
153175
#if defined(USE_NO_SIMD)
154176
/* any bytes in v equal to c will evaluate to zero via XOR */
155177
result=vector8_has_zero(v ^vector8_broadcast(c));
156-
#elif defined(USE_SSE2)
178+
#else
157179
result=vector8_is_highbit_set(vector8_eq(v,vector8_broadcast(c)));
158180
#endif
159181

@@ -173,7 +195,7 @@ vector8_has_zero(const Vector8 v)
173195
* circular definition.
174196
*/
175197
returnvector8_has_le(v,0);
176-
#elif defined(USE_SSE2)
198+
#else
177199
returnvector8_has(v,0);
178200
#endif
179201
}
@@ -223,7 +245,7 @@ vector8_has_le(const Vector8 v, const uint8 c)
223245
}
224246
}
225247
}
226-
#elif defined(USE_SSE2)
248+
#else
227249

228250
/*
229251
* Use saturating subtraction to find bytes <= c, which will present as
@@ -245,6 +267,8 @@ vector8_is_highbit_set(const Vector8 v)
245267
{
246268
#ifdefUSE_SSE2
247269
return_mm_movemask_epi8(v)!=0;
270+
#elif defined(USE_NEON)
271+
returnvmaxvq_u8(v)>0x7F;
248272
#else
249273
returnv&vector8_broadcast(0x80);
250274
#endif
@@ -258,6 +282,8 @@ vector8_or(const Vector8 v1, const Vector8 v2)
258282
{
259283
#ifdefUSE_SSE2
260284
return_mm_or_si128(v1,v2);
285+
#elif defined(USE_NEON)
286+
returnvorrq_u8(v1,v2);
261287
#else
262288
returnv1 |v2;
263289
#endif
@@ -269,6 +295,8 @@ vector32_or(const Vector32 v1, const Vector32 v2)
269295
{
270296
#ifdefUSE_SSE2
271297
return_mm_or_si128(v1,v2);
298+
#elif defined(USE_NEON)
299+
returnvorrq_u32(v1,v2);
272300
#endif
273301
}
274302
#endif/* ! USE_NO_SIMD */
@@ -285,6 +313,8 @@ vector8_ssub(const Vector8 v1, const Vector8 v2)
285313
{
286314
#ifdefUSE_SSE2
287315
return_mm_subs_epu8(v1,v2);
316+
#elif defined(USE_NEON)
317+
returnvqsubq_u8(v1,v2);
288318
#endif
289319
}
290320
#endif/* ! USE_NO_SIMD */
@@ -299,6 +329,8 @@ vector8_eq(const Vector8 v1, const Vector8 v2)
299329
{
300330
#ifdefUSE_SSE2
301331
return_mm_cmpeq_epi8(v1,v2);
332+
#elif defined(USE_NEON)
333+
returnvceqq_u8(v1,v2);
302334
#endif
303335
}
304336
#endif/* ! USE_NO_SIMD */
@@ -309,6 +341,8 @@ vector32_eq(const Vector32 v1, const Vector32 v2)
309341
{
310342
#ifdefUSE_SSE2
311343
return_mm_cmpeq_epi32(v1,v2);
344+
#elif defined(USE_NEON)
345+
returnvceqq_u32(v1,v2);
312346
#endif
313347
}
314348
#endif/* ! USE_NO_SIMD */

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp