+/*-------------------------------------------------------------------------
+ *
+ * pg_lfind.h
+ *    Optimized linear search routines.
+ *
+ * Copyright (c) 2022, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *    src/include/port/pg_lfind.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef PG_LFIND_H
+#define PG_LFIND_H
+
+#include "port/simd.h"
+
+/*
+ * pg_lfind32
+ *
+ * Return true if there is an element in 'base' that equals 'key', otherwise
+ * return false.
+ */
+static inline bool
+pg_lfind32(uint32 key, uint32 *base, uint32 nelem)
+{
+	uint32		i = 0;
+
+	/* Use SIMD intrinsics where available. */
+#ifdef USE_SSE2
+
+	/*
+	 * A 16-byte register only has four 4-byte lanes. For better
+	 * instruction-level parallelism, each loop iteration operates on a block
+	 * of four registers. Testing has shown this is ~40% faster than using a
+	 * block of two registers.
+	 */
+	const __m128i keys = _mm_set1_epi32(key);	/* load 4 copies of key */
+	uint32		iterations = nelem & ~0xF;	/* round down to multiple of 16 */
+
+#if defined(USE_ASSERT_CHECKING)
+	bool		assert_result = false;
+
+	/* pre-compute the result for assert checking */
+	for (i = 0; i < nelem; i++)
+	{
+		if (key == base[i])
+		{
+			assert_result = true;
+			break;
+		}
+	}
+#endif
+
+	for (i = 0; i < iterations; i += 16)
+	{
+		/* load the next block into 4 registers holding 4 values each */
+		const __m128i vals1 = _mm_loadu_si128((__m128i *) &base[i]);
+		const __m128i vals2 = _mm_loadu_si128((__m128i *) &base[i + 4]);
+		const __m128i vals3 = _mm_loadu_si128((__m128i *) &base[i + 8]);
+		const __m128i vals4 = _mm_loadu_si128((__m128i *) &base[i + 12]);
+
+		/* compare each value to the key */
+		const __m128i result1 = _mm_cmpeq_epi32(keys, vals1);
+		const __m128i result2 = _mm_cmpeq_epi32(keys, vals2);
+		const __m128i result3 = _mm_cmpeq_epi32(keys, vals3);
+		const __m128i result4 = _mm_cmpeq_epi32(keys, vals4);
+
+		/* combine the results into a single variable */
+		const __m128i tmp1 = _mm_or_si128(result1, result2);
+		const __m128i tmp2 = _mm_or_si128(result3, result4);
+		const __m128i result = _mm_or_si128(tmp1, tmp2);
+
+		/* see if there was a match */
+		if (_mm_movemask_epi8(result) != 0)
+		{
+#if defined(USE_ASSERT_CHECKING)
+			Assert(assert_result == true);
+#endif
+			return true;
+		}
+	}
+#endif							/* USE_SSE2 */
+
+	/* Process the remaining elements one at a time. */
+	for (; i < nelem; i++)
+	{
+		if (key == base[i])
+		{
+#if defined(USE_SSE2) && defined(USE_ASSERT_CHECKING)
+			Assert(assert_result == true);
+#endif
+			return true;
+		}
+	}
+
+#if defined(USE_SSE2) && defined(USE_ASSERT_CHECKING)
+	Assert(assert_result == false);
+#endif
+	return false;
+}
+
+#endif							/* PG_LFIND_H */
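
For context, a caller might use the new routine roughly as in the following minimal sketch (a hypothetical wrapper, not part of the patch); pg_lfind32 simply reports whether 'target' appears anywhere in the array, and behaves the same with or without SSE2:

    #include "postgres.h"
    #include "port/pg_lfind.h"

    /* Hypothetical helper: report whether 'target' occurs in an array of XIDs. */
    static bool
    xid_array_contains(uint32 *xids, uint32 nxids, uint32 target)
    {
        /* SIMD is only an internal fast path; the result is a plain bool. */
        return pg_lfind32(target, xids, nxids);
    }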