| bm<uint8_t, AlgType::Std, PatternType::TwoZones>/3000/40 | 42.0 ns | 41.2 ns | 1.02 |
| bm<uint8_t, AlgType::Std, PatternType::TwoZones>/3000/18 | 57.1 ns | 57.2 ns | 1.00 |
| bm<uint8_t, AlgType::Std, PatternType::TwoZones>/3000/16 | 62.5 ns | 63.2 ns | 0.99 |
| bm<uint8_t, AlgType::Std, PatternType::TwoZones>/3000/14 | 69.2 ns | 68.7 ns | 1.01 |
| bm<uint8_t, AlgType::Std, PatternType::TwoZones>/3000/10 | 84.7 ns | 84.8 ns | 1.00 |
| bm<uint8_t, AlgType::Std, PatternType::TwoZones>/3000/8 | 103 ns | 118 ns | 0.87 📈 |
| bm<uint8_t, AlgType::Std, PatternType::TwoZones>/3000/5 | 161 ns | 117 ns | 1.38 📉 |
| bm<uint8_t, AlgType::Std, PatternType::TwoZones>/3000/4 | 191 ns | 134 ns | 1.43 📉 |
| bm<uint8_t, AlgType::Std, PatternType::TwoZones>/3000/3 | 248 ns | 117 ns | 2.12 📉 |
| bm<uint8_t, AlgType::Std, PatternType::TwoZones>/3000/2 | 373 ns | 116 ns | 3.22 📉 |
| bm<uint8_t, AlgType::Std, PatternType::TwoZones>/3000/1 | 18.1 ns | 28.7 ns | 0.63 |
| bm<uint8_t, AlgType::Rng, PatternType::TwoZones>/3000/40 | 42.1 ns | 41.4 ns | 1.02 |
| bm<uint8_t, AlgType::Rng, PatternType::TwoZones>/3000/18 | 57.7 ns | 57.5 ns | 1.00 |
| bm<uint8_t, AlgType::Rng, PatternType::TwoZones>/3000/16 | 63.0 ns | 62.9 ns | 1.00 |
| bm<uint8_t, AlgType::Rng, PatternType::TwoZones>/3000/14 | 70.1 ns | 69.8 ns | 1.00 |
| bm<uint8_t, AlgType::Rng, PatternType::TwoZones>/3000/10 | 86.6 ns | 85.7 ns | 1.01 |
| bm<uint8_t, AlgType::Rng, PatternType::TwoZones>/3000/8 | 104 ns | 120 ns | 0.87 📈 |
| bm<uint8_t, AlgType::Rng, PatternType::TwoZones>/3000/5 | 157 ns | 120 ns | 1.31 📉 |
| bm<uint8_t, AlgType::Rng, PatternType::TwoZones>/3000/4 | 189 ns | 118 ns | 1.60 📉 |
| bm<uint8_t, AlgType::Rng, PatternType::TwoZones>/3000/3 | 244 ns | 118 ns | 2.07 📉 |
| bm<uint8_t, AlgType::Rng, PatternType::TwoZones>/3000/2 | 369 ns | 117 ns | 3.15 📉 |
| bm<uint8_t, AlgType::Rng, PatternType::TwoZones>/3000/1 | 17.7 ns | 28.4 ns | 0.62 |
| bm<uint8_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/40 | 353 ns | 355 ns | 0.99 |
| bm<uint8_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/18 | 461 ns | 459 ns | 1.00 |
| bm<uint8_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/16 | 474 ns | 478 ns | 0.99 |
| bm<uint8_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/14 | 513 ns | 530 ns | 0.97 |
| bm<uint8_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/10 | 636 ns | 633 ns | 1.00 |
| bm<uint8_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/8 | 739 ns | 216 ns | 3.42 📉 |
| bm<uint8_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/5 | 930 ns | 213 ns | 4.37 📉 |
| bm<uint8_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/4 | 972 ns | 218 ns | 4.46 📉 |
| bm<uint8_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/3 | 1014 ns | 222 ns | 4.57 📉 |
| bm<uint8_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/2 | 1053 ns | 220 ns | 4.79 📉 |
| bm<uint8_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/1 | 50.2 ns | 38.7 ns | 1.30 |
| bm<uint8_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/40 | 356 ns | 355 ns | 1.00 |
| bm<uint8_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/18 | 450 ns | 463 ns | 0.97 |
| bm<uint8_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/16 | 480 ns | 483 ns | 0.99 |
| bm<uint8_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/14 | 516 ns | 521 ns | 0.99 |
| bm<uint8_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/10 | 618 ns | 645 ns | 0.96 |
| bm<uint8_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/8 | 736 ns | 216 ns | 3.41 📉 |
| bm<uint8_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/5 | 951 ns | 217 ns | 4.38 📉 |
| bm<uint8_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/4 | 1039 ns | 216 ns | 4.81 📉 |
| bm<uint8_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/3 | 1086 ns | 214 ns | 5.07 📉 |
| bm<uint8_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/2 | 1855 ns | 218 ns | 8.51 📉 |
| bm<uint8_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/1 | 51.8 ns | 38.6 ns | 1.34 |
| bm<uint16_t, AlgType::Std, PatternType::TwoZones>/3000/40 | 58.8 ns | 45.4 ns | 1.30 |
| bm<uint16_t, AlgType::Std, PatternType::TwoZones>/3000/18 | 74.8 ns | 62.7 ns | 1.19 |
| bm<uint16_t, AlgType::Std, PatternType::TwoZones>/3000/16 | 79.1 ns | 68.3 ns | 1.16 |
| bm<uint16_t, AlgType::Std, PatternType::TwoZones>/3000/14 | 82.4 ns | 84.6 ns | 0.97 |
| bm<uint16_t, AlgType::Std, PatternType::TwoZones>/3000/10 | 102 ns | 108 ns | 0.94 |
| bm<uint16_t, AlgType::Std, PatternType::TwoZones>/3000/8 | 131 ns | 135 ns | 0.97 |
| bm<uint16_t, AlgType::Std, PatternType::TwoZones>/3000/5 | 197 ns | 200 ns | 0.99 |
| bm<uint16_t, AlgType::Std, PatternType::TwoZones>/3000/4 | 233 ns | 162 ns | 1.44 📉 |
| bm<uint16_t, AlgType::Std, PatternType::TwoZones>/3000/3 | 311 ns | 161 ns | 1.93 📉 |
| bm<uint16_t, AlgType::Std, PatternType::TwoZones>/3000/2 | 450 ns | 161 ns | 2.80 📉 |
| bm<uint16_t, AlgType::Std, PatternType::TwoZones>/3000/1 | 36.1 ns | 35.1 ns | 1.03 |
| bm<uint16_t, AlgType::Rng, PatternType::TwoZones>/3000/40 | 53.4 ns | 46.6 ns | 1.15 |
| bm<uint16_t, AlgType::Rng, PatternType::TwoZones>/3000/18 | 68.9 ns | 69.3 ns | 0.99 |
| bm<uint16_t, AlgType::Rng, PatternType::TwoZones>/3000/16 | 74.6 ns | 75.7 ns | 0.99 |
| bm<uint16_t, AlgType::Rng, PatternType::TwoZones>/3000/14 | 81.4 ns | 84.2 ns | 0.97 |
| bm<uint16_t, AlgType::Rng, PatternType::TwoZones>/3000/10 | 104 ns | 112 ns | 0.93 |
| bm<uint16_t, AlgType::Rng, PatternType::TwoZones>/3000/8 | 126 ns | 131 ns | 0.96 |
| bm<uint16_t, AlgType::Rng, PatternType::TwoZones>/3000/5 | 191 ns | 195 ns | 0.98 |
| bm<uint16_t, AlgType::Rng, PatternType::TwoZones>/3000/4 | 228 ns | 161 ns | 1.42 📉 |
| bm<uint16_t, AlgType::Rng, PatternType::TwoZones>/3000/3 | 298 ns | 161 ns | 1.85 📉 |
| bm<uint16_t, AlgType::Rng, PatternType::TwoZones>/3000/2 | 454 ns | 160 ns | 2.84 📉 |
| bm<uint16_t, AlgType::Rng, PatternType::TwoZones>/3000/1 | 38.0 ns | 36.0 ns | 1.06 |
| bm<uint16_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/40 | 613 ns | 372 ns | 1.65 |
| bm<uint16_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/18 | 651 ns | 449 ns | 1.45 |
| bm<uint16_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/16 | 710 ns | 476 ns | 1.49 |
| bm<uint16_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/14 | 732 ns | 517 ns | 1.42 |
| bm<uint16_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/10 | 812 ns | 633 ns | 1.28 |
| bm<uint16_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/8 | 873 ns | 706 ns | 1.24 |
| bm<uint16_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/5 | 986 ns | 868 ns | 1.14 |
| bm<uint16_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/4 | 1077 ns | 331 ns | 3.25 📉 |
| bm<uint16_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/3 | 1152 ns | 327 ns | 3.52 📉 |
| bm<uint16_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/2 | 1138 ns | 333 ns | 3.42 📉 |
| bm<uint16_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/1 | 81.8 ns | 79.8 ns | 1.03 |
| bm<uint16_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/40 | 610 ns | 370 ns | 1.65 |
| bm<uint16_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/18 | 660 ns | 454 ns | 1.45 |
| bm<uint16_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/16 | 722 ns | 484 ns | 1.49 |
| bm<uint16_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/14 | 724 ns | 507 ns | 1.43 |
| bm<uint16_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/10 | 816 ns | 625 ns | 1.31 |
| bm<uint16_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/8 | 894 ns | 715 ns | 1.25 |
| bm<uint16_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/5 | 988 ns | 880 ns | 1.12 |
| bm<uint16_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/4 | 1061 ns | 329 ns | 3.22 📉 |
| bm<uint16_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/3 | 1131 ns | 330 ns | 3.43 📉 |
| bm<uint16_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/2 | 1124 ns | 327 ns | 3.44 📉 |
| bm<uint16_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/1 | 81.1 ns | 81.2 ns | 1.00 |
| bm<uint32_t, AlgType::Std, PatternType::TwoZones>/3000/40 | 45.9 ns | 51.4 ns | 0.89 |
| bm<uint32_t, AlgType::Std, PatternType::TwoZones>/3000/18 | 67.8 ns | 70.1 ns | 0.97 |
| bm<uint32_t, AlgType::Std, PatternType::TwoZones>/3000/16 | 74.2 ns | 77.3 ns | 0.96 |
| bm<uint32_t, AlgType::Std, PatternType::TwoZones>/3000/14 | 82.2 ns | 84.8 ns | 0.97 |
| bm<uint32_t, AlgType::Std, PatternType::TwoZones>/3000/10 | 104 ns | 105 ns | 0.99 |
| bm<uint32_t, AlgType::Std, PatternType::TwoZones>/3000/8 | 125 ns | 128 ns | 0.98 |
| bm<uint32_t, AlgType::Std, PatternType::TwoZones>/3000/5 | 184 ns | 187 ns | 0.98 |
| bm<uint32_t, AlgType::Std, PatternType::TwoZones>/3000/4 | 231 ns | 227 ns | 1.02 |
| bm<uint32_t, AlgType::Std, PatternType::TwoZones>/3000/3 | 294 ns | 292 ns | 1.01 |
| bm<uint32_t, AlgType::Std, PatternType::TwoZones>/3000/2 | 430 ns | 244 ns | 1.76 📉 |
| bm<uint32_t, AlgType::Std, PatternType::TwoZones>/3000/1 | 69.9 ns | 68.4 ns | 1.02 |
| bm<uint32_t, AlgType::Rng, PatternType::TwoZones>/3000/40 | 46.4 ns | 51.2 ns | 0.91 |
| bm<uint32_t, AlgType::Rng, PatternType::TwoZones>/3000/18 | 68.5 ns | 71.5 ns | 0.96 |
| bm<uint32_t, AlgType::Rng, PatternType::TwoZones>/3000/16 | 74.5 ns | 77.3 ns | 0.96 |
| bm<uint32_t, AlgType::Rng, PatternType::TwoZones>/3000/14 | 84.3 ns | 85.0 ns | 0.99 |
| bm<uint32_t, AlgType::Rng, PatternType::TwoZones>/3000/10 | 107 ns | 104 ns | 1.03 |
| bm<uint32_t, AlgType::Rng, PatternType::TwoZones>/3000/8 | 131 ns | 125 ns | 1.05 |
| bm<uint32_t, AlgType::Rng, PatternType::TwoZones>/3000/5 | 193 ns | 191 ns | 1.01 |
| bm<uint32_t, AlgType::Rng, PatternType::TwoZones>/3000/4 | 220 ns | 226 ns | 0.97 |
| bm<uint32_t, AlgType::Rng, PatternType::TwoZones>/3000/3 | 293 ns | 294 ns | 1.00 |
| bm<uint32_t, AlgType::Rng, PatternType::TwoZones>/3000/2 | 441 ns | 243 ns | 1.81 📉 |
| bm<uint32_t, AlgType::Rng, PatternType::TwoZones>/3000/1 | 70.0 ns | 70.1 ns | 1.00 |
| bm<uint32_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/40 | 373 ns | 369 ns | 1.01 |
| bm<uint32_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/18 | 501 ns | 521 ns | 0.96 |
| bm<uint32_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/16 | 531 ns | 551 ns | 0.96 |
| bm<uint32_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/14 | 565 ns | 591 ns | 0.96 |
| bm<uint32_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/10 | 699 ns | 771 ns | 0.91 |
| bm<uint32_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/8 | 831 ns | 938 ns | 0.89 |
| bm<uint32_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/5 | 1033 ns | 1069 ns | 0.97 |
| bm<uint32_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/4 | 1154 ns | 1221 ns | 0.95 |
| bm<uint32_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/3 | 1231 ns | 1280 ns | 0.96 |
| bm<uint32_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/2 | 1454 ns | 462 ns | 3.15 📉 |
| bm<uint32_t, AlgType::Std, PatternType::DenseSmallSequences>/3000/1 | 155 ns | 146 ns | 1.06 |
| bm<uint32_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/40 | 377 ns | 408 ns | 0.92 |
| bm<uint32_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/18 | 496 ns | 555 ns | 0.89 |
| bm<uint32_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/16 | 529 ns | 541 ns | 0.98 |
| bm<uint32_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/14 | 562 ns | 601 ns | 0.94 |
| bm<uint32_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/10 | 687 ns | 772 ns | 0.89 |
| bm<uint32_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/8 | 799 ns | 1005 ns | 0.80 |
| bm<uint32_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/5 | 1031 ns | 1276 ns | 0.81 |
| bm<uint32_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/4 | 1157 ns | 1316 ns | 0.88 |
| bm<uint32_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/3 | 1246 ns | 1371 ns | 0.91 |
| bm<uint32_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/2 | 1424 ns | 514 ns | 2.77 📉 |
| bm<uint32_t, AlgType::Rng, PatternType::DenseSmallSequences>/3000/1 | 148 ns | 164 ns | 0.90 |
Uh oh!
There was an error while loading. Please reload this page.
Copy AVX2 path to SSE4.2, adapt it as follows:
- `movemask` output fits 16 bit (despite using 32-bit register), the mask with carry fits 32 bit

Copy AVX2 path to AVX2 tail path, adapt it as follows:
- `_Carry` with shifted `_Msk_with_carry` to take into account previous `_Carry` for tails smaller than `n`.

SSE4.2 benchmark results
These results are actually fake. AVX2 path is artificially disabled on an AVX2-capable CPU.
They also suffer from considerable random variation.
Still they are useful enough to see that SSE4.2 path is indeed helpful.
I've highlighted the relevant results.