@@ -41,6 +41,12 @@ namespace cp_algo {
     [[gnu::always_inline]] inline u64x4 low32(u64x4 x) {
         return x & uint32_t(-1);
     }
+    [[gnu::always_inline]] inline auto rotr(auto x) {
+        return decltype(x)(__builtin_shufflevector(u32x8(x), u32x8(x), 1, 2, 3, 0, 5, 6, 7, 4));
+    }
+    [[gnu::always_inline]] inline auto rotl(auto x) {
+        return decltype(x)(__builtin_shufflevector(u32x8(x), u32x8(x), 3, 0, 1, 2, 7, 4, 5, 6));
+    }
 
     [[gnu::always_inline]] inline u64x4 montgomery_reduce(u64x4 x, uint32_t mod, uint32_t imod) {
 #ifdef __AVX2__
@@ -50,7 +56,7 @@ namespace cp_algo {
         auto x_ninv = x * imod;
         x += low32(x_ninv) * mod;
 #endif
-        return x >> 32;
+        return rotr(x);
     }
 
     [[gnu::always_inline]] inline u64x4 montgomery_mul(u64x4 x, u64x4 y, uint32_t mod, uint32_t imod) {
@@ -60,16 +66,10 @@ namespace cp_algo {
         return montgomery_reduce(low32(x) * low32(y), mod, imod);
 #endif
     }
-
     [[gnu::always_inline]] inline u32x8 montgomery_mul(u32x8 x, u32x8 y, uint32_t mod, uint32_t imod) {
-        auto x0246 = u64x4(x);
-        auto y0246 = u64x4(y);
-        auto x1357 = u64x4(x) >> 32;
-        auto y1357 = u64x4(y) >> 32;
-        return u32x8(montgomery_mul(x0246, y0246, mod, imod)) |
-               u32x8(montgomery_mul(x1357, y1357, mod, imod) << 32);
+        return u32x8(montgomery_mul(u64x4(x), u64x4(y), mod, imod)) |
+               u32x8(rotl(montgomery_mul(u64x4(rotr(x)), u64x4(rotr(y)), mod, imod)));
     }
-
     [[gnu::always_inline]] inline dx4 rotate_right(dx4 x) {
         static constexpr u64x4 shuffler = {3, 0, 1, 2};
         return __builtin_shuffle(x, shuffler);
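
Below the patch: a standalone sketch of the lane bookkeeping that makes rotr a drop-in replacement for `x >> 32` here. It is not part of the patch, and the u32x8/u64x4 typedefs are assumptions modeled on common GCC vector-extension definitions; cp_algo's real ones live elsewhere and may differ. The key invariant: after `x += low32(x_ninv) * mod`, every 64-bit lane of x is divisible by 2^32, so its low 32 bits are zero and rotating the 32-bit elements of each 128-bit half down by one produces the same lanes as the old shift. rotl is the exact inverse of rotr, which is what lets the u32x8 overload rotate odd elements into even positions, reduce there, and rotate the results back, presumably folding the old `>> 32` / `<< 32` extraction and reinsertion shifts into shuffles.

// Standalone sketch (not part of the patch). Assumptions: the u32x8/u64x4
// typedefs mimic GCC vector extensions, and the lane layout is little-endian
// (x86, as implied by the AVX2 path). Needs GCC 12+/Clang and C++20 for
// __builtin_shufflevector and auto parameters.
#include <cstdint>
#include <cassert>

using u32x8 = uint32_t __attribute__((vector_size(32)));
using u64x4 = uint64_t __attribute__((vector_size(32)));

// Same shuffles as in the patch: rotate the four 32-bit elements of each
// 128-bit half down (rotr) or up (rotl) by one position.
[[gnu::always_inline]] inline auto rotr(auto x) {
    return decltype(x)(__builtin_shufflevector(u32x8(x), u32x8(x), 1, 2, 3, 0, 5, 6, 7, 4));
}
[[gnu::always_inline]] inline auto rotl(auto x) {
    return decltype(x)(__builtin_shufflevector(u32x8(x), u32x8(x), 3, 0, 1, 2, 7, 4, 5, 6));
}

int main() {
    // After `x += low32(x_ninv) * mod`, every 64-bit lane is divisible by
    // 2^32; model that with lanes whose low 32 bits are zero.
    u64x4 x = {0xDEADBEEFull << 32, 0x01234567ull << 32,
               0x89ABCDEFull << 32, 0x42424242ull << 32};
    // On such lanes rotr agrees with the old `x >> 32`: the high half of
    // each 64-bit lane lands in its low half, while the zero low halves
    // land in the (ignored) high halves of neighboring lanes.
    u64x4 r = rotr(x), s = x >> 32;
    for (int i = 0; i < 4; i++) assert(r[i] == s[i]);
    // rotl undoes rotr, so the u32x8 overload can move odd elements into
    // even positions, reduce, and move the results back before OR-ing
    // with the even-element products.
    u32x8 y = {1, 2, 3, 4, 5, 6, 7, 8};
    u32x8 z = rotl(rotr(y));
    for (int i = 0; i < 8; i++) assert(z[i] == y[i]);
}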