@@ -38,24 +38,49 @@ namespace cp_algo {
38
38
};
39
39
}
40
40
41
- [[gnu::always_inline]]inline u64x4montgomery_reduce (u64x4 x,u64x4 mod,u64x4 imod) {
42
- auto x_ninv =u64x4 (u32x8 (x) *u32x8 (imod));
41
+ [[gnu::always_inline]]inline u64x4montgomery_reduce (u64x4 x,uint32_t mod,uint32_t imod) {
42
+ auto x_ninv =u64x4 (u32x8 (x) *( u32x8 () + imod));
43
43
#ifdef __AVX2__
44
- x +=u64x4 (_mm256_mul_epu32 (__m256i (x_ninv),__m256i (mod) ));
44
+ x +=u64x4 (_mm256_mul_epu32 (__m256i (x_ninv),__m256i () + mod ));
45
45
#else
46
46
x += x_ninv * mod;
47
47
#endif
48
48
return x >>32 ;
49
49
}
50
50
51
- [[gnu::always_inline]]inline u64x4montgomery_mul (u64x4 x, u64x4 y,u64x4 mod,u64x4 imod) {
51
+ [[gnu::always_inline]]inline u64x4montgomery_mul (u64x4 x, u64x4 y,uint32_t mod,uint32_t imod) {
52
52
#ifdef __AVX2__
53
53
return montgomery_reduce (u64x4 (_mm256_mul_epu32 (__m256i (x),__m256i (y))), mod, imod);
54
54
#else
55
55
return montgomery_reduce (x * y, mod, imod);
56
56
#endif
57
57
}
58
58
59
+ u32x8montgomery_mul (u32x8 x, u32x8 y,uint32_t mod,uint32_t imod) {
60
+ auto x0246 =u64x4 (x) &uint32_t (-1 );
61
+ auto y0246 =u64x4 (y) &uint32_t (-1 );
62
+ auto x1357 =u64x4 (x) >>32 ;
63
+ auto y1357 =u64x4 (y) >>32 ;
64
+ #ifdef __AVX2__
65
+ auto xy0246 =u64x4 (_mm256_mul_epu32 (__m256i (x0246),__m256i (y0246)));
66
+ auto xy1357 =u64x4 (_mm256_mul_epu32 (__m256i (x1357),__m256i (y1357)));
67
+ #else
68
+ u64x4 xy0246 = x0246 * y0246;
69
+ u64x4 xy1357 = x1357 * y1357;
70
+ #endif
71
+ auto xy_inv =u64x4 (u32x8 (xy0246 | (xy1357 <<32 )) * (u32x8 () + imod));
72
+ auto xy_inv0246 = xy_inv &uint32_t (-1 );
73
+ auto xy_inv1357 = xy_inv >>32 ;
74
+ #ifdef __AVX2__
75
+ xy0246 +=u64x4 (_mm256_mul_epu32 (__m256i (xy_inv0246),__m256i () + mod));
76
+ xy1357 +=u64x4 (_mm256_mul_epu32 (__m256i (xy_inv1357),__m256i () + mod));
77
+ #else
78
+ xy0246 += xy_inv0246 * mod;
79
+ xy1357 += xy_inv1357 * mod;
80
+ #endif
81
+ return u32x8 ((xy0246 >>32 ) | (xy1357 & -1ULL <<32 ));
82
+ }
83
+
59
84
[[gnu::always_inline]]inline dx4rotate_right (dx4 x) {
60
85
static constexpr u64x4 shuffler = {3 ,0 ,1 ,2 };
61
86
return __builtin_shuffle (x, shuffler);