// input: array of n integers in the [0, M) range // output: sum modulo M intslow_sum(int *a, int n){ int s = 0; for (int i = 0; i < n; i++) s = (s + a[i]) % M; return s; }
intfast_sum(int *a, int n){ int s = 0; for (int i = 0; i < n; i++) { s += a[i]; // s < 2 * M s = (s >= M ? s - M : s); // will be replaced with cmov } return s; }
intfaster_sum(int *a, int n){ longlong s = 0; // 64-bit integer to handle overflow for (int i = 0; i < n; i++) s += a[i]; // will be vectorized return s % M; }
算法本身就是在计算这个公式,执行两次乘法来计算$q=x⋅n’ \mod r $和 m=q⋅n,然后从x中减去结果,然后通过右移执行除以r操作。
唯一需要注意的是,结果可能不在[0,n)范围内,但是由于
x<n⋅n<r⋅n⟹x/r<n
且
m=q⋅n<r∗n⟹m/r<n
这就能保证
−n<(x−m)/r<n
因此,我们可以简单地检查结果是否为负,若为负,则加上n,有以下算法:
typedef__uint32_t u32; typedef__uint64_t u64;
const u32 n = 1e9 + 7, nr = inverse(n, 1ull << 32);
u32 reduce(u64 x){ u32 q = u32(x) * nr; // q = x * n' mod r u64 m = (u64) q * n; // m = q * n u32 y = (x - m) >> 32; // y = (x - m) / r return x < m ? y + n : y; // if y < 0, add n to make it be in the [0, n) range }
structMontgomery { u32 n, nr; constexprMontgomery(u32 n) : n(n), nr(1) { // log(2^32) = 5 for (int i = 0; i < 5; i++) nr *= 2 - n * nr; }
u32 reduce(u64 x)const{ u32 q = u32(x) * nr; u32 m = ((u64) q * n) >> 32; return (x >> 32) + n - m; // returns a number in the [0, 2 * n - 2] range // (add a "x < n ? x : x - n" type of check if you need a proper modulo) }
u32 multiply(u32 x, u32 y)const{ returnreduce((u64) x * y); }
u32 transform(u32 x)const{ return (u64(x) << 32) % n; // can also be implemented as multiply(x, r^2 mod n) } };
为了测试其性能,我们可以将蒙哥马利乘法插入到二进制幂运算中:
constexpr Montgomery space(M);
intinverse(int _a){ u64 a = space.transform(_a); u64 r = space.transform(1); #pragma GCC unroll(30) for (int l = 0; l < 30; l++) { if ( (M - 2) >> l & 1 ) r = space.multiply(r, a); a = space.multiply(a, a); }