Geant4-11
Functions
mulmod.h File Reference
#include "helpers.h"
#include <cstdint>

Go to the source code of this file.

Functions

static void mod_m (const uint64_t *mul, uint64_t *out)
 
static void mulmod (const uint64_t *in1, uint64_t *inout)
 
static void multiply9x9 (const uint64_t *in1, const uint64_t *in2, uint64_t *out)
 
static void powermod (const uint64_t *base, uint64_t *res, uint64_t n)
 

Function Documentation

◆ mod_m()

static void mod_m ( const uint64_t *  mul,
uint64_t *  out 
)
static

Compute a value congruent to mul modulo m less than 2 ** 576

Parameters
[in]mulproduct from multiply9x9 with 18 numbers of 64 bits each
[out]outresult with 9 numbers of 64 bits each

$ m = 2^{576} - 2^{240} + 1 $

The result in out is guaranteed to be smaller than the modulus.

Definition at line 143 of file mulmod.h.

143 {
144 uint64_t r[9];
145 // Assign r = t0
146 for (int i = 0; i < 9; i++) {
147 r[i] = mul[i];
148 }
149
150 int64_t c = compute_r(mul + 9, r);
151
152 // To update r = r - c * m, it suffices to know c * (-2 ** 240 + 1)
153 // because the 2 ** 576 will cancel out. Also note that c may be zero, but
154 // the operation is still performed to avoid branching.
155
156 // c * (-2 ** 240 + 1) in 576 bits looks as follows, depending on c:
157 // - if c = 0, the number is zero.
158 // - if c = 1: bits 576 to 240 are set,
159 // bits 239 to 1 are zero, and
160 // the last one is set
161 // - if c = -1, which corresponds to all bits set (signed int64_t):
162 // bits 576 to 240 are zero and the rest is set.
163 // Note that all bits except the last are exactly complimentary (unless c = 0)
164 // and the last byte is conveniently represented by c already.
165 // Now construct the three bit patterns from c, their names correspond to the
166 // assembly implementation by Alexei Sibidanov.
167
168 // c = 0 -> t0 = 0; c = 1 -> t0 = 0; c = -1 -> all bits set (sign extension)
169 // (The assembly implementation shifts by 63, which gives the same result.)
170 int64_t t0 = c >> 1;
171
172 // Left shifting negative values is undefined behavior until C++20, cast to
173 // unsigned.
174 uint64_t c_unsigned = static_cast<uint64_t>(c);
175
176 // c = 0 -> t2 = 0; c = 1 -> upper 16 bits set; c = -1 -> lower 48 bits set
177 int64_t t2 = t0 - (c_unsigned << 48);
178
179 // c = 0 -> t1 = 0; c = 1 -> all bits set (sign extension); c = -1 -> t1 = 0
180 // (The assembly implementation shifts by 63, which gives the same result.)
181 int64_t t1 = t2 >> 48;
182
183 unsigned carry = 0;
184 {
185 uint64_t r_0 = r[0];
186
187 uint64_t out_0 = sub_carry(r_0, c, carry);
188 out[0] = out_0;
189 }
190 for (int i = 1; i < 3; i++) {
191 uint64_t r_i = r[i];
192 r_i = sub_overflow(r_i, carry, carry);
193
194 uint64_t out_i = sub_carry(r_i, t0, carry);
195 out[i] = out_i;
196 }
197 {
198 uint64_t r_3 = r[3];
199 r_3 = sub_overflow(r_3, carry, carry);
200
201 uint64_t out_3 = sub_carry(r_3, t2, carry);
202 out[3] = out_3;
203 }
204 for (int i = 4; i < 9; i++) {
205 uint64_t r_i = r[i];
206 r_i = sub_overflow(r_i, carry, carry);
207
208 uint64_t out_i = sub_carry(r_i, t1, carry);
209 out[i] = out_i;
210 }
211}
static uint64_t sub_overflow(uint64_t a, uint64_t b, unsigned &overflow)
Compute a - b and set overflow accordingly.
Definition: helpers.h:34
static uint64_t sub_carry(uint64_t a, uint64_t b, unsigned &carry)
Compute a - b and increment carry if there was an overflow.
Definition: helpers.h:42
static int64_t compute_r(const uint64_t *upper, uint64_t *r)
Definition: helpers.h:58

References compute_r(), sub_carry(), and sub_overflow().

Referenced by mulmod(), and powermod().

◆ mulmod()

static void mulmod ( const uint64_t *  in1,
uint64_t *  inout 
)
static

Combine multiply9x9 and mod_m with internal temporary storage

Parameters
[in]in1first factor with 9 numbers of 64 bits each
[in,out]inoutsecond factor and also the output of the same size

The result in inout is guaranteed to be smaller than the modulus.

Definition at line 219 of file mulmod.h.

219 {
220 uint64_t mul[2 * 9] = {0};
221 multiply9x9(in1, inout, mul);
222 mod_m(mul, inout);
223}
static void mod_m(const uint64_t *mul, uint64_t *out)
Definition: mulmod.h:143
static void multiply9x9(const uint64_t *in1, const uint64_t *in2, uint64_t *out)
Definition: mulmod.h:22

References mod_m(), and multiply9x9().

Referenced by CLHEP::RanluxppEngine::advance(), CLHEP::RanluxppEngine::setSeed(), and CLHEP::RanluxppEngine::skip().

◆ multiply9x9()

static void multiply9x9 ( const uint64_t *  in1,
const uint64_t *  in2,
uint64_t *  out 
)
static

Multiply two 576 bit numbers, stored as 9 numbers of 64 bits each

Parameters
[in]in1first factor as 9 numbers of 64 bits each
[in]in2second factor as 9 numbers of 64 bits each
[out]outresult with 18 numbers of 64 bits each

Definition at line 22 of file mulmod.h.

23 {
24 uint64_t next = 0;
25 unsigned nextCarry = 0;
26
27#if defined(__clang__) || defined(__INTEL_COMPILER)
28#pragma unroll
29#elif defined(__GNUC__) && __GNUC__ >= 8
30// This pragma was introduced in GCC version 8.
31#pragma GCC unroll 18
32#endif
33 for (int i = 0; i < 18; i++) {
34 uint64_t current = next;
35 unsigned carry = nextCarry;
36
37 next = 0;
38 nextCarry = 0;
39
40#if defined(__clang__) || defined(__INTEL_COMPILER)
41#pragma unroll
42#elif defined(__GNUC__) && __GNUC__ >= 8
43// This pragma was introduced in GCC version 8.
44#pragma GCC unroll 9
45#endif
46 for (int j = 0; j < 9; j++) {
47 int k = i - j;
48 if (k < 0 || k >= 9)
49 continue;
50
51 uint64_t fac1 = in1[j];
52 uint64_t fac2 = in2[k];
53#if defined(__SIZEOF_INT128__) && !defined(CLHEP_NO_INT128)
54#ifdef __GNUC__
55#pragma GCC diagnostic push
56#pragma GCC diagnostic ignored "-Wpedantic"
57#endif
58 unsigned __int128 prod = fac1;
59 prod = prod * fac2;
60
61 uint64_t upper = prod >> 64;
62 uint64_t lower = static_cast<uint64_t>(prod);
63#ifdef __GNUC__
64#pragma GCC diagnostic pop
65#endif
66#else
67 uint64_t upper1 = fac1 >> 32;
68 uint64_t lower1 = static_cast<uint32_t>(fac1);
69
70 uint64_t upper2 = fac2 >> 32;
71 uint64_t lower2 = static_cast<uint32_t>(fac2);
72
73 // Multiply 32-bit parts, each product has a maximum value of
74 // (2 ** 32 - 1) ** 2 = 2 ** 64 - 2 * 2 ** 32 + 1.
75 uint64_t upper = upper1 * upper2;
76 uint64_t middle1 = upper1 * lower2;
77 uint64_t middle2 = lower1 * upper2;
78 uint64_t lower = lower1 * lower2;
79
80 // When adding the two products, the maximum value for middle is
81 // 2 * 2 ** 64 - 4 * 2 ** 32 + 2, which exceeds a uint64_t.
82 unsigned overflow;
83 uint64_t middle = add_overflow(middle1, middle2, overflow);
84 // Handling the overflow by a multiplication with 0 or 1 is cheaper
85 // than branching with an if statement, which the compiler does not
86 // optimize to this equivalent code. Note that we could do entirely
87 // without this overflow handling when summing up the intermediate
88 // products differently as described in the following SO answer:
89 // https://stackoverflow.com/a/51587262
90 // However, this approach takes at least the same amount of thinking
91 // why a) the code gives the same results without b) overflowing due
92 // to the mixture of 32 bit arithmetic. Moreover, my tests show that
93 // the scheme implemented here is actually slightly more performant.
94 uint64_t overflow_add = overflow * (uint64_t(1) << 32);
95 // This addition can never overflow because the maximum value of upper
96 // is 2 ** 64 - 2 * 2 ** 32 + 1 (see above). When now adding another
97 // 2 ** 32, the result is 2 ** 64 - 2 ** 32 + 1 and still smaller than
98 // the maximum 2 ** 64 - 1 that can be stored in a uint64_t.
99 upper += overflow_add;
100
101 uint64_t middle_upper = middle >> 32;
102 uint64_t middle_lower = middle << 32;
103
104 lower = add_overflow(lower, middle_lower, overflow);
105 upper += overflow;
106
107 // This still can't overflow since the maximum of middle_upper is
108 // - 2 ** 32 - 4 if there was an overflow for middle above, bringing
109 // the maximum value of upper to 2 ** 64 - 2.
110 // - otherwise upper still has the initial maximum value given above
111 // and the addition of a value smaller than 2 ** 32 brings it to
112 // a maximum value of 2 ** 64 - 2 ** 32 + 2.
113 // (Both cases include the increment to handle the overflow in lower.)
114 //
115 // All the reasoning makes perfect sense given that the product of two
116 // 64 bit numbers is smaller than or equal to
117 // (2 ** 64 - 1) ** 2 = 2 ** 128 - 2 * 2 ** 64 + 1
118 // with the upper bits matching the 2 ** 64 - 2 of the first case.
119 upper += middle_upper;
120#endif
121
122 // Add to current, remember carry.
123 current = add_carry(current, lower, carry);
124
125 // Add to next, remember nextCarry.
126 next = add_carry(next, upper, nextCarry);
127 }
128
129 next = add_carry(next, carry, nextCarry);
130
131 out[i] = current;
132 }
133}
static uint64_t add_carry(uint64_t a, uint64_t b, unsigned &carry)
Compute a + b and increment carry if there was an overflow.
Definition: helpers.h:24
static uint64_t add_overflow(uint64_t a, uint64_t b, unsigned &overflow)
Compute a + b and set overflow accordingly.
Definition: helpers.h:16

References add_carry(), and add_overflow().

Referenced by mulmod(), and powermod().

◆ powermod()

static void powermod ( const uint64_t *  base,
uint64_t *  res,
uint64_t  n 
)
static

Compute base to the n modulo m

Parameters
[in]basewith 9 numbers of 64 bits each
[out]resoutput with 9 numbers of 64 bits each
[in]nexponent

The arguments base and res may point to the same location.

Definition at line 232 of file mulmod.h.

232 {
233 uint64_t fac[9] = {0};
234 fac[0] = base[0];
235 res[0] = 1;
236 for (int i = 1; i < 9; i++) {
237 fac[i] = base[i];
238 res[i] = 0;
239 }
240
241 uint64_t mul[18] = {0};
242 while (n) {
243 if (n & 1) {
244 multiply9x9(res, fac, mul);
245 mod_m(mul, res);
246 }
247 n >>= 1;
248 if (!n)
249 break;
250 multiply9x9(fac, fac, mul);
251 mod_m(mul, fac);
252 }
253}
static const G4double fac

References fac, mod_m(), multiply9x9(), and CLHEP::detail::n.

Referenced by CLHEP::RanluxppEngine::setSeed(), and CLHEP::RanluxppEngine::skip().