Safegcd inverses, drop Jacobi symbols, remove libgmp #831

sipa wants to merge 16 commits into bitcoin-core:master from sipa:202010_pr767 (34 files changed, +3049 −1650)
  1. sipa commented at 2:37 am on October 12, 2020: contributor

    This is a rebased and squashed version of #767, adding safegcd-based implementations of constant-time and variable-time modular inverses for scalars and field elements, by Peter Dettman. The PR is organized as follows:

    • Add secp256k1_ctz{32,64}_var functions Introduction of ctz functions in util.h (which use __builtin_ctz on recent GCC and Clang, but fall back to a software emulation based on de Bruijn sequences on other platforms; a sketch of that fallback follows this list). These functions aren’t used anywhere yet in this commit, but tests are included.
    • Add safegcd based modular inverse modules Add Peter Dettman’s safegcd code from #767 (without some of his optimizations, which are moved to later commits), turned into separate modules by me.
    • Add extensive comments on the safegcd algorithm and implementation Add a long description of the algorithm and optimizations to doc/safegcd_implementation.md, as well as additional comments to the code itself. It is probably best to review this together with the previous commit (they’re separated to keep authorship).
    • Add tests for modinv modules Adds tests on the modinv interface directly, for arbitrary moduli.
    • Improve bounds checks in modinv modules Adds a lot of sanity checking to the modinv modules.
    • Move secp256k1_scalar_{inverse{_var},is_even} to per-impl files A pure refactor to prepare for switching the field and scalar code to modinv.
    • Make field/scalar code use the new modinv modules for inverses Actually switch over.
    • Add extra modular inverse tests This adds modular inverse tests through the field/scalar interface, now that those use modinv.
    • Remove unused Jacobi symbol support No longer needed.
    • Remove num/gmp support Bye-bye.
    • 3 commits with further optimizations.
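
    A minimal standalone sketch of the de Bruijn fallback mentioned in the first bullet (it reuses the 32-bit table and multiplier that appear in the util.h review thread further down; the 64-bit variant is analogous):

        #include <stdint.h>

        /* Returns the number of trailing zero bits of a nonzero x (like
         * __builtin_ctz, the result for x == 0 is not meaningful). (x & -x)
         * isolates the lowest set bit; multiplying by the de Bruijn constant
         * puts a unique 5-bit pattern into the top bits, which indexes the table. */
        static int ctz32_debruijn(uint32_t x) {
            static const uint8_t debruijn[32] = {
                0x00, 0x01, 0x02, 0x18, 0x03, 0x13, 0x06, 0x19, 0x16, 0x04, 0x14, 0x0A,
                0x10, 0x07, 0x0C, 0x1A, 0x1F, 0x17, 0x12, 0x05, 0x15, 0x09, 0x0F, 0x0B,
                0x1E, 0x11, 0x08, 0x0E, 0x1D, 0x0D, 0x1C, 0x1B
            };
            return debruijn[(uint32_t)((x & -x) * 0x04D7651FU) >> 27];
        }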
  2. sipa commented at 2:38 am on October 12, 2020: contributor

    Ping @peterdettman

    I made the following change to avoid signed overflow:

    diff --git a/src/field_10x26_impl.h b/src/field_10x26_impl.h
    index d5ff75c..a2c16a6 100644
    --- a/src/field_10x26_impl.h
    +++ b/src/field_10x26_impl.h
    @@ -1362,7 +1362,7 @@ static void secp256k1_fe_update_de_30(int32_t *d, int32_t *e, const int32_t *t)
         /* P == 2^256 - 2^32 - C30 */
         const int64_t C30 = 0x3D1L;
         /* I30 == -P^-1 mod 2^30 */
    -    const int32_t I30 = 0x12253531L;
    +    const uint32_t I30 = 0x12253531L;
         const int32_t M30 = (int32_t)(UINT32_MAX >> 2);
         const int32_t u = t[0], v = t[1], q = t[2], r = t[3];
         int32_t di, ei, md, me;
    @@ -1377,8 +1377,8 @@ static void secp256k1_fe_update_de_30(int32_t *d, int32_t *e, const int32_t *t)
    
         /* Calculate the multiples of P to add, to zero the 30 bottom bits. We choose md, me
          * from the centred range [-2^29, 2^29) to keep d, e within [-2^256, 2^256). */
    -    md = (I30 * 4 * (int32_t)cd) >> 2;
    -    me = (I30 * 4 * (int32_t)ce) >> 2;
    +    md = ((int32_t)(I30 * 4 * (uint32_t)cd)) >> 2;
    +    me = ((int32_t)(I30 * 4 * (uint32_t)ce)) >> 2;
    
         cd -= (int64_t)C30 * md;
         ce -= (int64_t)C30 * me;
    diff --git a/src/field_5x52_impl.h b/src/field_5x52_impl.h
    index 2e81fc2..8f7d290 100644
    --- a/src/field_5x52_impl.h
    +++ b/src/field_5x52_impl.h
    @@ -663,7 +663,7 @@ static void secp256k1_fe_update_de_62(int64_t *d, int64_t *e, const int64_t *t)
         /* P == 2^256 - C62 */
         const int64_t C62 = 0x1000003D1LL;
         /* I62 == -P^-1 mod 2^62 */
    -    const int64_t I62 = 0x1838091DD2253531LL;
    +    const uint64_t I62 = 0x1838091DD2253531LL;
         const int64_t M62 = (int64_t)(UINT64_MAX >> 2);
         const int64_t d0 = d[0], d1 = d[1], d2 = d[2], d3 = d[3], d4 = d[4];
         const int64_t e0 = e[0], e1 = e[1], e2 = e[2], e3 = e[3], e4 = e[4];
    @@ -676,8 +676,8 @@ static void secp256k1_fe_update_de_62(int64_t *d, int64_t *e, const int64_t *t)
    
         /* Calculate the multiples of P to add, to zero the 62 bottom bits. We choose md, me
          * from the centred range [-2^61, 2^61) to keep d, e within [-2^256, 2^256). */
    -    md = (I62 * 4 * (int64_t)cd) >> 2;
    -    me = (I62 * 4 * (int64_t)ce) >> 2;
    +    md = ((int64_t)(I62 * 4 * (uint64_t)cd)) >> 2;
    +    me = ((int64_t)(I62 * 4 * (uint64_t)ce)) >> 2;
    
         cd -= (int128_t)C62 * md;
         ce -= (int128_t)C62 * me;
    diff --git a/src/scalar_4x64_impl.h b/src/scalar_4x64_impl.h
    index 5dfa742..b649928 100644
    --- a/src/scalar_4x64_impl.h
    +++ b/src/scalar_4x64_impl.h
    @@ -1113,7 +1113,7 @@ static uint64_t secp256k1_scalar_divsteps_62_var(uint64_t eta, uint64_t f0, uint
     static void secp256k1_scalar_update_de_62(int64_t *d, int64_t *e, const int64_t *t) {
    
         /* I62 == -P^-1 mod 2^62 */
    -    const int64_t I62 = 0x0B0DFF665588B13FLL;
    +    const uint64_t I62 = 0x0B0DFF665588B13FLL;
         const int64_t M62 = (int64_t)(UINT64_MAX >> 2);
         const int64_t P[5] = { 0x3FD25E8CD0364141LL, 0x2ABB739ABD2280EELL, -0x15LL, 0, 256 };
         const int64_t d0 = d[0], d1 = d[1], d2 = d[2], d3 = d[3], d4 = d[4];
    @@ -1127,8 +1127,8 @@ static void secp256k1_scalar_update_de_62(int64_t *d, int64_t *e, const int64_t
    
         /* Calculate the multiples of P to add, to zero the 62 bottom bits. We choose md, me
          * from the centred range [-2^61, 2^61) to keep d, e within [-2^256, 2^256). */
    -    md = (I62 * 4 * (int64_t)cd) >> 2;
    -    me = (I62 * 4 * (int64_t)ce) >> 2;
    +    md = ((int64_t)(I62 * 4 * (uint64_t)cd)) >> 2;
    +    me = ((int64_t)(I62 * 4 * (uint64_t)ce)) >> 2;
    
         cd += (int128_t)P[0] * md;
         ce += (int128_t)P[0] * me;
    diff --git a/src/scalar_8x32_impl.h b/src/scalar_8x32_impl.h
    index 40fcaf7..66c4494 100644
    --- a/src/scalar_8x32_impl.h
    +++ b/src/scalar_8x32_impl.h
    @@ -914,7 +914,7 @@ static uint32_t secp256k1_scalar_divsteps_30_var(uint32_t eta, uint32_t f0, uint
     static void secp256k1_scalar_update_de_30(int32_t *d, int32_t *e, const int32_t *t) {
    
         /* I30 == -P^-1 mod 2^30 */
    -    const int32_t I30 = 0x1588B13FL;
    +    const uint32_t I30 = 0x1588B13FL;
         const int32_t P[9] = { 0x10364141L, 0x3F497A33L, 0x348A03BBL, 0x2BB739ABL, -0x146L,
             0, 0, 0, 65536 };
         const int32_t M30 = (int32_t)(UINT32_MAX >> 2);
    @@ -930,8 +930,8 @@ static void secp256k1_scalar_update_de_30(int32_t *d, int32_t *e, const int32_t
    
         /* Calculate the multiples of P to add, to zero the 30 bottom bits. We choose md, me
          * from the centred range [-2^29, 2^29) to keep d, e within [-2^256, 2^256). */
    -    md = (I30 * 4 * (int32_t)cd) >> 2;
    -    me = (I30 * 4 * (int32_t)ce) >> 2;
    +    md = ((int32_t)(I30 * 4 * (uint32_t)cd)) >> 2;
    +    me = ((int32_t)(I30 * 4 * (uint32_t)ce)) >> 2;
    
         cd += (int64_t)P[0] * md;
         ce += (int64_t)P[0] * me;
    

    With that, all tests pass when compiled with -ftrapv or -fsanitize=undefined.
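
    For context on why the casts help: signed overflow is undefined behavior in C, while unsigned arithmetic wraps modulo 2^width, and only the low bits of these products are used anyway. A toy illustration of the pattern (not code from the PR; the cast back to int32_t and the right shift of a negative value rely on the two's complement behavior the library assumes):

        #include <stdint.h>
        #include <stdio.h>

        int main(void) {
            const uint32_t I30 = 0x12253531UL; /* same constant as in the diff above */
            int64_t cd = -123456789;           /* arbitrary example value */
            int32_t md;
            /* As uint32_t the multiplication wraps mod 2^32 (defined behavior);
             * done in int32_t it could overflow, which is UB and is exactly what
             * -ftrapv / -fsanitize=undefined flag. */
            md = ((int32_t)(I30 * 4 * (uint32_t)cd)) >> 2;
            printf("md = %ld\n", (long)md);
            return 0;
        }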

  3. sipa force-pushed on Oct 12, 2020
  4. sipa force-pushed on Oct 12, 2020
  5. sipa force-pushed on Oct 12, 2020
  6. sipa renamed this:
    Safegcd inverses, drop Jacobi symbols, remove libgmp dependency
    Safegcd inverses, drop Jacobi symbols, remove libgmp
    on Oct 12, 2020
  7. sipa marked this as a draft on Oct 12, 2020
  8. sipa commented at 7:38 am on October 12, 2020: contributor
    I want to rework this a bit still, introducing structs for the signed-limb representation and the transformation matrix for clarity, and adding comments to explain my own understanding. Marking as draft for now, but it can be built/benchmarked/tested already.
  9. sipa cross-referenced this on Oct 12, 2020 from issue WIP: "safegcd" field and scalar inversion by peterdettman
  10. in src/util.h:313 in 4ad49c8aab outdated
    256+#define __has_builtin(x) 0
    257+#endif
    258+
    259+static SECP256K1_INLINE int secp256k1_ctz32_var(uint32_t x) {
    260+#if (__has_builtin(__builtin_ctz) || SECP256K1_GNUC_PREREQ(3,4))
    261+    if (((unsigned)UINT32_MAX) == UINT32_MAX) {
    


    real-or-random commented at 9:04 am on October 13, 2020:

    It may be simpler and more correct to use __builtin_types_compatible_p here to check for equality of types, which existed before __builtin_ctz and friends.

    https://gcc.gnu.org/onlinedocs/gcc-3.1/gcc/Other-Builtins.html


    sipa commented at 5:29 am on October 14, 2020:
    In practice that would work, but it’s not exactly what you want: if unsigned happens to be larger than 32 bits, we still want this branch to be taken.

    real-or-random commented at 9:18 am on October 14, 2020:

    Oh! I understand now.

    My point was that this currently checks only one direction, namely if unsigned is wider or equal to uint32_t (and not the other way around). But I see that this is exactly what we want to check, at least if x is non-zero. Neat.

    Can you add a comment to this conditional? There should also be a comment that explains the semantics of the function (including that the result is undefined if x is 0, which is the case even for __builtin_ctz).

    (I know this is a draft PR, we can postpone all this to a more stable version.)


    sipa commented at 7:45 pm on December 23, 2020:
    Done.
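
    For readers following along, the resulting function with the requested comments might look roughly like this (a sketch based on this thread, not necessarily the exact code that landed; ctz32_debruijn refers to the de Bruijn fallback sketched near the top):

        static SECP256K1_INLINE int secp256k1_ctz32_var(uint32_t x) {
            VERIFY_CHECK(x != 0); /* as with __builtin_ctz, the result is undefined for x == 0 */
        #if (__has_builtin(__builtin_ctz) || SECP256K1_GNUC_PREREQ(3,4))
            /* __builtin_ctz takes an unsigned argument. If unsigned is at least as
             * wide as uint32_t, converting (nonzero) x is lossless and the builtin
             * gives the right answer; comparing (unsigned)UINT32_MAX against
             * UINT32_MAX tests exactly that. */
            if (((unsigned)UINT32_MAX) == UINT32_MAX) {
                return __builtin_ctz(x);
            }
        #endif
            /* Otherwise fall back to the (variable-time) software emulation. */
            return ctz32_debruijn(x);
        }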
  11. sipa force-pushed on Oct 14, 2020
  12. sipa commented at 5:31 am on October 14, 2020: contributor

    I added a commit that introduces shared modinv* files with the bulk of the algorithm, which is then used by both scalar and field code. I measure around a 1% slowdown for the variable-time versions, but a 2% speedup for the constant-time version with this (?!).

    I’d be interested to hear what benchmarks others see with/without the last commit.

  13. peterdettman commented at 6:35 am on October 14, 2020: contributor
    (only testing 64-bit) I see very little change apart from field_inverse getting noticeably slower (4%), which is presumably down to no longer having the nice prime shape baked into _update_de_62.
  14. in src/modinv32_impl.h:207 in d5ee5592aa outdated
    147+    /* Calculate the multiples of P to add, to zero the 30 bottom bits. We choose md, me
    148+     * from the centred range [-2^29, 2^29) to keep d, e within [-2^256, 2^256). */
    149+    md = ((int32_t)(modinfo->montmul4 * (uint32_t)cd)) >> 2;
    150+    me = ((int32_t)(modinfo->montmul4 * (uint32_t)ce)) >> 2;
    151+
    152+    if (modinfo->modulus.v[0]) {
    


    peterdettman commented at 6:37 am on October 14, 2020:
    Are these checks helpful? It would require deep inlining for the compiler to be able to constant-fold this away, and in that case it can remove the muladd just fine anyway.

    sipa commented at 7:50 am on October 14, 2020:

    I get a small but reproducible speedup by adding them (i7-7820HQ, 64 bit, gcc-9 -O2), for all 4 cases (_var and const, field and scalar).

    This is -O2, so there shouldn’t be any crazy inlining. The cost of the 5 conditionals could just be less than the 2 muladds that are saved in the scalar case.


    peterdettman commented at 7:53 am on October 14, 2020:
    Perhaps, but wouldn’t that rely on the branch predictor, and does the benchmark reflect this well?

    sipa commented at 4:53 pm on October 14, 2020:
    That’s a good question. Benchmarking with interleaved scalar and field inverses may be more representative.

    peterdettman commented at 1:15 am on October 15, 2020:
    Note also that the lowest limb of the modulus can’t ever be 0 (modulus has to be odd), and arguably if this implementation is for 256-bit moduli, then the highest limb couldn’t ever be 0 either.

    peterdettman commented at 4:38 pm on November 27, 2020:
    I think this point still stands; the if should be removed for modulus.v[0] and v[4] (resp. v[8]).

    sipa commented at 6:56 pm on November 28, 2020:
    Done.
  15. in src/modinv64_impl.h:20 in d5ee5592aa outdated
    15+static void secp256k1_modinv64_signed62_verify(const secp256k1_modinv64_signed62* a) {
    16+    /* a must be in the range [-2^256, 2^256). */
    17+    VERIFY_CHECK(a->v[0] >> 62 == 0);
    18+    VERIFY_CHECK(a->v[1] >> 62 == 0);
    19+    VERIFY_CHECK(a->v[2] >> 62 == 0);
    20+    VERIFY_CHECK(a->v[3] >> 62 == 0);
    


    peterdettman commented at 6:49 am on October 14, 2020:
    This isn’t true of the field prime constants using this type. I suggest that the restriction is one that applies to d, e, f, g with respect to the _update methods, but not for all instances of the type.

    sipa commented at 7:52 am on October 14, 2020:
    What is special about the field prime constants? The tests seem to pass.

    peterdettman commented at 7:54 am on October 14, 2020:
    They contain negative limbs (to maximise the number of zeros in their representation).
  16. sipa commented at 8:00 am on October 14, 2020: contributor
    @peterdettman Oh, I see - you’re saying that these aren’t invariants that hold for all signed62 values; of course. This _verify function is a bit of a leftover - it should just be inlined back into the _from_signedXX functions, as that’s where this strict condition is required. An internal _verify function may be helpful too, but would need to be less strict.
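
    To make the negative-limb point concrete, the field prime P = 2^256 - 0x1000003D1 can be written in signed62 form with a negative bottom limb (my rendering of the representation under discussion, with a made-up variable name, using the constant from the earlier diff):

        /* -0x1000003D1 + 0*2^62 + 0*2^124 + 0*2^186 + 256*2^248
         * == 2^256 - 0x1000003D1 == P.
         * The bottom limb is negative, so a check that every limb satisfies
         * v[i] >> 62 == 0 rejects this perfectly valid constant. */
        static const secp256k1_modinv64_signed62 fe_modulus = {
            {-0x1000003D1LL, 0, 0, 0, 256}
        };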
  17. sipa force-pushed on Nov 27, 2020
  18. sipa marked this as ready for review on Nov 27, 2020
  19. sipa commented at 5:19 am on November 27, 2020: contributor

    Rebased on top of the endomorphism changes, included the recent fixes from https://github.com/sipa/secp256k1/pull/6, squashed the implementation commits, removed the _verify functions that were misleading, and marked as ready for review.

    I’d like to make a few more changes, adding checks for the ranges of variables and additional comments, but if someone is interested I think this is pretty close to done otherwise.

  20. sipa commented at 9:40 pm on November 27, 2020: contributor

    @peterdettman I had to make these additional changes to satisfy ubsan (they were actually invoking UB, I believe):

    diff --git a/src/modinv32.h b/src/modinv32.h
    index 74fc3fd8..d46447b1 100644
    --- a/src/modinv32.h
    +++ b/src/modinv32.h
    @@ -22,7 +22,7 @@ typedef struct {
         secp256k1_modinv32_signed30 modulus;
    
         /* modulus^{-1} mod 2^30 */
    -    int32_t modulus_inv30;
    +    uint32_t modulus_inv30;
     } secp256k1_modinv32_modinfo;
    
     static void secp256k1_modinv32(secp256k1_modinv32_signed30 *x, const secp256k1_modinv32_modinfo *modinfo);
    diff --git a/src/modinv32_impl.h b/src/modinv32_impl.h
    index 2cd65557..a15bd2f9 100644
    --- a/src/modinv32_impl.h
    +++ b/src/modinv32_impl.h
    @@ -201,8 +201,8 @@ static void secp256k1_modinv32_update_de_30(secp256k1_modinv32_signed30 *d, secp
          * the range (-2.P, P), consistent with the input constraint.
          */
    
    -    md -= (modinfo->modulus_inv30 * (int32_t)cd + md) & M30;
    -    me -= (modinfo->modulus_inv30 * (int32_t)ce + me) & M30;
    +    md -= (modinfo->modulus_inv30 * (uint32_t)cd + md) & M30;
    +    me -= (modinfo->modulus_inv30 * (uint32_t)ce + me) & M30;
    
         /* The modulus has to be odd, so we can assume it is nonzero. */
         cd += (int64_t)modinfo->modulus.v[0] * md;
    @@ -380,8 +380,8 @@ static void secp256k1_modinv32_var(secp256k1_modinv32_signed30 *x, const secp256
             cond |= gn ^ (gn >> 31);
    
             if (cond == 0) {
    -            f.v[len - 2] |= fn << 30;
    -            g.v[len - 2] |= gn << 30;
    +            f.v[len - 2] |= (uint32_t)fn << 30;
    +            g.v[len - 2] |= (uint32_t)gn << 30;
                 --len;
             }
         }
    diff --git a/src/modinv64.h b/src/modinv64.h
    index caff6f5a..6fea9651 100644
    --- a/src/modinv64.h
    +++ b/src/modinv64.h
    @@ -26,7 +26,7 @@ typedef struct {
         secp256k1_modinv64_signed62 modulus;
    
         /* modulus^{-1} mod 2^62 */
    -    int64_t modulus_inv62;
    +    uint64_t modulus_inv62;
     } secp256k1_modinv64_modinfo;
    
     static void secp256k1_modinv64(secp256k1_modinv64_signed62 *x, const secp256k1_modinv64_modinfo *modinfo);
    diff --git a/src/modinv64_impl.h b/src/modinv64_impl.h
    index ebf1fe7f..7ce6c909 100644
    --- a/src/modinv64_impl.h
    +++ b/src/modinv64_impl.h
    @@ -177,8 +177,8 @@ static void secp256k1_modinv64_update_de_62(secp256k1_modinv64_signed62 *d, secp
          * the range (-2.P, P), consistent with the input constraint.
          */
    
    -    md -= (modinfo->modulus_inv62 * (int64_t)cd + md) & M62;
    -    me -= (modinfo->modulus_inv62 * (int64_t)ce + me) & M62;
    +    md -= (modinfo->modulus_inv62 * (uint64_t)cd + md) & M62;
    +    me -= (modinfo->modulus_inv62 * (uint64_t)ce + me) & M62;
    
         /* The modulus has to be odd, so we can assume it is nonzero. */
         cd += (int128_t)modinfo->modulus.v[0] * md;
    @@ -388,8 +388,8 @@ static void secp256k1_modinv64_var(secp256k1_modinv64_signed62 *x, const secp256
             cond |= gn ^ (gn >> 63);
    
             if (cond == 0) {
    -            f.v[len - 2] |= fn << 62;
    -            g.v[len - 2] |= gn << 62;
    +            f.v[len - 2] |= (uint64_t)fn << 62;
    +            g.v[len - 2] |= (uint64_t)gn << 62;
                 --len;
             }
         }
    
  21. sipa force-pushed on Nov 28, 2020
  22. sipa force-pushed on Nov 28, 2020
  23. sipa force-pushed on Nov 28, 2020
  24. sipa force-pushed on Nov 28, 2020
  25. sipa force-pushed on Nov 28, 2020
  26. sipa force-pushed on Nov 28, 2020
  27. sipa force-pushed on Nov 29, 2020
  28. sipa force-pushed on Nov 29, 2020
  29. in src/modinv64_impl.h:33 in b4721dc00a outdated
    28+    c += r3 + (modinfo->modulus.v[3] & cond_add);
    29+    r3 = c & M62; c >>= 62;
    30+    c += r4 + (modinfo->modulus.v[4] & cond_add);
    31+    r4 = c;
    32+
    33+    cond_add = (c >> 63) ^ cond_negate;
    


    peterdettman commented at 5:22 am on December 3, 2020:
    This is wrong if input r is 0 (or -P) and cond_negate is “true”, although I don’t think such a call can actually happen. I have created a PR to fix it, and add _var variants as well.

    sipa commented at 9:13 pm on December 3, 2020:
    I’ve squashed some of the changes from that commit in here. I can’t measure a performance improvement from the _var version (not on x86_64, arm64, or arm32), so haven’t included that.
  30. sipa force-pushed on Dec 3, 2020
  31. sipa commented at 1:32 am on December 4, 2020: contributor

    @peterdettman It seems that the bounds on (u,v,q,r) after N divsteps are:

    • u in [-2^(N-2),2^N]
    • v in [2-2^(N-2),2^N]
    • q in [-2^(N-1),2^N-1]
    • r in [1-2^(N-1),2^N-1]

    If we negate those coefficients, the bounds for 31/63 divsteps fit in a single int{32,64}_t transformation matrix. Is there any reason why 30/62 are preferable? For the 32-bit constant-time version this gains us one big step. For the variable-time version it may mean doing one group less in some cases.

    Some additional bounds that may help reasoning about ranges:

    • u+v in [-2^(N-2),2^N]
    • u-v in [-2^N,2^N]
    • q+r in [-2^(N-1),2^N]
    • q-r in [-2^(N-1),2^N-2]
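
    Bounds like these are easy to sanity-check empirically on small inputs. A standalone toy (not part of the PR) that applies the paper's divstep while tracking the transition matrix, then reports the observed extremes of (u,v,q,r) after N steps:

        #include <stdint.h>
        #include <stdio.h>

        /* One divstep, also updating the transition matrix (u,v,q,r), which starts as
         * the identity; after N steps, u*f0 + v*g0 == 2^N*f and q*f0 + r*g0 == 2^N*g. */
        static void divstep(int64_t *delta, int64_t *f, int64_t *g,
                            int64_t *u, int64_t *v, int64_t *q, int64_t *r) {
            if (*delta > 0 && (*g & 1)) {
                int64_t nf = *g, ng = (*g - *f) / 2; /* g-f is even, so division is exact */
                int64_t nu = 2 * *q, nv = 2 * *r, nq = *q - *u, nr = *r - *v;
                *delta = 1 - *delta;
                *f = nf; *g = ng; *u = nu; *v = nv; *q = nq; *r = nr;
            } else if (*g & 1) {
                int64_t nq = *q + *u, nr = *r + *v;
                *delta = 1 + *delta;
                *g = (*g + *f) / 2; *u *= 2; *v *= 2; *q = nq; *r = nr;
            } else {
                *delta = 1 + *delta;
                *g /= 2; *u *= 2; *v *= 2;
            }
        }

        int main(void) {
            const int N = 8; /* e.g. the claimed u bound for N=8 is [-2^6, 2^8] */
            int64_t umin = 0, umax = 0, vmin = 0, vmax = 0;
            int64_t qmin = 0, qmax = 0, rmin = 0, rmax = 0;
            int64_t f0, g0;
            for (f0 = 1; f0 < 512; f0 += 2) {      /* f must be odd */
                for (g0 = 0; g0 < 512; g0++) {
                    int64_t delta = 1, f = f0, g = g0, u = 1, v = 0, q = 0, r = 1;
                    int i;
                    for (i = 0; i < N; i++) divstep(&delta, &f, &g, &u, &v, &q, &r);
                    if (u < umin) umin = u; if (u > umax) umax = u;
                    if (v < vmin) vmin = v; if (v > vmax) vmax = v;
                    if (q < qmin) qmin = q; if (q > qmax) qmax = q;
                    if (r < rmin) rmin = r; if (r > rmax) rmax = r;
                }
            }
            printf("u in [%lld,%lld]  v in [%lld,%lld]\n", (long long)umin,
                   (long long)umax, (long long)vmin, (long long)vmax);
            printf("q in [%lld,%lld]  r in [%lld,%lld]\n", (long long)qmin,
                   (long long)qmax, (long long)rmin, (long long)rmax);
            return 0;
        }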
  32. peterdettman commented at 6:46 am on December 4, 2020: contributor
    @sipa I personally prefer to start u,v,q,r as -1,0,0,-1 for the nicer bound that it gives (in terms of 2’s complement bit length). The original code did this, but we have since put it back to the way the paper presents it. The original code was also attempting to use 31 divsteps in order to save one big step, but I think in several places, particularly _update_de, there are going to be overflow problems as the code stands. For myself, I’d prefer to bed down the current version and explore that option later, but if anyone can show 31/63-based code that’s faster and correct, I’m not opposed.
  33. sipa commented at 6:54 am on December 4, 2020: contributor

    @peterdettman Sounds good. The new normalize code also relies on having one slack bit.

    I’m currently working on adding more explicit bounds checking (mostly to convince myself, but I’ll try to also document it).

  34. sipa force-pushed on Dec 8, 2020
  35. sipa cross-referenced this on Dec 8, 2020 from issue Rework _normalize_30/62 methods by peterdettman
  36. sipa force-pushed on Dec 8, 2020
  37. sipa force-pushed on Dec 8, 2020
  38. sipa commented at 7:50 am on December 8, 2020: contributor

    @peterdettman I’ve added a commit that adds lots of range checks on the signed62 numbers (just the 64-bit version for now), plus additional comments to explain my understanding of why these bounds hold. I plan on adding more comments, but feel free to see if it’s correct so far.

    EDIT: ran 40 million iterations of the unit tests with this, no problem.

  39. sipa force-pushed on Dec 9, 2020
  40. sipa force-pushed on Dec 11, 2020
  41. in src/modinv64_impl.h:39 in cea64c0e8e outdated
    34+ *     else:
    35+ *         return 1 + delta, f, (g    ) // 2
    36+ *
    37+ * Note that only even numbers are ever divided by 2.
    38+ *
    39+ * Repeated application, starting with (1, a, b) where a is odd, will eventually reach a state where
    


    sanket1729 commented at 2:38 pm on December 11, 2020:

    Do you have any intuition for the delta parameter? Why is this set to 1? I read the paper and in the edd255 curve example, they set this to 1. In the general x-adic case, this is set to be the difference in degrees between the polynomials.

    My intuition says that maybe this should be the difference between the highest set bits of the two numbers. For example, if a is 256 bits and b is 254 bits, should we set delta to 2?


    sipa commented at 2:54 am on December 13, 2020:
    I’ve added something to explain it.
  42. in src/modinv64_impl.h:18 in cea64c0e8e outdated
    11@@ -12,33 +12,476 @@
    12 
    13 #include "util.h"
    14 
    15+/* This file implements modular inversion based on the paper "Fast constant-time gcd computation and
    16+ * modular inversion" by Daniel J. Bernstein and Bo-Yin Yang.
    17+ *
    18+ * Paper: https://gcd.cr.yp.to/papers.html. The references below are for the Date: 2019.04.13
    


    sanket1729 commented at 2:39 pm on December 11, 2020:
    link is broken for me.

    adamjonas commented at 0:45 am on December 12, 2020:
    The trailing period breaks it in the GitHub GUI, but the link works for me.

    sipa commented at 2:54 am on December 13, 2020:
    Fixed.
  43. in src/modinv64_impl.h:84 in cea64c0e8e outdated
    79+ *             delta, f, g, d, e = 1 + delta, f, (g    ) // 2, d, div2(modulus, e    )
    80+ *         assert f % modulus == (d * x) % modulus
    81+ *         assert g % modulus == (e * x) % modulus
    82+ *     assert f == 1 or f == -1
    83+ *     # As f = d*x (mod modulus), d/f = x^-1 (mod modulus). As |f|=1, d/f = d*f.
    84+ *     return d * f
    


    sanket1729 commented at 2:57 pm on December 11, 2020:
    Note that this method (and all the subsequent methods based on this) can return negative numbers too. But it would still return them in the range (-modulus, modulus).

    sipa commented at 2:54 am on December 13, 2020:
    Fixed, added ‘% modulus’ in a few places.
  44. in src/modinv64_impl.h:681 in cea64c0e8e outdated
    392+ *         if eta < 0:
    393+ *             eta, f, u, v, g, q, r = -eta, g, q, r, -f, -u, -v
    394+ *         limit = min(min(eta + 1, i), 4)
    395+ *         w = (g * NEGINV16[(f & 15) // 2]) % (2**limit)
    396+ *         g, q, r = g + w * f, q + w * u, r + w * v
    397+ *
    


    sanket1729 commented at 3:25 pm on December 11, 2020:
    missing return statement.

    sipa commented at 2:54 am on December 13, 2020:
    Fixed.
  45. sanket1729 commented at 3:44 pm on December 11, 2020: none

    I just looked at the comment explaining the algorithm, and with some aid from the paper I am able to (roughly) convince myself that it is correct and, if it terminates, will produce a gcd.

    I still don’t completely grasp why it should terminate, but I think that is outside the scope of this comment/PR. Your analysis with the convex hull method offers a nice empirical guarantee that it terminates for all inputs that we care about, and that is enough to convince me.

    Maybe I will try to understand the termination proof one day!

  46. sipa force-pushed on Dec 13, 2020
  47. sipa force-pushed on Dec 13, 2020
  48. sipa force-pushed on Dec 14, 2020
  49. in src/modinv64_impl.h:284 in 96cb1cba4e outdated
    276+ *     u, v, q, r = t
    277+ *     md, me = 0, 0
    278+ *     if d < 0:
    279+ *         md, me = md + u, me + q
    280+ *     if e < 0:
    281+ *         md, me = md + q, me + r
    


    sanket1729 commented at 11:52 am on December 14, 2020:
    I think this should be md + v instead of md + q.

    sipa commented at 6:08 am on December 16, 2020:
    Nice catch, fixed.
  50. in src/modinv64_impl.h:284 in 96cb1cba4e outdated
    281+ *         md, me = md + q, me + r
    282+ *     # Compute bottom N bits of cd and ce.
    283+ *     cd, ce = (u * d + v * e + md * modulus) % 2**N, (q * d + r * e + me*modulus) % 2**N
    284+ *     # Correct md and me such that the bottom bits of cd and ce would be cancelled out.
    285+ *     md -= ((modulus_inv2n * (cd % (2**N))) % (2**N))
    286+ *     me -= ((modulus_inv2n * (ce % (2**N))) % (2**N))
    


    sanket1729 commented at 12:40 pm on December 14, 2020:
    minor nit: unnecessary % operation by 2**N as it was done in the previous step.

    sipa commented at 6:04 am on December 16, 2020:
    Fixed.
  51. sanket1729 commented at 12:57 pm on December 14, 2020: none

    Reviewed the Avoiding modulus section comment section.

    I wonder how expensive it really is to compute f*d % modulus compared to the other operations in the algorithm. I agree that the current normalization is better than the modulus-based one, but the description calls that step “relatively expensive” when done generically.

    The only reason I ask is that it introduces some reasoning that is not part of the paper (at least I could not find it). That being said, I am convinced that the values of d are indeed in the range mentioned and the analysis is correct.

    Just curious whether our reliance on the extra analysis, and the (very slight) additional complexity compared to the paper, is worth it. Or maybe when writing low-level primitives like modinv we really should care about every small cost we can optimize.

  52. sipa force-pushed on Dec 15, 2020
  53. sipa force-pushed on Dec 15, 2020
  54. sipa force-pushed on Dec 16, 2020
  55. sipa force-pushed on Dec 16, 2020
  56. sipa commented at 6:07 am on December 16, 2020: contributor

    I think this is ready for review.

    I’ve significantly expanded the explanation of the algorithm and implementation (see big comment in modinv64_impl.h), including working Python code that demonstrates the full algorithm with all optimizations, except those that depend on number representation. @sanket1729 The implementation here differs significantly from the one in the paper in many ways:

    • The whole computation of d and e is done very differently, but the approach here is simpler and appears faster.
    • All the variable-time code (including the multi-step updates in the matrix computation)
    • Eta instead of delta
    • The specific bit fiddling logic in the constant-time matrix computation

    I’ve expanded the explanation a bit to highlight the differences.

  57. sipa force-pushed on Dec 16, 2020
  58. sipa force-pushed on Dec 16, 2020
  59. sipa force-pushed on Dec 17, 2020
  60. sipa force-pushed on Dec 18, 2020
  61. sipa force-pushed on Dec 18, 2020
  62. sipa force-pushed on Dec 18, 2020
  63. sipa force-pushed on Dec 18, 2020
  64. elichai cross-referenced this on Dec 22, 2020 from issue Detecting and linking against libgmp by elichai
  65. sipa force-pushed on Dec 23, 2020
  66. sipa commented at 4:53 am on December 23, 2020: contributor
    Added an extra commit that adds tests of the modinv functions themselves (as opposed to the existing tests that only exercise the field_inv and scalar_inv interface). Also added more bounds checking (it now deals with variable-length numbers correctly, and verifies f=+-1 and g=0).
  67. sipa force-pushed on Dec 23, 2020
  68. sipa force-pushed on Dec 23, 2020
  69. bitcoin-core deleted a comment on Dec 24, 2020
  70. sipa commented at 10:47 pm on December 25, 2020: contributor

    @peterdettman I added a commit that makes f and g variable-length in the constant-time code, using bounds derived using my convex hull approach. When the input x=0, this does throw away significant bits of f, but that doesn’t matter because g=0 all the time, and thus all divsteps will still do the same.

    On my Ryzen 2950X it seems like a slight speedup + less code. I’ll benchmark more.

  71. sipa force-pushed on Dec 26, 2020
  72. sipa commented at 7:54 pm on December 26, 2020: contributor
    It doesn’t seem to be a big win in any case and it critically relies on rather hastily written convex hull analysis code. I’ve removed it again, but we could pick it up after this is merged.
  73. sipa force-pushed on Dec 26, 2020
  74. sipa force-pushed on Dec 26, 2020
  75. peterdettman commented at 7:49 am on December 27, 2020: contributor
    @sipa I was going to say essentially that I would prefer no more changes. I think we have robust code now with solid proofs and relatively wide margins of error (as these things go) - and already some big wins for performance. It’s also still simple enough to keep the bar at a reasonable level for reviewers. All in all, it’s in a great position to focus on review and merge.
  76. sipa commented at 8:39 am on December 27, 2020: contributor
    @peterdettman Yeah, fair. Also, feel like having a look at the writeup I added to modinv64_impl.h? Perhaps I got some reasonings wrong, or things are missing.
  77. in src/modinv64_impl.h:262 in 1e30f79c38 outdated
    854+            limit = ((int)eta + 1) > i ? i : ((int)eta + 1);
    855+            /* m is a mask for the bottom min(limit, 6) bits. */
    856+            m = (UINT64_MAX >> (64 - limit)) & 63U;
    857+            /* Find what multiple of f must be added to g to cancel its bottom min(limit, 6)
    858+             * bits. */
    859+            w = (f * g * (f * f - 2)) & m;
    


    peterdettman commented at 3:40 am on December 28, 2020:
    It may be worth noting that these multiplications should only need a 32x32->32 multiply instruction (or 8x8->8 even!), though I haven’t checked whether compilers see that.

    sipa commented at 6:43 am on December 28, 2020:
    Looking at compiler output in GCC 10.2.0 -O2 x86_64, casting the multiplication result to uint8_t shuffles a few instructions around, but the net effect is replacing a 64x64->64 multiplication with a 32x32->32 multiplication. Presumably that’s faster. I haven’t benchmarked.

    sipa commented at 8:04 pm on December 28, 2020:
    Looking at https://www.agner.org/optimize/instruction_tables.ods, on most modern x86_64 systems the speed of a 2-operand IMUL instruction (the one that’s affected here) doesn’t depend on the size of the multiplicands. On some early x86_64 chips 64-bit multiplies were slower than other sizes.

    real-or-random commented at 11:10 pm on March 3, 2021:
    https://github.com/bitcoin-core/secp256k1/pull/831/commits/f8df286b7f826fd71e7732d7124d04d5814b44ee Do you have some reference for that formula? Same below for 4 bits.

    sipa commented at 10:41 pm on March 5, 2021:
    Added a reference.
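
    For anyone double-checking the formula: for odd f, f^2 == 1 (mod 8), so f is its own inverse mod 8, and a Newton step x -> x*(2 - f*x) doubles the number of correct bits, giving f*(2 - f*f) == f^-1 (mod 64); hence w = f*g*(f*f - 2) == -g*f^-1 (mod 64), which is exactly what makes g + w*f cancel at least 6 bottom bits. A standalone exhaustive check:

        #include <stdint.h>
        #include <assert.h>
        #include <stdio.h>

        int main(void) {
            uint64_t f, g;
            for (f = 1; f < 64; f += 2) {      /* all odd f mod 64 */
                for (g = 0; g < 64; g++) {
                    uint64_t w = (f * g * (f * f - 2)) & 63;
                    /* g + w*f must be 0 mod 64: the bottom 6 bits cancel. */
                    assert(((g + w * f) & 63) == 0);
                }
            }
            printf("f*g*(f*f-2) cancels 6 bits for every odd f\n");
            return 0;
        }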
  78. sipa force-pushed on Dec 30, 2020
  79. sipa force-pushed on Dec 30, 2020
  80. sipa force-pushed on Dec 30, 2020
  81. sipa force-pushed on Dec 30, 2020
  82. gmaxwell commented at 3:54 pm on December 31, 2020: contributor
  83. sipa commented at 1:05 am on January 1, 2021: contributor
    Added more unit tests, including specific test cases that trigger various extreme conditions.

    @gmaxwell has since also discovered that using a slightly different divstep operation (start with delta=0 instead of 1; branch based on delta>=0 rather than delta>0; in the first branch update delta=-delta instead of 1-delta) results in fewer steps on average, and convex bound analysis shows that this in fact reaches g=0 within 590 steps instead of 723 for 256-bit inputs. I’ve implemented that variant here: https://github.com/sipa/secp256k1/tree/202012_safegcd_prime

    As expected, it is measurably faster for constant-time operation (because it only needs 10 batches of 62 divsteps instead of 12), but surprisingly it is significantly slower for the variable-time version. My theory is that |delta| on average remains lower, and thus there are more transitions between g-halving and cancel-g-bits-out steps… and this somehow compensates for the reduced number of divsteps. Or I did something stupid. Comments welcome, but not going to include that in this PR for now.
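
    To get a feel for the step-count difference between the two rules, here is a toy counter on word-sized inputs (illustrative only: the real comparison needs 256-bit inputs and the convex-hull tooling, the modulus below is arbitrary, and the non-swap branches are assumed unchanged between the variants):

        #include <stdint.h>
        #include <stdio.h>

        /* Count divsteps until g == 0. variant 0: the paper's divstep (delta starts
         * at 1, swap when delta > 0, delta -> 1 - delta). variant 1: the modified
         * rule described above (delta starts at 0, swap when delta >= 0,
         * delta -> -delta). */
        static int steps_to_zero(int64_t f, int64_t g, int variant) {
            int64_t delta = variant ? 0 : 1;
            int n = 0;
            while (g != 0) {
                if ((g & 1) && (variant ? delta >= 0 : delta > 0)) {
                    int64_t t = f;
                    delta = variant ? -delta : 1 - delta;
                    f = g;
                    g = (g - t) / 2;
                } else if (g & 1) {
                    delta = 1 + delta;
                    g = (g + f) / 2;
                } else {
                    delta = 1 + delta;
                    g = g / 2;
                }
                n++;
            }
            return n;
        }

        int main(void) {
            int64_t f = 1000003; /* arbitrary odd "modulus" */
            int64_t g;
            long tot0 = 0, tot1 = 0;
            for (g = 1; g < 1000; g++) {
                tot0 += steps_to_zero(f, g, 0);
                tot1 += steps_to_zero(f, g, 1);
            }
            printf("avg steps: divstep %.2f, modified %.2f\n", tot0 / 999.0, tot1 / 999.0);
            return 0;
        }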
  84. gmaxwell commented at 5:08 am on January 1, 2021: contributor

    Broadwell-EP 2.4GHz:

        divstep   scalar_inverse: min 4.00us / avg 4.01us / max 4.01us
                  field_inverse:  min 3.92us / avg 3.92us / max 3.93us
        divstep’  scalar_inverse: min 3.36us / avg 3.37us / max 3.37us
                  field_inverse:  min 3.32us / avg 3.32us / max 3.32us

    15.9% speedup for scalar. 15.3% speedup for field.

    (I’d test 32-bit, but only 64-bit is implemented for now.)

  85. peterdettman commented at 11:12 am on January 1, 2021: contributor

    What wonderful news to start the year!

    It’s still worth sticking to ’eta’ I think. The only change needed in divsteps is:

    -        eta = (eta ^ c1) - (c1 + 1);
    +        eta = (eta ^ c1) - 1;
    

    (eta’ is now the one’s complement of delta’ in the new divstep’ algorithm). Also changed the big step loop bounds of course.

    64-bit (haswell):

    divstep   scalar_inverse: min 2.42us / avg 2.55us / max 2.96us
              field_inverse:  min 2.39us / avg 2.44us / max 2.55us
    divstep’  scalar_inverse: min 1.95us / avg 2.00us / max 2.16us
              field_inverse:  min 1.93us / avg 1.96us / max 2.06us

    32-bit (FWIW - 64-bit hardware):

    divstep   scalar_inverse: min 2.87us / avg 2.97us / max 3.20us
              field_inverse:  min 2.88us / avg 2.90us / max 2.99us
    divstep’  scalar_inverse: min 2.25us / avg 2.33us / max 2.55us
              field_inverse:  min 2.26us / avg 2.30us / max 2.45us

    It’s not surprising that the variable-time version slows down (~14% for me). Recall that it still performs the exact sequence of divsteps that the constant-time one does, just compressing some of them into a single loop iteration (and terminating the outer loop early). For the original algorithm, it almost always finished in 9 big steps anyway (I think it was <1% of inputs that needed 10) - if you look at the number of divsteps needed to send g to 0, I recall it being around 530-540 on average for random inputs.

    The new divstep reduces the worst case, but what was the effect on the average case? I expect not much: what the new algorithm does do, though, is usually give 1 less compressible divstep per ‘switch’, which amounts to the same thing you said: more transitions. The impact can probably be softened by not trying to get 6 (then 4), but there are fewer opportunities in any case.

    Sticking to the same divstep meant we could still rely on the safegcd termination proof, but with new proof tools available it’s feasible to consider them separately, and it’s natural to expect at least some divergence between what’s optimal for each case.

  86. gmaxwell commented at 4:05 pm on January 1, 2021: contributor

    The new divstep reduces the worst-case, but what was the effect on the average-case?

    On uniformly random inputs with the field modulus, the original divstep takes 531.054309 divsteps on average and the new one takes 516.852449, so 2.6% fewer. (I had thought the difference was bigger, because I’d just eyeballed it before and not measured it.) Still, I’m at least a little surprised by the big performance loss.

    Aside, Great new results, over 20% now. :)

    There are many other candidates I found which get fewer operations on average than the original (though not as few as divstep’) that still might be faster due to whatever reason divstep’ is slower. For example, take the original divstep, branch on <= 0, and adjust delta in the even case by 2 instead of 1: 527.84795 divsteps on average, with a provable bound of 768 iterations.

    It might be reasonable to just search the space of formulas based on average time then hull_bound them to prove convergence of whatever is fastest. Though almost all rules that differ in just delta/eta handling will still converge.

  87. sipa commented at 7:16 pm on January 1, 2021: contributor
    @peterdettman Awesome, that makes it pretty clean to use the old divsteps for vartime, and the new one for consttime. I’ve updated my branch.
  88. gmaxwell commented at 9:29 pm on January 1, 2021: contributor

    arm7 before:
        scalar_inverse: min 42.5us / avg 42.6us / max 42.7us
        field_inverse:  min 42.5us / avg 42.5us / max 42.5us
    arm7 after:
        scalar_inverse: min 33.9us / avg 33.9us / max 34.0us
        field_inverse:  min 33.8us / avg 33.8us / max 33.8us

    20.4% speedup for the divstep’ branch now on arm7.

  89. sipa force-pushed on Jan 7, 2021
  90. sipa force-pushed on Jan 7, 2021
  91. sipa force-pushed on Jan 7, 2021
  92. sipa force-pushed on Jan 8, 2021
  93. sipa force-pushed on Jan 8, 2021
  94. sipa force-pushed on Jan 8, 2021
  95. sipa force-pushed on Jan 8, 2021
  96. sipa force-pushed on Jan 8, 2021
  97. sipa force-pushed on Jan 8, 2021
  98. sipa commented at 0:41 am on January 12, 2021: contributor
    Added another optimization in https://github.com/sipa/secp256k1/tree/202012_safegcd_prime: do 10 iterations of 59 steps each (but still with a transformation matrix scaled by 2^62, so update_fg and update_de can be reused).
  99. gmaxwell commented at 3:55 am on January 12, 2021: contributor
    I gave a quick benchmark to that additional optimization (technically another version that lowered the loop maximum instead of changing the start) and got an additional 3.6% (scalar) 2.7% (field) speedup.
  100. sipa force-pushed on Jan 13, 2021
  101. sipa commented at 0:04 am on January 13, 2021: contributor
    Rebased after #862.
  102. in src/util.h:302 in b4e12d2679 outdated
    295+    static const uint8_t debruijn[32] = {
    296+        0x00, 0x01, 0x02, 0x18, 0x03, 0x13, 0x06, 0x19, 0x16, 0x04, 0x14, 0x0A,
    297+        0x10, 0x07, 0x0C, 0x1A, 0x1F, 0x17, 0x12, 0x05, 0x15, 0x09, 0x0F, 0x0B,
    298+        0x1E, 0x11, 0x08, 0x0E, 0x1D, 0x0D, 0x1C, 0x1B
    299+    };
    300+    return debruijn[((x & -x) * 0x04D7651F) >> 27];
    


    sanket1729 commented at 10:37 pm on January 13, 2021:

    Note that this De Bruijn table and the corresponding 64-bit one give secp256k1_ctzxx_var(0) = 0, whereas the builtins on my system give 32 and 64 respectively.

    I don’t know what is the expected behavior when x = 0, but the commit message says “If x is 0, ctz(x) is at least as large as the number of bits in its type” which is incorrect.


    sipa commented at 10:54 pm on January 13, 2021:
    Nice catch, will fix. The safegcd code never invokes it with x=0.

    sipa commented at 11:07 pm on January 13, 2021:
    Fixed.
  103. sipa force-pushed on Jan 13, 2021
  104. sipa commented at 0:03 am on January 14, 2021: contributor

    Markdown version of the explanation: https://gist.github.com/sipa/bddcf6434b25cf1ff03a277cd4927d73 (perhaps this should be included in the repo as a separate file instead of the comment in modinv64_impl.h?).

    It’s now part of this PR, see the doc/safegcd_implementation.md file. See https://github.com/sipa/secp256k1/blob/202010_pr767/doc/safegcd_implementation.md

  105. sipa force-pushed on Jan 14, 2021
  106. sipa force-pushed on Jan 14, 2021
  107. sipa force-pushed on Jan 14, 2021
  108. sipa force-pushed on Jan 14, 2021
  109. in src/tests.c:421 in 504a007709 outdated
    415@@ -416,6 +416,20 @@ void run_scratch_tests(void) {
    416     secp256k1_context_destroy(none);
    417 }
    418 
    419+void run_ctz_tests(void) {
    420+    static const uint32_t b32[5] = {1, 0xffffffff, 0x5e56968f, 0xe0d63129};
    421+    static const uint64_t b64[5] = {1, 0xffffffffffffffff, 0xbcd02462139b3fc3, 0x98b5f80c769693ef};
    


    real-or-random commented at 11:28 am on January 14, 2021:
    nit: Why 5? Maybe b32[4] or b32[], same for b64

    sipa commented at 6:57 pm on January 14, 2021:
    Fixed.
  110. real-or-random commented at 12:21 pm on January 14, 2021: contributor
    Here’s a nit that I noticed while skimming. I’m currently reading the markdown file. It’s really awesome.
  111. sipa force-pushed on Jan 14, 2021
  112. sipa force-pushed on Jan 14, 2021
  113. in doc/safegcd_implementation.md:280 in 06a1fd1fd2 outdated
    275+Let's look at bounds on the ranges of these numbers. It can be shown that *|u|+|v|* and *|q|+|r|*
    276+never exceed *2<sup>N</sup>* (see paragraph 8.3 in the paper), and thus a multiplication with *t* will have
    277+outputs whose absolute values are at most *2<sup>N</sup>* times the maximum absolute input value. In case the
    278+inputs *d* and *e* are in *(-M,M)*, which is certainly true for the initial values *d=0* and *e=1* assuming
    279+*M > 1*, the multiplication results in numbers in range *(-2<sup>N</sup>M,2<sup>N</sup>M)*. Subtracting up to *2<sup>N</sup>-1*
    280+times *M* to cancel out *N* bits brings that up to slightly less than *(-2<sup>N+1</sup>M,2<sup>N</sup>M)*, and
    


    real-or-random commented at 9:50 am on January 15, 2021:
    nit: “less than” was somewhat unclear to me.

    sipa commented at 8:10 pm on January 15, 2021:
    I’ve changed it to the rounding up front. Is it clearer now?
  114. in doc/safegcd_implementation.md:52 in 06a1fd1fd2 outdated
    47+  - (b) Replace *(f,g)* with *(f,g/2)* (where *g* is guaranteed to be even).
    48+- Neither of those two operations change the GCD:
    49+  - For (a), assume *gcd(f,g)=c*, then it must be the case that *f=a&thinsp;c* and *g=b&thinsp;c* for some integers *a*
    50+    and *b*. As *(g,g-f)=(b&thinsp;c,(b-a)c)* and *(f,f+g)=(a&thinsp;c,(a+b)c)*, the result clearly still has
    51+    common factor *c*. Reasoning in the other direction shows that no common factor can be added by
    52+    doing so either.
    


    real-or-random commented at 9:55 am on January 15, 2021:
    Am I right that this is like the subtraction-based variant of the Euclidean algorithm? Might help the intuition of the reader.

    sipa commented at 8:14 pm on January 15, 2021:
    I believe any linear transformation f' = u*f + v*g; g' = q*f + r*g that is (a) reversible and (b) has gcd(u,v)=1 and gcd(q,r)=1 (so it doesn’t introduce new factors) has this property of not affecting the overall gcd. Do you think there is another way of formulating this that is easier?

    sipa commented at 9:31 pm on January 15, 2021:

    Another formulation: if M is a square matrix with integer coefficients and its inverse exists and also has integer coefficients, then applying the matrix M to a vector of integers does not change the gcd of its elements. This is true because in both directions the multiplication at least preserves any common factor the input had.

    This holds for [[0,1],[-1,1]] (mapping (f,g) to (g,g-f)) and for [[1,0],[1,1]] (mapping (f,g) to (f,g+f)).


    sipa commented at 10:12 pm on January 15, 2021:
    Yet another: any square matrix M with integer coefficients and determinant 1 or -1 does not change gcds. This is easy to show in this direction, but I believe it also holds in the other direction (every matrix that doesn’t change gcds has integer coefficients and det 1 or -1).
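
    A tiny standalone check of that claim for the two transformations used here (exhaustive over small inputs; toy code, not from the PR):

        #include <stdint.h>
        #include <stdio.h>

        static int64_t gcd(int64_t a, int64_t b) {
            while (b != 0) { int64_t t = b; b = a % b; a = t; }
            return a < 0 ? -a : a;
        }

        int main(void) {
            int64_t f, g;
            for (f = 1; f < 100; f++) {
                for (g = 1; g < 100; g++) {
                    int64_t c = gcd(f, g);
                    /* [[0,1],[-1,1]] (determinant 1): (f,g) -> (g, g-f). */
                    if (gcd(g, g - f) != c) { printf("counterexample!\n"); return 1; }
                    /* [[1,0],[1,1]] (determinant 1): (f,g) -> (f, g+f). */
                    if (gcd(f, g + f) != c) { printf("counterexample!\n"); return 1; }
                }
            }
            printf("gcd preserved by both determinant-1 maps\n");
            return 0;
        }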
  115. in doc/safegcd_implementation.md:450 in 06a1fd1fd2 outdated
    445+```
    446+
    447+To use that trick, we need a helper mask variable *c1* that resolves the condition *&delta;>0* to *-1*
    448+(if true) or *0* (if false). We compute *c1* using right shifting, which always rounds down (in
    449+Python, and also in C under the assumption of a typical two's complement system; see
    450+`assumptions.h` for tests that this is the case).
    


    real-or-random commented at 10:29 am on January 15, 2021:

    TIL that -17 >> 1000 == -1 in python. That’s really neat, I never thought about the semantics of right shifting arbitrary-precision integers.

    I got confused by just saying “rounds down”. It might be clearer to say that right shifting by 63 bits is equivalent to dividing by 2^63 and taking floors.


    sipa commented at 8:14 pm on January 15, 2021:
    I’ve reformulated this.
  116. in doc/safegcd_implementation.md:14 in 06a1fd1fd2 outdated
     9+Most implementation aspects and optimizations are explained, except those that depend on the specific
    10+number representation used in the C code.
    11+
    12+## 1. Computing the Greatest Common Divisor (GCD) using divsteps
    13+
    14+The algorithm from the paper, at a very high level, is this:
    


    real-or-random commented at 10:38 am on January 15, 2021:
    Since the paper offers a lot of algorithms, it may be good to point the reader to Appendix E.

    sipa commented at 8:16 pm on January 15, 2021:
    Done; the right reference is chapter 11 actually. Appendix E gives a very different algorithm, which is then proven to be equivalent to safegcd.
  117. in doc/safegcd_implementation.md:553 in 06a1fd1fd2 outdated
    547+```
    548+
    549+Whenever *g* is even, the loop only shifts *g* down and decreases *&eta;*. When *g* ends in multiple zero
    550+bits, these iterations can be consolidated into one step. This requires counting the bottom zero
    551+bits efficiently, which is possible on most platforms; it is abstracted here as the function
    552+`count_trailing_zeros`.
    


    real-or-random commented at 10:47 am on January 15, 2021:
    Please add a possible definition of count_trailing_zeros. Then the reader can easier play around with it. This one seems nice to me: https://stackoverflow.com/a/63552117

    sipa commented at 8:16 pm on January 15, 2021:
    Done, nice.
  118. real-or-random commented at 10:47 am on January 15, 2021: contributor
    Let me say again that the Markdown file is very helpful!
  119. real-or-random commented at 1:39 pm on January 15, 2021: contributor

    This should update the implementation section in README.md.

    Tip for other reviewers: I’m trying to review this commit-by-commit, which is probably a good idea. But for this PR, it really makes sense to look at multiple commits at once. (If you review on Github, see https://stackoverflow.com/a/50417336.) I was looking at the second commit and wanted to ask for more comments at many places just to figure out later that the third commit adds all of these comments! :D (@sipa Not saying you should change this, it’s good to preserve authorship. :+1: )

  120. sipa force-pushed on Jan 15, 2021
  121. sipa commented at 8:24 pm on January 15, 2021: contributor
    @real-or-random I think I’ve addressed your comments.
  122. sipa force-pushed on Jan 15, 2021
  123. sipa force-pushed on Jan 17, 2021
  124. sipa force-pushed on Jan 17, 2021
  125. sipa force-pushed on Jan 17, 2021
  126. sipa force-pushed on Jan 17, 2021
  127. sipa force-pushed on Jan 18, 2021
  128. sipa force-pushed on Jan 18, 2021
  129. gmaxwell commented at 10:51 pm on January 18, 2021: contributor
    An observation for people’s idle amusement: almost (?) always after inverting we multiply the inverse with something. If the inverse is run with the initial e equal to the value we’d like to multiply the inverse by, we save that multiplication. Sounds kinda pointless, but var time is so fast this should be roughly a 2% speedup. Pieter pointed out to me, though, that a conversion to signed62 is needed, and potentially a normalization of the incoming number, which would eat into the speedup.
  130. gmaxwell commented at 3:14 am on January 19, 2021: contributor

    This PR (x86_64 zen2):
        scalar_inverse_var: min 1.34us / avg 1.35us / max 1.35us
        field_inverse_var:  min 1.32us / avg 1.33us / max 1.33us
    Make w uint32_t:
        scalar_inverse_var: min 1.33us / avg 1.33us / max 1.34us
        field_inverse_var:  min 1.31us / avg 1.31us / max 1.31us

    This PR (arm8):
        scalar_inverse_var: min 5.36us / avg 5.36us / max 5.37us
        field_inverse_var:  min 5.17us / avg 5.17us / max 5.17us
    Make w uint32_t:
        scalar_inverse_var: min 4.92us / avg 4.92us / max 4.92us
        field_inverse_var:  min 4.74us / avg 4.74us / max 4.74us

  131. sipa force-pushed on Jan 19, 2021
  132. sipa force-pushed on Jan 19, 2021
  133. sipa force-pushed on Jan 19, 2021
  134. sipa force-pushed on Jan 19, 2021
  135. in src/modinv64_impl.h:255 in dd9b381274 outdated
    250+            /* In this branch, use a simpler formula that only lets us cancel up to 4 bits of g, as
    251+             * eta tends to be smaller here. */
    252+            limit = ((int)eta + 1) > i ? i : ((int)eta + 1);
    253+            /* m is a mask for the bottom min(limit, 4) bits. */
    254+            m = (UINT64_MAX >> (64 - limit)) & 15U;
    255+            /* Find what multiple of f must be added to g to cancel its bottom min(limit, 6)
    


    peterdettman commented at 6:34 am on January 19, 2021:
    Typo: 6 -> 4.

    sipa commented at 7:15 am on January 19, 2021:
    Fixed.
  136. sipa force-pushed on Jan 19, 2021
  137. sipa commented at 7:16 am on January 19, 2021: contributor
    @gmaxwell Switched the w variable in divsteps_var from 64 to 32 bits in the 64-bit code, and from 32 to 16 bits in the 32-bit code. Sounds like that’s a small win on most systems, and harmless. I first tried making the mask computation also restricted in width, but that doesn’t work, as eta may exceed 32 resp. 16 bits.
  138. sipa commented at 4:52 am on January 20, 2021: contributor
    @peterdettman Is there a particular reason for representing eta as uint64_t everywhere, and casting to int64_t just when signed semantics are needed? I think everything works fine if it’s just always a signed int.
  139. in src/modinv64_impl.h:90 in a16192bdd0 outdated
    27+    int64_t cond_add;
    28+
    29+    /* In a first step, add the modulus if the input is negative, and then negate if requested.
    30+     * This brings r from range (-2*modulus,modulus) to range (-modulus,modulus). As all input
    31+     * limbs are in range (-2^62,2^62), this cannot overflow an int64_t. */
    32+    cond_add = r4 >> 63;
    


    real-or-random commented at 4:05 pm on January 20, 2021:
    nit: Maybe refer to assumptions.h here again for the semantics of the right shift.

    sipa commented at 0:39 am on January 21, 2021:
    Done.
  140. in src/modinv64_impl.h:101 in a16192bdd0 outdated
    37+    r4 += modinfo->modulus.v[4] & cond_add;
    38+    r0 = (r0 ^ cond_negate) - cond_negate;
    39+    r1 = (r1 ^ cond_negate) - cond_negate;
    40+    r2 = (r2 ^ cond_negate) - cond_negate;
    41+    r3 = (r3 ^ cond_negate) - cond_negate;
    42+    r4 = (r4 ^ cond_negate) - cond_negate;
    


    real-or-random commented at 4:08 pm on January 20, 2021:

    nit: the function is not self-contained because it’s not clear what cond_negate should be. Perhaps make that a normal boolean (int) and compute the mask here for clarity.

    On a second thought, this may be marginally slower. If that’s the case, maybe just explain and VERIFY_CHECK the allowed values of cond_negate.


    sipa commented at 0:39 am on January 21, 2021:
    Done. I’ve changed the _normalize function to take in a “sign” variable instead, and make it do the mask computation internally.
  141. in src/modinv64_impl.h:134 in a16192bdd0 outdated
    69+#ifdef VERIFY
    70+    VERIFY_CHECK(r->v[0] >> 62 == 0);
    71+    VERIFY_CHECK(r->v[1] >> 62 == 0);
    72+    VERIFY_CHECK(r->v[2] >> 62 == 0);
    73+    VERIFY_CHECK(r->v[3] >> 62 == 0);
    74+    VERIFY_CHECK(r->v[4] >>  8 == 0);
    


    real-or-random commented at 4:26 pm on January 20, 2021:
    I think this is because the modulus is at most 256 bits? Or should this be true anyway because secp256k1_modinv64_signed62 is supposed to hold only 256-bit integers? Maybe add a comment to say where this comes from.

    sipa commented at 0:38 am on January 21, 2021:
    I’ve moved this set of assertions from modinv to the caller’s (the from_signed{30,62} functions in field/scalar), because that’s where it matters (the conversion relies on the numbers being in range and normalized). Added some extra comments as well there.
  142. in src/modinv64_impl.h:239 in a16192bdd0 outdated
    184+        /* If eta is negative, negate it and replace f,g with g,-f. */
    185+        if ((int64_t)eta < 0) {
    186+            eta = -eta;
    187+            x = f; f = g; g = -x;
    188+            y = u; u = q; q = -y;
    189+            z = v; v = r; r = -z;
    


    real-or-random commented at 4:39 pm on January 20, 2021:

    nit: Maybe rename and unify x,y,z in a single tmp or swap.

    I’d be curious if the compiler really finds code that is faster than our branchless version.


    sipa commented at 11:44 pm on January 22, 2021:
    Done. Changed for a local tmp variable.
  143. in src/modinv64_impl.h:194 in a16192bdd0 outdated
    189+            z = v; v = r; r = -z;
    190+        }
    191+        /* eta is now >= 0. In what follows we're going to cancel out the bottom bits of g. No more
    192+         * than i can be cancelled out (as we'd be done before that point), and no more than eta+1
    193+         * can be done as its sign will flip once that happens. */
    194+        limit = ((int)eta + 1) > i ? i : ((int)eta + 1);
    


    real-or-random commented at 4:57 pm on January 20, 2021:
    Can you say something in general on the range of eta? eta should simply be bounded by the number of divsteps, so the casts and the +1 here are far from being problematic. Still, it may be more natural/easier to review to upcast i instead of downcasting eta in the comparisons. (Or simply make eta signed as you suggested, then upcasting i is done implicitly by the integer conversions and you don’t need an explicit cast to suppress warnings.)

    sipa commented at 0:37 am on January 21, 2021:
    Yeah, eta can in theory go up to +-(1+steps), though finding cases for which it exceeds 256 is very hard (the first 256 bits you can reason about directly from the input, but all further steps operate on g bits that are a complex function of the effect of the previous steps).

    sipa commented at 11:48 pm on January 22, 2021:
    I’ve added VERIFY_CHECK statements to check the range of eta.
  144. in src/modinv64_impl.h:196 in a16192bdd0 outdated
    191+        /* eta is now >= 0. In what follows we're going to cancel out the bottom bits of g. No more
    192+         * than i can be cancelled out (as we'd be done before that point), and no more than eta+1
    193+         * can be done as its sign will flip once that happens. */
    194+        limit = ((int)eta + 1) > i ? i : ((int)eta + 1);
    195+        /* m is a mask for the bottom min(limit, 8) bits (our table only supports 8 bits). */
    196+        m = (UINT64_MAX >> (64 - limit)) & 255U;
    


    real-or-random commented at 5:07 pm on January 20, 2021:
    n.b. limit > 0 so 64 - limit < 64.

    sipa commented at 11:48 pm on January 22, 2021:
    Added a VERIFY_CHECK for the range of limit.
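    (Worked values for that mask, given the checked range 1 <= limit: limit = 1 gives UINT64_MAX >> 63 = 0x1, so m = 0x01; limit = 8 gives UINT64_MAX >> 56 = 0xFF, so m = 0xFF; and limit = 20 gives UINT64_MAX >> 44 = 0xFFFFF, which the & 255U caps back down to m = 0xFF, matching the 8-bit table.)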
  145. in README.md:41 in 876b5d06ea outdated
    33@@ -34,11 +34,11 @@ Implementation details
    34   * Optimized implementation of arithmetic modulo the curve's field size (2^256 - 0x1000003D1).
    35     * Using 5 52-bit limbs (including hand-optimized assembly for x86_64, by Diederik Huys).
    36     * Using 10 26-bit limbs (including hand-optimized assembly for 32-bit ARM, by Wladimir J. van der Laan).
    37-  * Field inverses and square roots using a sliding window over blocks of 1s (by Peter Dettman).
    38 * Scalar operations
    39   * Optimized implementation without data-dependent branches of arithmetic modulo the curve's order.
    40     * Using 4 64-bit limbs (relying on __int128 support in the compiler).
    41     * Using 8 32-bit limbs.
    42+* Modular inverses (both field elements and scalars) based on [safegcd](https://gcd.cr.yp.to/index.html) with some modifications, and a variable-time variant.
    


    real-or-random commented at 5:11 pm on January 20, 2021:
    Should still mention Peter. :)

    sipa commented at 0:38 am on January 21, 2021:
    Done.
  146. in src/modinv64_impl.h:337 in 62a4c35999 outdated
    319@@ -320,22 +320,28 @@ static void secp256k1_modinv64_update_de_62(secp256k1_modinv64_signed62 *d, secp
    320     /* Compute limb 1 of t*[d,e]+modulus*[md,me], and store it as output limb 0 (= down shift). */
    321     cd += (int128_t)u * d1 + (int128_t)v * e1;
    322     ce += (int128_t)q * d1 + (int128_t)r * e1;
    323-    cd += (int128_t)modinfo->modulus.v[1] * md;
    324-    ce += (int128_t)modinfo->modulus.v[1] * me;
    325+    if (modinfo->modulus.v[1]) { /* Optimize for the case where limb of modulus is zero. */
    


    real-or-random commented at 5:17 pm on January 20, 2021:
    Do we know if this is evaluated at compile-time? Maybe that’s the reason why this is only sometimes faster.

    sipa commented at 5:55 pm on January 20, 2021:

    It’s not in general (it is when modinv* is only invoked once in a binary, in which case it’s inlined in its entirety).

    I believe that the cost of the conditional jump here outweighs the cost of 2 wide multiplications.

    It is hard to be sure here, though, as our benchmarks may be priming the branch predictor in a way that isn’t representative of what happens in a real-world load.


    real-or-random commented at 11:07 pm on March 3, 2021:

    A while ago I was reading up on defenses against Spectre, and since it is based on branch mispredictions, one of the simplest defenses is of course to avoid branches in contexts where we process secret data, even if the branch is on public data. [1] I think we have a few “interesting” branches in our code base, where I could see how we compute on secret data if the wrong branch is predicted. This branch is not obviously of that kind, but it may be a good rule of thumb to avoid branches when in doubt.

    This is more of a note, I’m not saying that this branch must be removed. I should provide more evidence, and it’s on my list to open an issue about this and point to more interesting cases in our code.

    [1] See also https://fc21.ifca.ai/papers/18.pdf but this is different because they do automatic rewriting.

  147. sipa force-pushed on Jan 21, 2021
  148. peterdettman commented at 5:29 am on January 21, 2021: contributor

    @peterdettman Is there a particular reason for representing eta as uint64_t everywhere, and casting to int64_t just when signed semantics are needed? I think everything works fine if it’s just always a signed int.

    Historically the divsteps methods used unsigned types for everything because the project had no policy on signed arithmetic right shift. At this point quite a few variables in divsteps would more naturally be signed (which they are in the algorithm itself).

  149. sipa force-pushed on Jan 21, 2021
  150. sipa commented at 6:02 am on January 21, 2021: contributor
    @peterdettman Ok, switched eta everywhere to a signed type. All tests pass (including ubsan).
  151. sipa force-pushed on Jan 22, 2021
  152. sipa force-pushed on Jan 22, 2021
  153. sipa force-pushed on Jan 24, 2021
  154. sipa commented at 3:26 am on January 24, 2021: contributor
    Added a commit that removes scalar_sqr now that that’s entirely unused outside of tests as well.
  155. sipa force-pushed on Jan 25, 2021
  156. sipa commented at 7:45 pm on January 25, 2021: contributor
    Rebased now that #878 is merged.
  157. real-or-random cross-referenced this on Jan 27, 2021 from issue Use blinded instead of constant time inverse in secret gej->ge? by gmaxwell
  158. sanket1729 commented at 7:43 pm on January 28, 2021: none
    nit: commit 8a61a85ba3296dd3f12eb2d2e9f35f43689dfa6b does not build.
  159. sipa force-pushed on Jan 30, 2021
  160. sipa force-pushed on Jan 30, 2021
  161. sipa force-pushed on Jan 30, 2021
  162. sipa commented at 2:57 am on January 30, 2021: contributor
    Rebased on top of #864 (and then on master), made the “Remove unused scalar_sqr” commit compile again, and removed a stale libgmp leftover in build-aux/m4/bitcoin_secp.m4.
  163. sipa force-pushed on Jan 30, 2021
  164. sipa force-pushed on Jan 30, 2021
  165. mratsim commented at 11:29 pm on February 11, 2021: none

    Bernstein has a new release of his code which includes secp256k1, and it seems there was a large improvement at the beginning of January: https://gcd.cr.yp.to/software.html


  166. gmaxwell commented at 11:30 pm on February 11, 2021: contributor

    @mratsim yes, that update is significantly a result of our contributions. https://twitter.com/hashbreaker/status/1348247397843968000 Also discussed in prior comments: #831 (comment)

    That code uses AVX2 to concurrently update F/G/D/E all at once. It’s a great optimization, plus a lot of really impressive instruction scheduling. Unfortunately it’s also x86_64 AVX2-only code in hand-coded ASM. It’s perhaps 2x faster than this code. But this code is infinitely faster on non-x86/32-bit/non-AVX2 hardware. :p

  167. in src/modinv32_impl.h:44 in bf8c6e7f2f outdated
    39+
    40+/* Compare af with b*factor. */
    41+static int secp256k1_modinv32_mul_cmp_30(const secp256k1_modinv32_signed30 *a, const secp256k1_modinv32_signed30 *b, int32_t factor) {
    42+    int i;
    43+    secp256k1_modinv32_signed30 am, bm;
    44+    secp256k1_modinv32_mul_30(&am, a, 1);
    


    sanket1729 commented at 3:09 pm on February 22, 2021:
    In bf8c6e7f2f309f34e53f10718ee55cda1b5db130: why do we have this function call for multiplying by 1? It seems that function is just cloning a into am when factor = 1, unless we are unsure whether a may have limbs exceeding 2^30.

    sipa commented at 8:18 pm on March 1, 2021:
    Yes, a is not necessarily normalized. I’ll add a comment.
  168. in src/modinv32_impl.h:32 in bf8c6e7f2f outdated
    27+static void secp256k1_modinv32_mul_30(secp256k1_modinv32_signed30 *r, const secp256k1_modinv32_signed30 *a, int32_t factor) {
    28+    const int32_t M30 = (int32_t)(UINT32_MAX >> 2);
    29+    int64_t c = 0;
    30+    int i;
    31+    for (i = 0; i < 8; ++i) {
    32+        c += (factor == 1) ? (int64_t)a->v[i] : (int64_t)a->v[i] * factor;
    


    sanket1729 commented at 3:12 pm on February 22, 2021:
    In bf8c6e7f2f309f34e53f10718ee55cda1b5db130: if the following comment about factor = 1 is correct, we can remove all checks for factor == 1.

    sipa commented at 8:19 pm on March 1, 2021:
    I’ll just drop this, it’s an optimization for test-only code.
  169. in src/modinv32_impl.h:40 in bf8c6e7f2f outdated
    35+    c += (factor == 1) ? (int64_t)a->v[8] : (int64_t)a->v[8] * factor;
    36+    VERIFY_CHECK(c == (int32_t)c);
    37+    r->v[8] = (int32_t)c;
    38+}
    39+
    40+/* Compare af with b*factor. */
    


    sanket1729 commented at 9:30 am on March 1, 2021:
    In bf8c6e7f2f309f34e53f10718ee55cda1b5db130 typo: a with b*factor

    sipa commented at 8:38 pm on March 1, 2021:
    Fixed.
  170. in src/modinv32_impl.h:220 in bf8c6e7f2f outdated
    208@@ -149,12 +209,19 @@ static int32_t secp256k1_modinv32_divsteps_30(int32_t eta, uint32_t f0, uint32_t
    209         g >>= 1;
    210         u <<= 1;
    211         v <<= 1;
    212+        /* Bounds on eta that follow from the bounds on iteration count (max 25*30 divsteps). */
    213+        VERIFY_CHECK(eta >= -751 && eta <= 751);
    


    sanket1729 commented at 9:36 am on March 1, 2021:
    In bf8c6e7f2f309f34e53f10718ee55cda1b5db130: this seems too loose. Can we say (eta >= -725 && eta <= 724)? If the point of this is just a sanity check and to facilitate review, maybe it’s not worth thinking about the exact numbers here.

    sipa commented at 8:45 pm on March 1, 2021:
    Don’t forget that we perform divsteps in batches of 62/30, so you can’t assume we stop immediately when g=0 is reached. The ongoing batch will still be completed, and during that, eta and u/v/q/r will still change.

    sanket1729 commented at 9:53 pm on March 1, 2021:
    Yes, you are right. It is apparent in the case where we set g = 0 directly.
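    (One way to see the 751 bound: eta starts at -1 and each divstep changes it by at most 1 in absolute value, so after a full run of at most 25*30 = 750 divsteps, |eta| <= 1 + 750 = 751, which is exactly what the VERIFY_CHECK asserts.)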
  171. in src/modinv32_impl.h:283 in bf8c6e7f2f outdated
    270@@ -204,6 +271,8 @@ static int32_t secp256k1_modinv32_divsteps_30_var(int32_t eta, uint32_t f0, uint
    271         VERIFY_CHECK((g & 1) == 1);
    272         VERIFY_CHECK((u * f0 + v * g0) == f << (30 - i));
    273         VERIFY_CHECK((q * f0 + r * g0) == g << (30 - i));
    274+        /* Bounds on eta that follow from the bounds on iteration count (max 12*62 divsteps). */
    275+        VERIFY_CHECK(eta >= -751 && eta <= 751);
    


    sanket1729 commented at 9:37 am on March 1, 2021:
    In bf8c6e7f2f309f34e53f10718ee55cda1b5db130: should be 25*30 instead of 12*62.

    sipa commented at 8:36 pm on March 1, 2021:
    Fixed.
  172. sanket1729 approved
  173. sanket1729 commented at 10:02 am on March 1, 2021: none

    Reviewed all the commits that deal with implementing/optimizing safegcd. Left some minor nits/typos.

    Did not review the commits that move or remove code. Those are better reviewed by someone with more experience with the codebase.

  174. sipa force-pushed on Mar 1, 2021
  175. sanket1729 approved
  176. sanket1729 commented at 9:58 pm on March 1, 2021: none
    reACK 1d2b20197660b7cd2053a0b0b7b40d0ee60beba3
  177. in src/util.h:326 in a772bfba53 outdated
    321+        0, 1, 2, 53, 3, 7, 54, 27, 4, 38, 41, 8, 34, 55, 48, 28,
    322+        62, 5, 39, 46, 44, 42, 22, 9, 24, 35, 59, 56, 49, 18, 29, 11,
    323+        63, 52, 6, 26, 37, 40, 33, 47, 61, 45, 43, 21, 23, 58, 17, 10,
    324+        51, 25, 36, 32, 60, 20, 57, 16, 50, 31, 19, 15, 30, 14, 13, 12
    325+    };
    326+    return debruijn[((x & -x) * 0x022FDD63CC95386D) >> 58];
    


    real-or-random commented at 9:15 pm on March 3, 2021:

    a772bfba5380e96036650fb52005a8b9f58aefe3

    I’m concerned about the ability to test these fallback implementations, because all tests we typically run locally and in CI won’t even use these. Could you move these implementations into separate functions such that we can always test them independently?


    sipa commented at 3:46 am on March 5, 2021:
    Done.
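    (A sketch of how the now-exposed fallback can be exercised unconditionally, e.g. against a naive reference; ctz32_naive is a hypothetical helper, not code from the PR:)

        /* Naive reference: count trailing zero bits of a non-zero x. */
        static int ctz32_naive(uint32_t x) {
            int n = 0;
            while ((x & 1) == 0) { x >>= 1; ++n; }
            return n;
        }
        /* Then, for every non-zero test input x:
         *     CHECK(secp256k1_ctz32_var_debruijn(x) == ctz32_naive(x)); */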
  178. in src/modinv64_impl.h:166 in faff81b3ae outdated
    90+ *
    91+ * Implements the divsteps_n_matrix function from the explanation.
    92+ */
    93+static int64_t secp256k1_modinv64_divsteps_62(int64_t eta, uint64_t f0, uint64_t g0, secp256k1_modinv64_trans2x2 *t) {
    94+    uint64_t u = 1, v = 0, q = 0, r = 1; /* start with identity matrix */
    95+    uint64_t c1, c2, f = f0, g = g0, x, y, z;
    


    real-or-random commented at 9:19 pm on March 3, 2021:

    real-or-random commented at 11:35 pm on March 4, 2021:
    Ah, I guess it's to avoid signed overflow with <<?

    sipa commented at 3:46 am on March 5, 2021:
    Added a comment.
  179. in src/modinv64_impl.h:245 in faff81b3ae outdated
    240+    VERIFY_CHECK(((int64_t)ce & M62) == 0); ce >>= 62;
    241+    /* Compute limb 1 of t*[d,e]+modulus*[md,me], and store it as output limb 0 (= down shift). */
    242+    cd += (int128_t)u * d1 + (int128_t)v * e1;
    243+    ce += (int128_t)q * d1 + (int128_t)r * e1;
    244+    cd += (int128_t)modinfo->modulus.v[1] * md;
    245+    ce += (int128_t)modinfo->modulus.v[1] * me;
    


    real-or-random commented at 10:14 pm on March 3, 2021:

    https://github.com/bitcoin-core/secp256k1/pull/831/commits/c6de80260d758c2deed2b6a2b74665a050643fb4

    Maybe this helps others. I think cd and ce don’t overflow in the last line here because

    • after 239: cd is at most 128 - 62 bits = 66 bits
    • after 242: u * d1 is at most 63 + 62 = 125 bits, same for the other summand, so cd will end up being at most 125 + 1 + 1 = 127 bits [1]
    • after 245: modulus.v[1] * md is at most 62 + 64 = 126 bits, so the sum cd is at most 128 bits

    [1] Not sure if u is in fact only 62 bits instead of 63?


    sipa commented at 4:24 am on March 5, 2021:

    If you reason at a slightly higher level, and accept/verify that cd and ce act as accumulators for a big integer computation (values scaled by 1 get added to it, one limb is extracted/shifted, values scaled by 2^62 get added to it, another limb is extracted/shifted, …) you can look at the sum of all those scaled values - and we know that the value of those is d and e itself, which are in range (-modulus,2*modulus).

    That’s not in itself a guarantee about the intermediate ranges during the computation, but if no overflows happen there, it implies that after the last addition the cd/ce values are in range floor(-modulus/2^(3*62))..floor(2*modulus/2^(3*62)), or just 71 bits (with a 256-bit modulus).
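    (A quick arithmetic check of that final bound, assuming a 256-bit modulus: 2*modulus < 2^257, and 3*62 = 186 bits have been shifted out by then, so floor(2*modulus/2^(3*62)) < 2^71 and floor(-modulus/2^(3*62)) >= -2^70, i.e. roughly 71 bits as stated.)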

  180. in src/modinv64_impl.h:539 in faff81b3ae outdated
    362+        /* Update d,e using that transition matrix. */
    363+        secp256k1_modinv64_update_de_62(&d, &e, &t, modinfo);
    364+        /* Update f,g using that transition matrix. */
    365+        secp256k1_modinv64_update_fg_62(&f, &g, &t);
    366+        /* If the bottom limb of g is zero, there is a chance that g=0. */
    367+        if (g.v[0] == 0) {
    


    real-or-random commented at 10:36 pm on March 3, 2021:

    https://github.com/bitcoin-core/secp256k1/pull/831/commits/c6de80260d758c2deed2b6a2b74665a050643fb4

    This is correct, but I’m curious whether you have benchmarked whether it’s worth having this branch or if this is just an educated guess. We may also want to add an EXPECT here.

    and nit: You could declare cond here.


    sipa commented at 3:11 am on March 5, 2021:
    I did benchmark it, and it was a small but statistically significant improvement for me. I’m also perfectly fine with just dropping this commit if it’s not considered worth the complexity.

    real-or-random commented at 11:05 am on March 5, 2021:
    No, I think it’s fine, I was really just curious. You may still want to add the EXPECT then.

    sipa commented at 10:07 pm on March 5, 2021:
    I tried a few alternatives (looping and breaking at the first nonzero, looping like the current code but without branch on the 1st limb, same code with EXPECT(…,0), same code with EXPECT(…,1)), but they’re all measurably slower (some less than 1%, which may be the result of random binary alignments, but it’s still consistently the case).
  181. in src/modinv64_impl.h:521 in faff81b3ae outdated
    353+    int i, j;
    354+    int64_t eta = -1;
    355+    int64_t cond;
    356+
    357+    /* Do up to 12 iterations of 62 divsteps each = 744 divsteps, or until g=0 (whichever comes first). */
    358+    for (i = 0; i < 12; ++i) {
    


    real-or-random commented at 10:46 pm on March 3, 2021:

    https://github.com/bitcoin-core/secp256k1/pull/831/commits/c6de80260d758c2deed2b6a2b74665a050643fb4

    Is there a reason to keep the iteration bound? I see that it may help the compiler prove termination, whatever this is good for.


    real-or-random commented at 11:18 pm on March 3, 2021:
    Ah I see this is removed later.

    sipa commented at 3:14 am on March 5, 2021:

    It’s not removed?

    I prefer keeping it because it means that at least in tests we’d hit an error in the VERIFY_CHECK that follows rather than going into a possibly infinite loop. I guess an argument could be made about not doing that in non-test builds (it could be a while (true) loop that breaks when g=0, with a VERIFY_CHECK(i++ < 12); in it for example).


    real-or-random commented at 11:04 am on March 5, 2021:

    Oh yes, I was looking at the wrong loop.

    I guess an argument could be made about not doing that in non-test builds (it could be a while (true) loop that breaks when g=0, with a VERIFY_CHECK(i++ < 12); in it for example).

    Yeah, I think this will be a little bit more conservative, and it also expresses the algorithm in a more natural way.


    sipa commented at 10:12 pm on March 5, 2021:
    Done.
  182. in src/tests.c:658 in 24f17be8f3 outdated
    819@@ -814,8 +820,437 @@ void run_num_smalltests(void) {
    820 }
    821 #endif
    822 
    823+/***** MODINV TESTS *****/
    824+
    825+/* compute out = (a*b) mod m; if b=NULL, treat b=1. */
    826+void mulmod256(uint16_t* out, const uint16_t* a, const uint16_t* b, const uint16_t* m) {
    


    real-or-random commented at 9:16 pm on March 4, 2021:

    https://github.com/bitcoin-core/secp256k1/pull/831/commits/24f17be8f301d91ebd84caf7cf611e105a70a0e2

    This was a little bit hard to read at first without specifying the lengths of the argument arrays.

    0void mulmod256(uint16_t out[32], const uint16_t a[16], const uint16_t b[16], const uint16_t m[16]) {
    

    or simply add a comment. (The array notation may not be our style.)


    sipa commented at 3:43 am on March 5, 2021:
    I’ve added a comment.
  183. in src/tests.c:923 in 24f17be8f3 outdated
    918+    for (i = 0; i < 256; ++i) {
    919+        out[i >> 4] |= (((in->v[i / 30]) >> (i % 30)) & 1) << (i & 15);
    920+    }
    921+}
    922+
    923+/* Randomly mutate the sign of limbs in signed62 representation. */
    



    sipa commented at 3:43 am on March 5, 2021:
    Done.
  184. in src/tests.c:927 in 24f17be8f3 outdated
    922+
    923+/* Randomly mutate the sign of limbs in signed62 representation. */
    924+void mutate_sign_signed30(secp256k1_modinv32_signed30* x) {
    925+    int i;
    926+    for (i = 0; i < 16; ++i) {
    927+        int pos = secp256k1_testrand_int(7);
    


    real-or-random commented at 10:00 pm on March 4, 2021:

    https://github.com/bitcoin-core/secp256k1/pull/831/commits/24f17be8f301d91ebd84caf7cf611e105a70a0e2

    0        int pos = secp256k1_testrand_int(8);
    

    testrand(n) returns an integer in [0..n-1]


    sipa commented at 3:43 am on March 5, 2021:
    Done.
  185. in src/tests.c:1004 in 24f17be8f3 outdated
     999+/* Randomly mutate the sign of limbs in signed62 representation. */
    1000+void mutate_sign_signed62(secp256k1_modinv64_signed62* x) {
    1001+    static const int64_t M62 = (int64_t)(UINT64_MAX >> 2);
    1002+    int i;
    1003+    for (i = 0; i < 8; ++i) {
    1004+        int pos = secp256k1_testrand_int(3);
    


    real-or-random commented at 10:39 pm on March 4, 2021:

    sipa commented at 3:43 am on March 5, 2021:
    Done.
  186. in src/tests.c:1051 in 24f17be8f3 outdated
    1228+            secp256k1_testrand256_test((unsigned char*)md);
    1229+            md[0] |= 1; /* modulus must be odd */
    1230+            /* If modulus is 1, find another one. */
    1231+            ok = md[0] != 1;
    1232+            for (j = 1; j < 16; ++j) ok |= md[j] != 0;
    1233+            mulmod256(xd, xd, NULL, md); /* Make xd = xd mod m32 */
    


    real-or-random commented at 11:22 pm on March 4, 2021:

    sipa commented at 3:43 am on March 5, 2021:
    Done.
  187. sipa force-pushed on Mar 5, 2021
  188. sipa commented at 3:40 am on March 5, 2021: contributor

    Pushed the following diff (+ rebase) to address @real-or-random’s comments:

      0diff --git a/configure.ac b/configure.ac
      1index 7f118160..e84005ed 100644
      2diff --git a/src/modinv32_impl.h b/src/modinv32_impl.h
      3index 956cde20..e30c0ff9 100644
      4--- a/src/modinv32_impl.h
      5+++ b/src/modinv32_impl.h
      6@@ -179,7 +179,13 @@ typedef struct {
      7  * Implements the divsteps_n_matrix function from the explanation.
      8  */
      9 static int32_t secp256k1_modinv32_divsteps_30(int32_t eta, uint32_t f0, uint32_t g0, secp256k1_modinv32_trans2x2 *t) {
     10-    uint32_t u = 1, v = 0, q = 0, r = 1; /* start with the identity matrix */
     11+    /* u,v,q,r are the elements of the transformation matrix being built up,
     12+     * starting with the identity matrix. Semantically they are signed integers
     13+     * in range [-2^30,2^30], but here represented as unsigned mod 2^32. This
     14+     * permits left shifting (which is UB for negative numbers). The range
     15+     * being inside [-2^31,2^31) means that casting to signed works correctly.
     16+     */
     17+    uint32_t u = 1, v = 0, q = 0, r = 1;
     18     uint32_t c1, c2, f = f0, g = g0, x, y, z;
     19     int i;
     20 
     21@@ -252,7 +258,8 @@ static int32_t secp256k1_modinv32_divsteps_30_var(int32_t eta, uint32_t f0, uint
     22         0xEF, 0xC5, 0xA3, 0x39, 0xB7, 0xCD, 0xAB, 0x01
     23     };
     24 
     25-    uint32_t u = 1, v = 0, q = 0, r = 1; /* Start with identity matrix */
     26+    /* Transformation matrix; see comments in secp256k1_modinv32_divsteps_30. */
     27+    uint32_t u = 1, v = 0, q = 0, r = 1;
     28     uint32_t f = f0, g = g0, m;
     29     uint16_t w;
     30     int i = 30, limit, zeros;
     31diff --git a/src/modinv64_impl.h b/src/modinv64_impl.h
     32index 66f06ff9..b0106311 100644
     33--- a/src/modinv64_impl.h
     34+++ b/src/modinv64_impl.h
     35@@ -156,7 +156,13 @@ typedef struct {
     36  * Implements the divsteps_n_matrix function from the explanation.
     37  */
     38 static int64_t secp256k1_modinv64_divsteps_62(int64_t eta, uint64_t f0, uint64_t g0, secp256k1_modinv64_trans2x2 *t) {
     39-    uint64_t u = 1, v = 0, q = 0, r = 1; /* start with identity matrix */
     40+    /* u,v,q,r are the elements of the transformation matrix being built up,
     41+     * starting with the identity matrix. Semantically they are signed integers
     42+     * in range [-2^62,2^62], but here represented as unsigned mod 2^64. This
     43+     * permits left shifting (which is UB for negative numbers). The range
     44+     * being inside [-2^63,2^63) means that casting to signed works correctly.
     45+     */
     46+    uint64_t u = 1, v = 0, q = 0, r = 1;
     47     uint64_t c1, c2, f = f0, g = g0, x, y, z;
     48     int i;
     49 
     50@@ -214,7 +220,8 @@ static int64_t secp256k1_modinv64_divsteps_62(int64_t eta, uint64_t f0, uint64_t
     51  * Implements the divsteps_n_matrix_var function from the explanation.
     52  */
     53 static int64_t secp256k1_modinv64_divsteps_62_var(int64_t eta, uint64_t f0, uint64_t g0, secp256k1_modinv64_trans2x2 *t) {
     54-    uint64_t u = 1, v = 0, q = 0, r = 1; /* Start with identity matrix */
     55+    /* Transformation matrix; see comments in secp256k1_modinv64_divsteps_62. */
     56+    uint64_t u = 1, v = 0, q = 0, r = 1;
     57     uint64_t f = f0, g = g0, m;
     58     uint32_t w;
     59     int i = 62, limit, zeros;
     60diff --git a/src/tests.c b/src/tests.c
     61index bb77326c..ac020e3c 100644
     62--- a/src/tests.c
     63+++ b/src/tests.c
     64@@ -429,11 +429,13 @@ void run_ctz_tests(void) {
     65     unsigned i;
     66     for (i = 0; i < sizeof(b32) / sizeof(b32[0]); ++i) {
     67         for (shift = 0; shift < 32; ++shift) {
     68+            CHECK(secp256k1_ctz32_var_debruijn(b32[i] << shift) == shift);
     69             CHECK(secp256k1_ctz32_var(b32[i] << shift) == shift);
     70         }
     71     }
     72     for (i = 0; i < sizeof(b64) / sizeof(b64[0]); ++i) {
     73         for (shift = 0; shift < 64; ++shift) {
     74+            CHECK(secp256k1_ctz64_var_debruijn(b64[i] << shift) == shift);
     75             CHECK(secp256k1_ctz64_var(b64[i] << shift) == shift);
     76         }
     77     }
     78@@ -636,7 +638,10 @@ void run_rand_int(void) {
     79 
     80 /***** MODINV TESTS *****/
     81 
     82-/* compute out = (a*b) mod m; if b=NULL, treat b=1. */
     83+/* compute out = (a*b) mod m; if b=NULL, treat b=1.
     84+ *
     85+ * Out is a 512-bit number (represented as 32 uint16_t's in LE order). The other
     86+ * arguments are 256-bit numbers (represented as 16 uint16_t's in LE order). */
     87 void mulmod256(uint16_t* out, const uint16_t* a, const uint16_t* b, const uint16_t* m) {
     88     uint16_t mul[32];
     89     uint64_t c = 0;
     90@@ -689,16 +694,18 @@ void mulmod256(uint16_t* out, const uint16_t* a, const uint16_t* b, const uint16
     91         int64_t cs;
     92 
     93         /* Compute mul2 = mul - m<<i. */
     94-        cs = 0;
     95-        for (j = (i >> 4); j < 32; ++j) { /* skip limbs before i/16 */
     96+        cs = 0; /* accumulator */
     97+        for (j = 0; j < 32; ++j) { /* j loops over the output limbs in mul2. */
     98+            /* Compute sub: the 16 bits in m that will be subtracted from mul2[j]. */
     99             uint16_t sub = 0;
    100             int p;
    101-            for (p = 0; p < 16; ++p) {
    102-                int bitpos = j * 16 - i + p;
    103+            for (p = 0; p < 16; ++p) { /* p loops over the bit positions in mul2[j]. */
    104+                int bitpos = j * 16 - i + p; /* bitpos is the corresponding bit position in m. */
    105                 if (bitpos >= 0 && bitpos < 256) {
    106                     sub |= ((m[bitpos >> 4] >> (bitpos & 15)) & 1) << p;
    107                 }
    108             }
    109+            /* Add mul[j]-sub to accumulator, and shift bottom 16 bits out to mul2[j]. */
    110             cs += mul[j];
    111             cs -= sub;
    112             mul2[j] = (cs & 0xFFFF);
    113@@ -706,10 +713,10 @@ void mulmod256(uint16_t* out, const uint16_t* a, const uint16_t* b, const uint16
    114         }
    115         /* If remainder of subtraction is 0, set mul = mul2. */
    116         if (cs == 0) {
    117-            memcpy(mul + (i >> 4), mul2 + (i >> 4), sizeof(mul) - 2 * (i >> 4));
    118+            memcpy(mul, mul2, sizeof(mul));
    119         }
    120     }
    121-    /* Test that all limbs higher than m's highest are zero */
    122+    /* Sanity check: test that all limbs higher than m's highest are zero */
    123     for (i = (m_bitlen >> 4) + 1; i < 32; ++i) {
    124         CHECK(mul[i] == 0);
    125     }
    126@@ -734,11 +741,11 @@ void signed30_to_uint16(uint16_t* out, const secp256k1_modinv32_signed30* in) {
    127     }
    128 }
    129 
    130-/* Randomly mutate the sign of limbs in signed62 representation. */
    131+/* Randomly mutate the sign of limbs in signed30 representation, without changing the value. */
    132 void mutate_sign_signed30(secp256k1_modinv32_signed30* x) {
    133     int i;
    134     for (i = 0; i < 16; ++i) {
    135-        int pos = secp256k1_testrand_int(7);
    136+        int pos = secp256k1_testrand_int(8);
    137         if (x->v[pos] > 0 && x->v[pos + 1] <= 0x3fffffff) {
    138             x->v[pos] -= 0x40000000;
    139             x->v[pos + 1] += 1;
    140@@ -763,10 +770,9 @@ void test_modinv32_uint16(uint16_t* out, const uint16_t* in, const uint16_t* mod
    141 
    142     /* compute 1/modulus mod 2^30 */
    143     m.modulus_inv30 = m.modulus.v[0];
    144-    m.modulus_inv30 *= (2 - m.modulus_inv30 * m.modulus.v[0]);
    145-    m.modulus_inv30 *= (2 - m.modulus_inv30 * m.modulus.v[0]);
    146-    m.modulus_inv30 *= (2 - m.modulus_inv30 * m.modulus.v[0]);
    147-    m.modulus_inv30 *= (2 - m.modulus_inv30 * m.modulus.v[0]);
    148+    for (i = 0; i < 4; ++i) {
    149+        m.modulus_inv30 *= (2 - m.modulus_inv30 * m.modulus.v[0]);
    150+    }
    151     m.modulus_inv30 &= 0x3fffffff;
    152     CHECK(((m.modulus_inv30 * m.modulus.v[0]) & 0x3fffffff) == 1);
    153 
    154@@ -810,12 +816,12 @@ void signed62_to_uint16(uint16_t* out, const secp256k1_modinv64_signed62* in) {
    155     }
    156 }
    157 
    158-/* Randomly mutate the sign of limbs in signed62 representation. */
    159+/* Randomly mutate the sign of limbs in signed62 representation, without changing the value. */
    160 void mutate_sign_signed62(secp256k1_modinv64_signed62* x) {
    161     static const int64_t M62 = (int64_t)(UINT64_MAX >> 2);
    162     int i;
    163     for (i = 0; i < 8; ++i) {
    164-        int pos = secp256k1_testrand_int(3);
    165+        int pos = secp256k1_testrand_int(4);
    166         if (x->v[pos] > 0 && x->v[pos + 1] <= M62) {
    167             x->v[pos] -= (M62 + 1);
    168             x->v[pos + 1] += 1;
    169@@ -841,11 +847,9 @@ void test_modinv64_uint16(uint16_t* out, const uint16_t* in, const uint16_t* mod
    170 
    171     /* compute 1/modulus mod 2^62 */
    172     m.modulus_inv62 = m.modulus.v[0];
    173-    m.modulus_inv62 *= (2 - m.modulus_inv62 * m.modulus.v[0]);
    174-    m.modulus_inv62 *= (2 - m.modulus_inv62 * m.modulus.v[0]);
    175-    m.modulus_inv62 *= (2 - m.modulus_inv62 * m.modulus.v[0]);
    176-    m.modulus_inv62 *= (2 - m.modulus_inv62 * m.modulus.v[0]);
    177-    m.modulus_inv62 *= (2 - m.modulus_inv62 * m.modulus.v[0]);
    178+    for (i = 0; i < 5; ++i) {
    179+        m.modulus_inv62 *= (2 - m.modulus_inv62 * m.modulus.v[0]);
    180+    }
    181     m.modulus_inv62 &= M62;
    182     CHECK(((m.modulus_inv62 * m.modulus.v[0]) & M62) == 1);
    183 
    184@@ -1044,7 +1048,7 @@ void run_modinv_tests(void) {
    185             /* If modulus is 1, find another one. */
    186             ok = md[0] != 1;
    187             for (j = 1; j < 16; ++j) ok |= md[j] != 0;
    188-            mulmod256(xd, xd, NULL, md); /* Make xd = xd mod m32 */
    189+            mulmod256(xd, xd, NULL, md); /* Make xd = xd mod md */
    190         } while (!(ok && coprime(xd, md)));
    191 
    192         test_modinv32_uint16(id, xd, md);
    193diff --git a/src/util.h b/src/util.h
    194index a0b6f318..f7884683 100644
    195--- a/src/util.h
    196+++ b/src/util.h
    197@@ -280,6 +280,31 @@ SECP256K1_GNUC_EXT typedef __int128 int128_t;
    198 #define __has_builtin(x) 0
    199 #endif
    200 
    201+/* Determine the number of trailing zero bits in a (non-zero) 32-bit x.
    202+ * This function is only intended to be used as fallback for
    203+ * secp256k1_ctz32_var, but permits it to be tested separately. */
    204+static SECP256K1_INLINE int secp256k1_ctz32_var_debruijn(uint32_t x) {
    205+    static const uint8_t debruijn[32] = {
    206+        0x00, 0x01, 0x02, 0x18, 0x03, 0x13, 0x06, 0x19, 0x16, 0x04, 0x14, 0x0A,
    207+        0x10, 0x07, 0x0C, 0x1A, 0x1F, 0x17, 0x12, 0x05, 0x15, 0x09, 0x0F, 0x0B,
    208+        0x1E, 0x11, 0x08, 0x0E, 0x1D, 0x0D, 0x1C, 0x1B
    209+    };
    210+    return debruijn[((x & -x) * 0x04D7651F) >> 27];
    211+}
    212+
    213+/* Determine the number of trailing zero bits in a (non-zero) 64-bit x.
    214+ * This function is only intended to be used as fallback for
    215+ * secp256k1_ctz64_var, but permits it to be tested separately. */
    216+static SECP256K1_INLINE int secp256k1_ctz64_var_debruijn(uint64_t x) {
    217+    static const uint8_t debruijn[64] = {
    218+        0, 1, 2, 53, 3, 7, 54, 27, 4, 38, 41, 8, 34, 55, 48, 28,
    219+        62, 5, 39, 46, 44, 42, 22, 9, 24, 35, 59, 56, 49, 18, 29, 11,
    220+        63, 52, 6, 26, 37, 40, 33, 47, 61, 45, 43, 21, 23, 58, 17, 10,
    221+        51, 25, 36, 32, 60, 20, 57, 16, 50, 31, 19, 15, 30, 14, 13, 12
    222+    };
    223+    return debruijn[((x & -x) * 0x022FDD63CC95386D) >> 58];
    224+}
    225+
    226 /* Determine the number of trailing zero bits in a (non-zero) 32-bit x. */
    227 static SECP256K1_INLINE int secp256k1_ctz32_var(uint32_t x) {
    228     VERIFY_CHECK(x != 0);
    229@@ -294,12 +319,7 @@ static SECP256K1_INLINE int secp256k1_ctz32_var(uint32_t x) {
    230     return __builtin_ctzl(x);
    231 #else
    232     /* If no suitable CTZ builtin is available, use a (variable time) software emulation. */
    233-    static const uint8_t debruijn[32] = {
    234-        0x00, 0x01, 0x02, 0x18, 0x03, 0x13, 0x06, 0x19, 0x16, 0x04, 0x14, 0x0A,
    235-        0x10, 0x07, 0x0C, 0x1A, 0x1F, 0x17, 0x12, 0x05, 0x15, 0x09, 0x0F, 0x0B,
    236-        0x1E, 0x11, 0x08, 0x0E, 0x1D, 0x0D, 0x1C, 0x1B
    237-    };
    238-    return debruijn[((x & -x) * 0x04D7651F) >> 27];
    239+    return secp256k1_ctz32_var_debruijn(x);
    240 #endif
    241 }
    242 
    243@@ -317,13 +337,7 @@ static SECP256K1_INLINE int secp256k1_ctz64_var(uint64_t x) {
    244     return __builtin_ctzll(x);
    245 #else
    246     /* If no suitable CTZ builtin is available, use a (variable time) software emulation. */
    247-    static const uint8_t debruijn[64] = {
    248-        0, 1, 2, 53, 3, 7, 54, 27, 4, 38, 41, 8, 34, 55, 48, 28,
    249-        62, 5, 39, 46, 44, 42, 22, 9, 24, 35, 59, 56, 49, 18, 29, 11,
    250-        63, 52, 6, 26, 37, 40, 33, 47, 61, 45, 43, 21, 23, 58, 17, 10,
    251-        51, 25, 36, 32, 60, 20, 57, 16, 50, 31, 19, 15, 30, 14, 13, 12
    252-    };
    253-    return debruijn[((x & -x) * 0x022FDD63CC95386D) >> 58];
    254+    return secp256k1_ctz64_var_debruijn(x);
    255 #endif
    256 }
    
  189. sipa force-pushed on Mar 5, 2021
  190. real-or-random commented at 11:12 am on March 5, 2021: contributor

    Changes look good. :) In general, the PR is very clean. I believe I’m doing a pretty deep review, but all I have is some nits.

    I still need to go through commits “Improve bounds checks in modinv modules”..“Improve field/scalar inverse tests”.

    f8df286 Do you have some reference for that formula? Same below for 4 bits.

    You marked this resolved but I don’t see a reply or a change. I think what I’m actually asking for is some comment that says that “this computes the inverse of …”. Of course, a reference for the trick itself would still be neat.

    I guess this is the trick as used in the tests? It may be more self-documenting to give it a name (macro?) and reuse it. (I can’t tell if that’s appropriate, feel free to dismiss).

    (Posting here in main thread because Github threading is fucked up.)

  191. sipa force-pushed on Mar 5, 2021
  192. sipa commented at 10:11 pm on March 5, 2021: contributor

    Pushed a few more changes:

      0diff --git a/doc/safegcd_implementation.md b/doc/safegcd_implementation.md
      1index 1a059912..8346d22e 100644
      2--- a/doc/safegcd_implementation.md
      3+++ b/doc/safegcd_implementation.md
      4@@ -616,7 +616,9 @@ while True:
      5 
      6 By using a bigger table more bits can be cancelled at once. The table can also be implemented
      7-as a formula:
      8+as a formula. Several formulas are known for computing modular inverses modulo powers of two;
      9+some can be found in Hacker's Delight second edition by Henry S. Warren, Jr. pages 245-247.
     10+Here we need the negated modular inverse, which is a simple transformation of those:
     11 
     12 - Instead of a 3-bit table:
     13   - *-f* or *f ^ 6*
     14diff --git a/src/modinv32_impl.h b/src/modinv32_impl.h
     15index e30c0ff9..3eba04fc 100644
     16--- a/src/modinv32_impl.h
     17+++ b/src/modinv32_impl.h
     18@@ -511,12 +511,15 @@ static void secp256k1_modinv32_var(secp256k1_modinv32_signed30 *x, const secp256
     19     secp256k1_modinv32_signed30 e = {{1, 0, 0, 0, 0, 0, 0, 0, 0}};
     20     secp256k1_modinv32_signed30 f = modinfo->modulus;
     21     secp256k1_modinv32_signed30 g = *x;
     22-    int i, j, len = 9;
     23+#ifdef VERIFY
     24+    int i = 0;
     25+#endif
     26+    int j, len = 9;
     27     int32_t eta = -1;
     28     int32_t cond, fn, gn;
     29 
     30-    /* Do up to 25 iterations of 30 divsteps each, or until g=0 (whichever comes first). */
     31-    for (i = 0; i < 25; ++i) {
     32+    /* Do iterations of 30 divsteps each until g=0. */
     33+    while (1) {
     34         /* Compute transition matrix and new eta after 30 divsteps. */
     35         secp256k1_modinv32_trans2x2 t;
     36         eta = secp256k1_modinv32_divsteps_30_var(eta, f.v[0], g.v[0], &t);
     37@@ -554,6 +557,7 @@ static void secp256k1_modinv32_var(secp256k1_modinv32_signed30 *x, const secp256
     38             --len;
     39         }
     40 #ifdef VERIFY
     41+        VERIFY_CHECK(++i < 25); /* We should never need more than 25*30 = 750 divsteps */
     42         VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&f, len, &modinfo->modulus, -1) > 0); /* f > -modulus */
     43         VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&f, len, &modinfo->modulus, 1) <= 0); /* f <= modulus */
     44         VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(&g, len, &modinfo->modulus, -1) > 0); /* g > -modulus */
     45diff --git a/src/modinv64_impl.h b/src/modinv64_impl.h
     46index b0106311..e1ba8b63 100644
     47--- a/src/modinv64_impl.h
     48+++ b/src/modinv64_impl.h
     49@@ -513,12 +513,15 @@ static void secp256k1_modinv64_var(secp256k1_modinv64_signed62 *x, const secp256
     50     secp256k1_modinv64_signed62 e = {{1, 0, 0, 0, 0}};
     51     secp256k1_modinv64_signed62 f = modinfo->modulus;
     52     secp256k1_modinv64_signed62 g = *x;
     53-    int i, j, len = 5;
     54+#ifdef VERIFY
     55+    int i = 0;
     56+#endif
     57+    int j, len = 5;
     58     int64_t eta = -1;
     59     int64_t cond, fn, gn;
     60 
     61-    /* Do up to 12 iterations of 62 divsteps each, or until g=0 (whichever comes first). */
     62-    for (i = 0; i < 12; ++i) {
     63+    /* Do iterations of 62 divsteps each until g=0. */
     64+    while (1) {
     65         /* Compute transition matrix and new eta after 62 divsteps. */
     66         secp256k1_modinv64_trans2x2 t;
     67         eta = secp256k1_modinv64_divsteps_62_var(eta, f.v[0], g.v[0], &t);
     68@@ -556,6 +559,7 @@ static void secp256k1_modinv64_var(secp256k1_modinv64_signed62 *x, const secp256
     69             --len;
     70         }
     71 #ifdef VERIFY
     72+        VERIFY_CHECK(++i < 12); /* We should never need more than 12*62 = 744 divsteps */
     73         VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&f, len, &modinfo->modulus, -1) > 0); /* f > -modulus */
     74         VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&f, len, &modinfo->modulus, 1) <= 0); /* f <= modulus */
     75         VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(&g, len, &modinfo->modulus, -1) > 0); /* g > -modulus */
     76diff --git a/src/tests.c b/src/tests.c
     77index ac020e3c..ba645bbe 100644
     78--- a/src/tests.c
     79+++ b/src/tests.c
     80@@ -638,6 +638,19 @@ void run_rand_int(void) {
     81 
     82 /***** MODINV TESTS *****/
     83 
     84+/* Compute the modular inverse of (odd) x mod 2^64. */
     85+uint64_t modinv2p64(uint64_t x) {
     86+    /* If w = 1/x mod 2^(2^L), then w*(2 - w*x) = 1/x mod 2^(2^(L+1)). See
     87+     * Hacker's Delight second edition, Henry S. Warren, Jr., pages 245-247 for
     88+     * why. Start with L=0, for which it is true for every odd x that
     89+     * 1/x=1 mod 2. Iterating 6 times gives us 1/x mod 2^64. */
     90+    int l;
     91+    uint64_t w = 1;
     92+    CHECK(x & 1);
     93+    for (l = 0; l < 6; ++l) w *= (2 - w*x);
     94+    return w;
     95+}
     96+
     97 /* compute out = (a*b) mod m; if b=NULL, treat b=1.
     98  *
     99  * Out is a 512-bit number (represented as 32 uint16_t's in LE order). The other
    100@@ -769,11 +782,7 @@ void test_modinv32_uint16(uint16_t* out, const uint16_t* in, const uint16_t* mod
    101     mutate_sign_signed30(&m.modulus);
    102 
    103     /* compute 1/modulus mod 2^30 */
    104-    m.modulus_inv30 = m.modulus.v[0];
    105-    for (i = 0; i < 4; ++i) {
    106-        m.modulus_inv30 *= (2 - m.modulus_inv30 * m.modulus.v[0]);
    107-    }
    108-    m.modulus_inv30 &= 0x3fffffff;
    109+    m.modulus_inv30 = modinv2p64(m.modulus.v[0]) & 0x3fffffff;
    110     CHECK(((m.modulus_inv30 * m.modulus.v[0]) & 0x3fffffff) == 1);
    111 
    112     for (vartime = 0; vartime < 2; ++vartime) {
    113@@ -846,11 +855,7 @@ void test_modinv64_uint16(uint16_t* out, const uint16_t* in, const uint16_t* mod
    114     mutate_sign_signed62(&m.modulus);
    115 
    116     /* compute 1/modulus mod 2^62 */
    117-    m.modulus_inv62 = m.modulus.v[0];
    118-    for (i = 0; i < 5; ++i) {
    119-        m.modulus_inv62 *= (2 - m.modulus_inv62 * m.modulus.v[0]);
    120-    }
    121-    m.modulus_inv62 &= M62;
    122+    m.modulus_inv62 = modinv2p64(m.modulus.v[0]) & M62;
    123     CHECK(((m.modulus_inv62 * m.modulus.v[0]) & M62) == 1);
    124 
    125     for (vartime = 0; vartime < 2; ++vartime) {
    
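    (A usage sketch for the modinv2p64 helper added in the diff above; since unsigned multiplication wraps mod 2^64, the check is exact. The input value is arbitrary; any odd one works:)

        uint64_t x = 0x1000003D1ULL; /* any odd value */
        uint64_t w = modinv2p64(x);
        CHECK(w * x == 1); /* the product wraps mod 2^64 */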
  193. sipa commented at 10:20 pm on March 5, 2021: contributor

    @real-or-random I think I’ve addressed all your comments, including adding references for the modular inverse mod a power of two formula (for some reason I can’t comment where you actually mentioned it; I didn’t mark it resolved…).

    Regarding the conditionals on the zero limbs in the modulus, I’m not sure that’s the sort of thing we can and/or should care about. If any branch at all is a concern, I don’t see how e.g. the various loops aren’t a problem either. That said, I’m also perfectly fine with just dropping that commit, as the benefit is fairly small (and I’m not sure it’s even observable on all platforms we tested).

  194. Add secp256k1_ctz{32,64}_var functions
    These functions count the number of trailing zeroes in non-zero integers.
    de0a643c3d
  195. Add safegcd based modular inverse modules
    Refactored by: Pieter Wuille <pieter@wuille.net>
    8e415acba2
  196. Add extensive comments on the safegcd algorithm and implementation
    This adds a long comment explaining the algorithm and implementation choices by building
    it up step by step in Python.
    
    Comments in the code are also reworked/added, with references to the long explanation.
    d8a92fcc4c
  197. Add tests for modinv modules
    This adds tests for the modinv{32,64}_impl.h directly (before the functions are used
    inside the field/scalar code). It uses a naive implementation of modular multiplication
    and gcds in order to verify the modular inverses themselves.
    151aac00d3
  198. sipa force-pushed on Mar 8, 2021
  199. sipa commented at 5:59 pm on March 8, 2021: contributor
    Rebased on top of merged #901.
  200. in src/modinv32_impl.h:54 in 639769bee4 outdated
    49+            VERIFY_CHECK(am.v[i] >> 30 == 0);
    50+            VERIFY_CHECK(bm.v[i] >> 30 == 0);
    51+        }
    52+        if (am.v[i] < bm.v[i]) return -1;
    53+        if (am.v[i] > bm.v[i]) return 1;
    54+    }
    


    real-or-random commented at 9:44 am on March 10, 2021:

    The code doesn’t exactly hold the promise in the comment because the loop may terminate early. Maybe do the VERIFY_CHECKs in a separate loop before this one?

    (I mean it’s obvious from the implementation of secp256k1_modinv32_mul_30 but I like that you added the VERIFY_CHECKs here to check the contract with secp256k1_modinv32_mul_30.)


    sipa commented at 6:47 pm on March 11, 2021:
    Done, moved them to a separate loop.
  201. in src/modinv32_impl.h:40 in 639769bee4 outdated
    35+    c += (int64_t)a->v[8] * factor;
    36+    VERIFY_CHECK(c == (int32_t)c);
    37+    r->v[8] = (int32_t)c;
    38+}
    39+
    40+/* Compare a with b*factor. */
    


    real-or-random commented at 9:48 am on March 10, 2021:
    nit: maybe specify return value in a sentence.

    sipa commented at 6:47 pm on March 11, 2021:
    Done.
  202. in src/modinv32_impl.h:74 in 639769bee4 outdated
    65@@ -30,6 +66,17 @@ static void secp256k1_modinv32_normalize_30(secp256k1_modinv32_signed30 *r, int3
    66             r5 = r->v[5], r6 = r->v[6], r7 = r->v[7], r8 = r->v[8];
    67     int32_t cond_add, cond_negate;
    68 
    69+#ifdef VERIFY
    70+    /* Verify that all limbs are in range (-2^30,2^30). */
    71+    int i;
    72+    for (i = 0; i < 9; ++i) {
    73+        VERIFY_CHECK(r->v[i] >= -M30);
    74+        VERIFY_CHECK(r->v[i] <= M30);
    


    real-or-random commented at 10:07 am on March 10, 2021:
    (-2^30,2^30) or [-2^30,2^30]?

    sipa commented at 6:23 pm on March 11, 2021:
    (-2^30,2^30) is correct. It’s even stronger in practice, I believe: the bottom 8 limbs are in [0,2^30), and the top limb is in [-2^18,2^17) or so. Note that _normalize is only called on the d variable, and d is only ever updated through _update_de (which produces 8 output limbs masked by M30, and the range of the top limb can be reasoned about through the range of d itself).

    sipa commented at 6:48 pm on March 11, 2021:
    Also, note that M30 is 0x3fffffff, so it’s actually checking (-2^30,2^30).
  203. in src/modinv32_impl.h:158 in 639769bee4 outdated
    153+    VERIFY_CHECK(r5 >> 30 == 0);
    154+    VERIFY_CHECK(r6 >> 30 == 0);
    155+    VERIFY_CHECK(r7 >> 30 == 0);
    156+    VERIFY_CHECK(r8 >> 30 == 0);
    157+    VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(r, &modinfo->modulus, 0) >= 0); /* r >= 0 */
    158+    VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(r, &modinfo->modulus, 1) < 0); /* r < P */
    


    real-or-random commented at 10:15 am on March 10, 2021:
    nit: s/P/modulus

    sipa commented at 6:48 pm on March 11, 2021:
    Done.
  204. real-or-random commented at 10:21 am on March 10, 2021: contributor
    I think all comments apply analogously to the 64 bit code.
  205. in src/field_5x52_impl.h:565 in a71a582c46 outdated
    618-    for (j=0; j<3; j++) {
    619-        secp256k1_fe_sqr(&x223, &x223);
    620-    }
    621-    secp256k1_fe_mul(&x223, &x223, &x3);
    622+#ifdef VERIFY
    623+    VERIFY_CHECK(secp256k1_fe_normalizes_to_zero(r) == zero_in);
    


    real-or-random commented at 11:53 am on March 10, 2021:
    I think this could be written without the aux variable zero_in, which would make it possible to get rid of the three #ifdef blocks here. (Even if we keep zero_in, the single #ifdef here is not necessary.)

    sipa commented at 6:50 pm on March 11, 2021:
    Done. This works in the field code, because there is a temporary copy. In the scalar code that’s not the case because we can’t assume r and x don’t alias.
  206. in src/scalar_4x64_impl.h:813 in a71a582c46 outdated
    994-    secp256k1_scalar_sqr(&x8, &x6);
    995-    secp256k1_scalar_sqr(&x8, &x8);
    996-    secp256k1_scalar_mul(&x8, &x8,  &x2);
    997+#ifdef VERIFY
    998+    VERIFY_CHECK(secp256k1_scalar_check_overflow(r) == 0);
    999+#endif
    


    real-or-random commented at 11:54 am on March 10, 2021:
    nit: no need for #ifdef here

    sipa commented at 6:50 pm on March 11, 2021:
    Done.
  207. in src/scalar_4x64_impl.h:822 in a71a582c46 outdated
    1013-        secp256k1_scalar_sqr(&x28, &x28);
    1014-    }
    1015-    secp256k1_scalar_mul(&x28, &x28, &x14);
    1016+#ifdef VERIFY
    1017+    VERIFY_CHECK(secp256k1_scalar_check_overflow(a) == 0);
    1018+#endif
    


    real-or-random commented at 11:55 am on March 10, 2021:
    same here

    sipa commented at 6:50 pm on March 11, 2021:
    Done.
  208. real-or-random commented at 12:29 pm on March 10, 2021: contributor

    a71a582c46fac1cff02a7855bb836ad2722e7dcc

    some of these comments apply to the other variants of the code (field vs scalar, 32 bits vs 64 bit, constant time vs _var…)

  209. in src/tests.c:2115 in 7cda96e813 outdated
    2445+
    2446+/* These tests test the following identities:
    2447+ *
    2448+ * for x==0: 1/x == 0
    2449+ * for x!=0: x*(1/x) == 1
    2450+ * for x!=0 and x!=1: 1/(1/x - 1) + 1 == -1/(x-1)
    


    real-or-random commented at 1:04 pm on March 10, 2021:
    Doesn’t this hold for x==1 too?

    sipa commented at 6:51 pm on March 11, 2021:
    No, if we follow 1/0=0 (as implemented by the modinv code), then the left hand side evaluates to 1/(1/1-1)+1 = 1/0+1 = 1, and the right hand side to -1/(1-1) = -(1/0) = 0.

    real-or-random commented at 7:01 pm on March 11, 2021:
    indeed, I had a mistake in my calculation
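    (A toy verification of that identity over GF(7), using the same 1/0 = 0 convention; inv7 is a hypothetical lookup table, not code from the PR:)

        /* Inverses mod 7, with inv7[0] = 0 by the 1/0 = 0 convention. */
        static const int inv7[7] = {0, 1, 4, 5, 2, 3, 6};
        int x;
        for (x = 2; x < 7; ++x) { /* skip x == 0 and x == 1 */
            int lhs = (inv7[(inv7[x] + 6) % 7] + 1) % 7; /* 1/(1/x - 1) + 1 */
            int rhs = (7 - inv7[(x + 6) % 7]) % 7;       /* -1/(x - 1)     */
            CHECK(lhs == rhs);
        }
        /* For x == 1 the identity indeed fails: lhs = 1 but rhs = 0. */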
  210. real-or-random commented at 1:23 pm on March 10, 2021: contributor
    ACK mod my comments
  211. Improve bounds checks in modinv modules
    This commit adds functions to verify and compare numbers in signed{30,62} notation,
    and uses that to do more extensive bounds checking on various variables in the modinv
    code.
    08d54964e5
  212. Move secp256k1_scalar_{inverse{_var},is_even} to per-impl files
    This temporarily duplicates the inversion code across the 4x64 and 8x32
    implementations. Those implementations will be replaced in a later commit.
    aa404d53be
  213. Move secp256k1_fe_inverse{_var} to per-impl files
    This temporarily duplicates the inversion code across the 5x52 and 10x26
    implementations. Those implementations will be replaced in a later commit.
    436281afdc
  214. sipa force-pushed on Mar 11, 2021
  215. sipa commented at 6:46 pm on March 11, 2021: contributor

    Addressed @real-or-random’s comments:

      0diff --git a/src/field_10x26_impl.h b/src/field_10x26_impl.h
      1index 1a33cd81..c2802514 100644
      2--- a/src/field_10x26_impl.h
      3+++ b/src/field_10x26_impl.h
      4@@ -1230,43 +1230,27 @@ static const secp256k1_modinv32_modinfo secp256k1_const_modinfo_fe = {
      5 static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *x) {
      6     secp256k1_fe tmp;
      7     secp256k1_modinv32_signed30 s;
      8-#ifdef VERIFY
      9-    int zero_in;
     10-#endif
     11 
     12     tmp = *x;
     13     secp256k1_fe_normalize(&tmp);
     14-#ifdef VERIFY
     15-    zero_in = secp256k1_fe_normalizes_to_zero(&tmp);
     16-#endif
     17     secp256k1_fe_to_signed30(&s, &tmp);
     18     secp256k1_modinv32(&s, &secp256k1_const_modinfo_fe);
     19     secp256k1_fe_from_signed30(r, &s);
     20 
     21-#ifdef VERIFY
     22-    VERIFY_CHECK(secp256k1_fe_normalizes_to_zero(r) == zero_in);
     23-#endif
     24+    VERIFY_CHECK(secp256k1_fe_normalizes_to_zero(r) == secp256k1_fe_normalizes_to_zero(&tmp));
     25 }
     26 
     27 static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *x) {
     28     secp256k1_fe tmp;
     29     secp256k1_modinv32_signed30 s;
     30-#ifdef VERIFY
     31-    int zero_in;
     32-#endif
     33 
     34     tmp = *x;
     35     secp256k1_fe_normalize_var(&tmp);
     36-#ifdef VERIFY
     37-    zero_in = secp256k1_fe_normalizes_to_zero(&tmp);
     38-#endif
     39     secp256k1_fe_to_signed30(&s, &tmp);
     40     secp256k1_modinv32_var(&s, &secp256k1_const_modinfo_fe);
     41     secp256k1_fe_from_signed30(r, &s);
     42 
     43-#ifdef VERIFY
     44-    VERIFY_CHECK(secp256k1_fe_normalizes_to_zero(r) == zero_in);
     45-#endif
     46+    VERIFY_CHECK(secp256k1_fe_normalizes_to_zero(r) == secp256k1_fe_normalizes_to_zero(&tmp));
     47 }
     48 
     49 #endif /* SECP256K1_FIELD_REPR_IMPL_H */
     50diff --git a/src/field_5x52_impl.h b/src/field_5x52_impl.h
     51index b08a087f..56e71bc9 100644
     52--- a/src/field_5x52_impl.h
     53+++ b/src/field_5x52_impl.h
     54@@ -548,43 +548,27 @@ static const secp256k1_modinv64_modinfo secp256k1_const_modinfo_fe = {
     55 static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *x) {
     56     secp256k1_fe tmp;
     57     secp256k1_modinv64_signed62 s;
     58-#ifdef VERIFY
     59-    int zero_in;
     60-#endif
     61 
     62     tmp = *x;
     63     secp256k1_fe_normalize(&tmp);
     64-#ifdef VERIFY
     65-    zero_in = secp256k1_fe_normalizes_to_zero(&tmp);
     66-#endif
     67     secp256k1_fe_to_signed62(&s, &tmp);
     68     secp256k1_modinv64(&s, &secp256k1_const_modinfo_fe);
     69     secp256k1_fe_from_signed62(r, &s);
     70 
     71-#ifdef VERIFY
     72-    VERIFY_CHECK(secp256k1_fe_normalizes_to_zero(r) == zero_in);
     73-#endif
     74+    VERIFY_CHECK(secp256k1_fe_normalizes_to_zero(r) == secp256k1_fe_normalizes_to_zero(&tmp));
     75 }
     76 
     77 static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *x) {
     78     secp256k1_fe tmp;
     79     secp256k1_modinv64_signed62 s;
     80-#ifdef VERIFY
     81-    int zero_in;
     82-#endif
     83 
     84     tmp = *x;
     85     secp256k1_fe_normalize_var(&tmp);
     86-#ifdef VERIFY
     87-    zero_in = secp256k1_fe_normalizes_to_zero(&tmp);
     88-#endif
     89     secp256k1_fe_to_signed62(&s, &tmp);
     90     secp256k1_modinv64_var(&s, &secp256k1_const_modinfo_fe);
     91     secp256k1_fe_from_signed62(r, &s);
     92 
     93-#ifdef VERIFY
     94-    VERIFY_CHECK(secp256k1_fe_normalizes_to_zero(r) == zero_in);
     95-#endif
     96+    VERIFY_CHECK(secp256k1_fe_normalizes_to_zero(r) == secp256k1_fe_normalizes_to_zero(&tmp));
     97 }
     98 
     99 #endif /* SECP256K1_FIELD_REPR_IMPL_H */
    100diff --git a/src/modinv32_impl.h b/src/modinv32_impl.h
    101index 3eba04fc..aa7988c4 100644
    102--- a/src/modinv32_impl.h
    103+++ b/src/modinv32_impl.h
    104@@ -37,18 +37,18 @@ static void secp256k1_modinv32_mul_30(secp256k1_modinv32_signed30 *r, const secp
    105     r->v[8] = (int32_t)c;
    106 }
    107 
    108-/* Compare a with b*factor. */
    109+/* Return -1 for a<b*factor, 0 for a==b*factor, 1 for a>b*factor. A consists of alen limbs; b has 9. */
    110 static int secp256k1_modinv32_mul_cmp_30(const secp256k1_modinv32_signed30 *a, int alen, const secp256k1_modinv32_signed30 *b, int32_t factor) {
    111     int i;
    112     secp256k1_modinv32_signed30 am, bm;
    113     secp256k1_modinv32_mul_30(&am, a, alen, 1); /* Normalize all but the top limb of a. */
    114     secp256k1_modinv32_mul_30(&bm, b, 9, factor);
    115+    for (i = 0; i < 8; ++i) {
    116+        /* Verify that all but the top limb of a and b are normalized. */
    117+        VERIFY_CHECK(am.v[i] >> 30 == 0);
    118+        VERIFY_CHECK(bm.v[i] >> 30 == 0);
    119+    }
    120     for (i = 8; i >= 0; --i) {
    121-        if (i != 8) {
    122-            /* Verify that all but the top limb of a and b are normalized. */
    123-            VERIFY_CHECK(am.v[i] >> 30 == 0);
    124-            VERIFY_CHECK(bm.v[i] >> 30 == 0);
    125-        }
    126         if (am.v[i] < bm.v[i]) return -1;
    127         if (am.v[i] > bm.v[i]) return 1;
    128     }
    129@@ -155,7 +155,7 @@ static void secp256k1_modinv32_normalize_30(secp256k1_modinv32_signed30 *r, int3
    130     VERIFY_CHECK(r7 >> 30 == 0);
    131     VERIFY_CHECK(r8 >> 30 == 0);
    132     VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(r, 9, &modinfo->modulus, 0) >= 0); /* r >= 0 */
    133-    VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(r, 9, &modinfo->modulus, 1) < 0); /* r < P */
    134+    VERIFY_CHECK(secp256k1_modinv32_mul_cmp_30(r, 9, &modinfo->modulus, 1) < 0); /* r < modulus */
    135 #endif
    136 }
    137 
    138diff --git a/src/modinv64_impl.h b/src/modinv64_impl.h
    139index e1ba8b63..78505fa1 100644
    140--- a/src/modinv64_impl.h
    141+++ b/src/modinv64_impl.h
    142@@ -43,18 +43,18 @@ static void secp256k1_modinv64_mul_62(secp256k1_modinv64_signed62 *r, const secp
    143     r->v[4] = (int64_t)c;
    144 }
    145 
    146-/* Compare a with b*bf. */
     147+/* Return -1 for a<b*factor, 0 for a==b*factor, 1 for a>b*factor. A consists of alen limbs; b has 5. */
    148 static int secp256k1_modinv64_mul_cmp_62(const secp256k1_modinv64_signed62 *a, int alen, const secp256k1_modinv64_signed62 *b, int64_t factor) {
    149     int i;
    150     secp256k1_modinv64_signed62 am, bm;
    151     secp256k1_modinv64_mul_62(&am, a, alen, 1); /* Normalize all but the top limb of a. */
    152     secp256k1_modinv64_mul_62(&bm, b, 5, factor);
    153+    for (i = 0; i < 4; ++i) {
    154+        /* Verify that all but the top limb of a and b are normalized. */
    155+        VERIFY_CHECK(am.v[i] >> 62 == 0);
    156+        VERIFY_CHECK(bm.v[i] >> 62 == 0);
    157+    }
    158     for (i = 4; i >= 0; --i) {
    159-        if (i != 4) {
    160-            /* Verify that all but the top limb of a and b are normalized. */
    161-            VERIFY_CHECK(am.v[i] >> 62 == 0);
    162-            VERIFY_CHECK(bm.v[i] >> 62 == 0);
    163-        }
    164         if (am.v[i] < bm.v[i]) return -1;
    165         if (am.v[i] > bm.v[i]) return 1;
    166     }
    167@@ -132,7 +132,7 @@ static void secp256k1_modinv64_normalize_62(secp256k1_modinv64_signed62 *r, int6
    168     VERIFY_CHECK(r3 >> 62 == 0);
    169     VERIFY_CHECK(r4 >> 62 == 0);
    170     VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(r, 5, &modinfo->modulus, 0) >= 0); /* r >= 0 */
    171-    VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(r, 5, &modinfo->modulus, 1) < 0); /* r < P */
    172+    VERIFY_CHECK(secp256k1_modinv64_mul_cmp_62(r, 5, &modinfo->modulus, 1) < 0); /* r < modulus */
    173 #endif
    174 }
    175 
    176diff --git a/src/scalar_4x64_impl.h b/src/scalar_4x64_impl.h
    177index a1def26f..f8f37ff8 100644
    178--- a/src/scalar_4x64_impl.h
    179+++ b/src/scalar_4x64_impl.h
    180@@ -808,18 +808,14 @@ static void secp256k1_scalar_from_signed62(secp256k1_scalar *r, const secp256k1_
    181     r->d[2] = a2 >> 4 | a3 << 58;
    182     r->d[3] = a3 >> 6 | a4 << 56;
    183 
    184-#ifdef VERIFY
    185     VERIFY_CHECK(secp256k1_scalar_check_overflow(r) == 0);
    186-#endif
    187 }
    188 
    189 static void secp256k1_scalar_to_signed62(secp256k1_modinv64_signed62 *r, const secp256k1_scalar *a) {
    190     const uint64_t M62 = UINT64_MAX >> 2;
    191     const uint64_t a0 = a->d[0], a1 = a->d[1], a2 = a->d[2], a3 = a->d[3];
    192 
    193-#ifdef VERIFY
    194     VERIFY_CHECK(secp256k1_scalar_check_overflow(a) == 0);
    195-#endif
    196 
    197     r->v[0] =  a0                   & M62;
    198     r->v[1] = (a0 >> 62 | a1 <<  2) & M62;
    199diff --git a/src/scalar_8x32_impl.h b/src/scalar_8x32_impl.h
    200index 62c7ae71..3282c68d 100644
    201--- a/src/scalar_8x32_impl.h
    202+++ b/src/scalar_8x32_impl.h
    203@@ -670,9 +670,7 @@ static void secp256k1_scalar_from_signed30(secp256k1_scalar *r, const secp256k1_
    204     r->d[6] = a6 >> 12 | a7 << 18;
    205     r->d[7] = a7 >> 14 | a8 << 16;
    206 
    207-#ifdef VERIFY
    208     VERIFY_CHECK(secp256k1_scalar_check_overflow(r) == 0);
    209-#endif
    210 }
    211 
    212 static void secp256k1_scalar_to_signed30(secp256k1_modinv32_signed30 *r, const secp256k1_scalar *a) {
    213@@ -680,9 +678,7 @@ static void secp256k1_scalar_to_signed30(secp256k1_modinv32_signed30 *r, const s
    214     const uint32_t a0 = a->d[0], a1 = a->d[1], a2 = a->d[2], a3 = a->d[3],
    215                    a4 = a->d[4], a5 = a->d[5], a6 = a->d[6], a7 = a->d[7];
    216 
    217-#ifdef VERIFY
    218     VERIFY_CHECK(secp256k1_scalar_check_overflow(a) == 0);
    219-#endif
    220 
    221     r->v[0] =  a0                   & M30;
    222     r->v[1] = (a0 >> 30 | a1 <<  2) & M30;
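
     For intuition, here is a minimal standalone sketch of the 4x64 <-> signed62 repacking shown in this diff. The helper names are hypothetical; the real conversions live in scalar_4x64_impl.h, additionally check scalar overflow, and use int64_t limbs with a signed top limb (unsigned limbs are used here to keep the toy simple):

        #include <assert.h>
        #include <stdint.h>

        /* Pack four 64-bit limbs into five 62-bit limbs. */
        static void to_62(uint64_t v[5], const uint64_t d[4]) {
            const uint64_t M62 = UINT64_MAX >> 2;
            v[0] =  d[0]                    & M62;
            v[1] = (d[0] >> 62 | d[1] << 2) & M62;
            v[2] = (d[1] >> 60 | d[2] << 4) & M62;
            v[3] = (d[2] >> 58 | d[3] << 6) & M62;
            v[4] =  d[3] >> 56;
        }

        /* Inverse of to_62: reassemble the four 64-bit limbs. */
        static void from_62(uint64_t d[4], const uint64_t v[5]) {
            d[0] = v[0]      | v[1] << 62;
            d[1] = v[1] >> 2 | v[2] << 60;
            d[2] = v[2] >> 4 | v[3] << 58;
            d[3] = v[3] >> 6 | v[4] << 56;
        }

        int main(void) {
            const uint64_t in[4] = { 0x123456789abcdef0ULL, 0xfedcba9876543210ULL,
                                     0x0f1e2d3c4b5a6978ULL, 0x8796a5b4c3d2e1f0ULL };
            uint64_t v[5], out[4];
            int i;
            to_62(v, in);
            from_62(out, v);
            for (i = 0; i < 4; ++i) assert(out[i] == in[i]);  /* lossless round trip */
            return 0;
        }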
    
  216. sipa commented at 7:27 pm on March 11, 2021: contributor
    FWIW, ACK the parts of the code by @peterdettman here. I assume that’s implied by it being my PR, but still - most of the non-test code isn’t mine here.
  217. Make field/scalar code use the new modinv modules for inverses 1e0e885c8a
  218. Improve field/scalar inverse tests
     Add a new run_inverse_tests that replaces all existing field/scalar inverse tests.
     It checks a few identities for fixed inputs, small numbers (-999...999), and random
     inputs (structured and unstructured), and compares with the output of
     secp256k1_fe_inv_all_var.
    aa9cc52180
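     The identities in question are of the kind sketched below, shown here over a toy prime field (p = 65537, chosen arbitrarily for illustration; the real tests operate on secp256k1 field/scalar elements):

        #include <assert.h>
        #include <stdint.h>

        #define P 65537U  /* small prime, for demonstration only */

        /* Modular inverse via Fermat's little theorem; returns 0 for
         * input 0, matching the library's convention. */
        static uint32_t inv(uint32_t x) {
            uint32_t r = 1, b = x % P, e = P - 2;
            if (b == 0) return 0;
            while (e) {
                if (e & 1) r = (uint64_t)r * b % P;
                b = (uint64_t)b * b % P;
                e >>= 1;
            }
            return r;
        }

        int main(void) {
            uint32_t x;
            assert(inv(0) == 0);                  /* inv(0) == 0 */
            for (x = 1; x < 1000; ++x) {
                uint32_t i = inv(x);
                assert((uint64_t)x * i % P == 1); /* x * inv(x) == 1 */
                assert(inv(i) == x);              /* inv(inv(x)) == x */
                assert(inv(P - x) == P - i);      /* inv(-x) == -inv(x) */
            }
            return 0;
        }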
  219. Remove unused scalar_sqr 5437e7bdfb
  220. Remove unused Jacobi symbol support
    No exposed functions rely on Jacobi symbol computation anymore. Remove it; it can always
     be brought back later if needed.
    20448b8d09
  221. sipa force-pushed on Mar 12, 2021
  222. sipa commented at 6:07 pm on March 12, 2021: contributor

     To make sure the unused evaluations of expressions inside VERIFY_CHECKs don’t affect runtime, I made this change:

     0diff --git a/src/field_5x52_impl.h b/src/field_5x52_impl.h
     1index 56e71bc9..b73cfea2 100644
     2--- a/src/field_5x52_impl.h
     3+++ b/src/field_5x52_impl.h
     4@@ -555,7 +555,9 @@ static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *x) {
     5     secp256k1_modinv64(&s, &secp256k1_const_modinfo_fe);
     6     secp256k1_fe_from_signed62(r, &s);
     7 
     8+#ifdef VERIFY
     9     VERIFY_CHECK(secp256k1_fe_normalizes_to_zero(r) == secp256k1_fe_normalizes_to_zero(&tmp));
    10+#endif
    11 }
    12 
    13 static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *x) {
    14@@ -568,7 +570,9 @@ static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *x) {
    15     secp256k1_modinv64_var(&s, &secp256k1_const_modinfo_fe);
    16     secp256k1_fe_from_signed62(r, &s);
    17 
    18+#ifdef VERIFY
    19     VERIFY_CHECK(secp256k1_fe_normalizes_to_zero(r) == secp256k1_fe_normalizes_to_zero(&tmp));
    20+#endif
    21 }
    22 
    23 #endif /* SECP256K1_FIELD_REPR_IMPL_H */
    24diff --git a/src/scalar_4x64_impl.h b/src/scalar_4x64_impl.h
    25index f8f37ff8..a1def26f 100644
    26--- a/src/scalar_4x64_impl.h
    27+++ b/src/scalar_4x64_impl.h
    28@@ -808,14 +808,18 @@ static void secp256k1_scalar_from_signed62(secp256k1_scalar *r, const secp256k1_
    29     r->d[2] = a2 >> 4 | a3 << 58;
    30     r->d[3] = a3 >> 6 | a4 << 56;
    31 
    32+#ifdef VERIFY
    33     VERIFY_CHECK(secp256k1_scalar_check_overflow(r) == 0);
    34+#endif
    35 }
    36 
    37 static void secp256k1_scalar_to_signed62(secp256k1_modinv64_signed62 *r, const secp256k1_scalar *a) {
    38     const uint64_t M62 = UINT64_MAX >> 2;
    39     const uint64_t a0 = a->d[0], a1 = a->d[1], a2 = a->d[2], a3 = a->d[3];
    40 
    41+#ifdef VERIFY
    42     VERIFY_CHECK(secp256k1_scalar_check_overflow(a) == 0);
    43+#endif
    44 
    45     r->v[0] =  a0                   & M62;
    46     r->v[1] = (a0 >> 62 | a1 <<  2) & M62;
    47diff --git a/src/scalar_8x32_impl.h b/src/scalar_8x32_impl.h
    48index 3282c68d..62c7ae71 100644
    49--- a/src/scalar_8x32_impl.h
    50+++ b/src/scalar_8x32_impl.h
    51@@ -670,7 +670,9 @@ static void secp256k1_scalar_from_signed30(secp256k1_scalar *r, const secp256k1_
    52     r->d[6] = a6 >> 12 | a7 << 18;
    53     r->d[7] = a7 >> 14 | a8 << 16;
    54 
    55+#ifdef VERIFY
    56     VERIFY_CHECK(secp256k1_scalar_check_overflow(r) == 0);
    57+#endif
    58 }
    59 
    60 static void secp256k1_scalar_to_signed30(secp256k1_modinv32_signed30 *r, const secp256k1_scalar *a) {
    61@@ -678,7 +680,9 @@ static void secp256k1_scalar_to_signed30(secp256k1_modinv32_signed30 *r, const s
    62     const uint32_t a0 = a->d[0], a1 = a->d[1], a2 = a->d[2], a3 = a->d[3],
    63                    a4 = a->d[4], a5 = a->d[5], a6 = a->d[6], a7 = a->d[7];
    64 
    65+#ifdef VERIFY
    66     VERIFY_CHECK(secp256k1_scalar_check_overflow(a) == 0);
    67+#endif
    68 
    69     r->v[0] =  a0                   & M30;
    70     r->v[1] = (a0 >> 30 | a1 <<  2) & M30;
    

    See also #902.
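
     The reason the extra #ifdef matters is the macro's non-VERIFY shape. Roughly (see src/util.h for the real definitions; the sketch below mirrors their structure as an assumption):

        #include <assert.h>
        #include <stdio.h>

        #ifdef VERIFY
        #define VERIFY_CHECK(cond) do { assert(cond); } while (0)
        #else
        #define VERIFY_CHECK(cond) do { (void)(cond); } while (0)
        #endif

        static int calls = 0;

        /* Stand-in for an expensive check like secp256k1_fe_normalizes_to_zero. */
        static int expensive_check(void) { ++calls; return 1; }

        int main(void) {
            VERIFY_CHECK(expensive_check() == 1);
            /* Even in non-VERIFY builds, (void)(cond) still evaluates its
             * operand, so without optimization the call above may run. */
            printf("calls = %d\n", calls);

        #ifdef VERIFY
            VERIFY_CHECK(expensive_check() == 1);
        #endif
            /* The #ifdef guard guarantees the expression is not even
             * compiled in non-VERIFY builds, at any optimization level. */
            printf("calls = %d\n", calls);
            return 0;
        }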

  223. sipa commented at 6:59 pm on March 12, 2021: contributor
    @real-or-random, @sanket1729, maybe @peterdettman: feel like reviewing the final state things are in now?
  224. real-or-random commented at 3:46 pm on March 15, 2021: contributor
    Can you remove libgmp-dev and libgmp-dev:i386 from ci/linux-debian.Dockerfile?
  225. real-or-random commented at 4:41 pm on March 15, 2021: contributor
     Branch coverage looks good, by the way! [branch coverage screenshot]
  226. Remove num/gmp support
    The whole "num" API and its libgmp-based implementation are now unused. Remove them.
    1f233b3fa0
  227. Optimization: special-case zero modulus limbs in modinv64
     Both the field and scalar moduli can be written in signed{30,62} notation
     with one or more zero limbs. Make use of this in the update_de function to
     avoid a few wide multiplications when that is the case.
    
    This doesn't appear to be a win in the 32-bit implementation, so only
    do it for the 64-bit one.
    9164a1b658
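     For the field modulus 2^256 - 2^32 - 977, the signed62 form is (if memory serves) {-0x1000003D1, 0, 0, 0, 256}, so three of the five per-limb products vanish. A sketch of the shape of the branch (not the actual update_de carry chain), assuming GCC/Clang's __int128:

        #include <stdint.h>

        /* One limb's modulus contribution in an update_de-style carry
         * chain: the wide 64x64->128 multiply is skipped when the
         * (constant) modulus limb is zero, a branch the predictor
         * resolves essentially for free. */
        static __int128 add_modulus_limb(__int128 cd, int64_t modulus_limb, int64_t md) {
            if (modulus_limb != 0) {
                cd += (__int128)modulus_limb * md;
            }
            return cd;
        }

     Presumably the branch costs more than the narrower 32x32->64 multiply it would save, which would explain why the commit restricts this to the 64-bit code.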
  228. Optimization: use formulas instead of lookup tables for cancelling g bits
    This only seems to be a win on 64-bit platforms, so only do it there.
    
    Refactored by: Pieter Wuille <pieter@wuille.net>
    b306935ac1
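     One formula with the right shape: for odd f, Newton's iteration lifts f^-1 == f (mod 2^3) to f^-1 == f*(2 - f*f) (mod 2^6), so w = f*(f*f - 2) == -f^-1 (mod 2^6) can stand in for a table of negated inverses. A brute-force check of that identity (a sketch of the idea, not the library's divsteps code):

        #include <assert.h>
        #include <stdint.h>

        int main(void) {
            uint64_t f, g;
            for (f = 1; f < 64; f += 2) {             /* every odd residue mod 2^6 */
                uint64_t w = (f * (f * f - 2)) & 63;  /* w == -f^-1 mod 2^6 */
                assert(((f * w) & 63) == 63);         /* f*w == -1 mod 2^6 */
                for (g = 0; g < 64; ++g) {
                    uint64_t m = (g * w) & 63;
                    assert(((g + f * m) & 63) == 0);  /* bottom 6 bits of g cancel */
                }
            }
            return 0;
        }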
  229. Optimization: track f,g limb count and pass to new variable-time update_fg_var
     The magnitude of the f and g variables generally goes down as the algorithm
     progresses. Make use of this by keeping track of how many limbs are in use,
     and when that number becomes small enough, reduce the complexity of the
     arithmetic on them.
    
    Refactored by: Pieter Wuille <pieter@wuille.net>
    ebc1af700f
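     A simplified rendering of the shrinking step (hypothetical toy type; the real update_fg_var shrinks f and g together inside the divsteps loop): when the top limb is pure sign extension (0 or -1), its sign can be folded into the limb below and the limb dropped.

        #include <stdint.h>

        typedef struct { int64_t v[5]; } toy_signed62;  /* hypothetical */

        /* Drop redundant top limbs of a signed62 number. Non-top limbs
         * hold 62 bits; the top limb is signed. A top limb of 0 or -1
         * carries no information beyond its sign, which fits in bits 62+
         * of the limb below, so the active length can shrink and later
         * limb loops run shorter. */
        static int shrink_len(toy_signed62 *a, int len) {
            while (len > 1) {
                int64_t top = a->v[len - 1];
                if (top != 0 && top != -1) break;  /* not pure sign extension */
                a->v[len - 2] |= (int64_t)((uint64_t)top << 62);
                --len;
            }
            return len;
        }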
  230. Make scalar_inverse{,_var} benchmark scale with SECP256K1_BENCH_ITERS 24ad04fc06
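     The usual pattern for that (bench.h has a helper along these lines; the exact parsing may differ):

        #include <stdlib.h>

        /* Scale benchmark iteration counts by an environment variable,
         * falling back to the compiled-in default. */
        static int get_iters(int default_iters) {
            char *env = getenv("SECP256K1_BENCH_ITERS");
            return env ? (int)strtol(env, NULL, 0) : default_iters;
        }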
  231. sipa force-pushed on Mar 15, 2021
  232. sipa commented at 8:02 pm on March 15, 2021: contributor

    Can you remove libgmp-dev and libgmp-dev:i386 from ci/linux-debian.Dockerfile?

    Done.

  233. real-or-random approved
  234. real-or-random commented at 1:49 pm on March 16, 2021: contributor
    ACK 24ad04fc064e71abdf973e061c30eb1f3f78db39 careful code review, some testing
  235. gmaxwell commented at 7:13 pm on March 16, 2021: contributor
    ACK 24ad04fc064e71abdf973e061c30eb1f3f78db39
  236. sanket1729 commented at 9:27 pm on March 16, 2021: none
    ACK 24ad04fc064e71abdf973e061c30eb1f3f78db39
  237. sipa merged this on Mar 18, 2021
  238. sipa closed this on Mar 18, 2021

  239. dependabot[bot] cross-referenced this on Mar 18, 2021 from issue Bump Sources/bindings/secp256k1 from `4c3ba88` to `26de4df` by dependabot[bot]
  240. sipa cross-referenced this on Mar 24, 2021 from issue Randomly message "core dumped" on secp256k1_fe_inv_var by CrunchyFanch
  241. real-or-random cross-referenced this on Mar 24, 2021 from issue Add native num implementation; modular inverse and Jacobi symbol without GMP by apoelstra
  242. sipa cross-referenced this on Apr 2, 2021 from issue Update libsecp256k1 subtree to latest master by sipa
  243. sipa cross-referenced this on Apr 4, 2021 from issue Safegcd-based modular inverses in MuHash3072 by sipa
  244. Fabcien referenced this in commit b1267645a3 on Apr 14, 2021
  245. deadalnix referenced this in commit 24b590d385 on Apr 14, 2021
  246. deadalnix referenced this in commit 76ecd0d3ed on Apr 14, 2021
  247. Fabcien referenced this in commit 11d688e4ac on Apr 14, 2021
  248. Fabcien referenced this in commit 1080f989e5 on Apr 14, 2021
  249. Fabcien referenced this in commit a49b5685e0 on Apr 14, 2021
  250. Fabcien referenced this in commit 1faa18ac92 on Apr 14, 2021
  251. Fabcien referenced this in commit 4e1e6025ba on Apr 14, 2021
  252. Fabcien referenced this in commit 3ffb0677bf on Apr 14, 2021
  253. Fabcien referenced this in commit 88fe47b0c5 on Apr 14, 2021
  254. Fabcien referenced this in commit 1a96495594 on Apr 14, 2021
  255. Fabcien referenced this in commit 1893050e04 on Apr 14, 2021
  256. Fabcien referenced this in commit f66a53cac7 on Apr 14, 2021
  257. Fabcien referenced this in commit c39b913ab7 on Apr 14, 2021
  258. Fabcien referenced this in commit 6c967a512a on Apr 14, 2021
  259. Fabcien referenced this in commit 6c3afe4619 on Apr 14, 2021
  260. deadalnix referenced this in commit fcf9750a56 on Apr 14, 2021
  261. deadalnix referenced this in commit 24da9eb90f on Apr 14, 2021
  262. deadalnix referenced this in commit 52b982a7df on Apr 14, 2021
  263. deadalnix referenced this in commit 9f5afc1921 on Apr 15, 2021
  264. deadalnix referenced this in commit df7744ce12 on Apr 15, 2021
  265. deadalnix referenced this in commit df3124c6ed on Apr 15, 2021
  266. deadalnix referenced this in commit 332827cc82 on Apr 15, 2021
  267. deadalnix referenced this in commit 524ab786b2 on Apr 15, 2021
  268. fanquake cross-referenced this on May 6, 2021 from issue [bitcoin-core] Add differential cryptography fuzzer by guidovranken
  269. laanwj referenced this in commit 359f72105b on Jun 7, 2021
  270. sipa cross-referenced this on Jun 21, 2021 from issue Add random field multiply/square tests by sipa
  271. AdamISZ cross-referenced this on Jul 27, 2021 from issue secp256k1 version bump by kristapsk
  272. UdjinM6 referenced this in commit bc61867454 on Aug 10, 2021
  273. 5tefan referenced this in commit dd45c616b6 on Aug 12, 2021
  274. hebasto cross-referenced this on Jan 19, 2023 from issue Drop no longer used variables from the build system by hebasto
  275. sipa referenced this in commit ad7433b140 on Jan 19, 2023
