Yuval Yarom pointed us to this presentation https://cryptojedi.org/peter/data/chesrump-20130822.pdf, which suggests that on at least some hardware uniform cacheline access is not enough to avoid a timing leak, presumably because the loads have multi-cycle latencies and bytes earlier in the line are made available earlier (?).
The obvious defense is to read all of the bytes and then extract the data we want; since we were already counting on cacheline colocation to pull in every byte, doing the reads explicitly shouldn't require any more memory bandwidth.
A kludgy implementation, given our current memory layout and typedefs (pardon the type punning), is:
diff --git a/src/ecmult_gen_impl.h b/src/ecmult_gen_impl.h
index 07859ab..157ab05 100644
--- a/src/ecmult_gen_impl.h
+++ b/src/ecmult_gen_impl.h
@@ -104,14 +104,32 @@ static void secp256k1_ecmult_gen(secp256k1_gej_t *r, const secp256k1_scalar_t *g
     const secp256k1_ecmult_gen_consts_t *c = secp256k1_ecmult_gen_consts;
     secp256k1_gej_set_infinity(r);
     secp256k1_ge_t add;
-    int bits;
+    uint32_t bits;
+    uint32_t bpos;
     for (int j=0; j<64; j++) {
         bits = secp256k1_scalar_get_bits(gn, j * 4, 4);
+        bpos = (1<<((bits&3)<<3))*255;
+        int b2 = (bits>>2)&1;
+        int b3 = (bits>>3)&1;
+        uint32_t maska = bpos*!(b2|b3);
+        uint32_t maskb = bpos*(b2&!b3);
+        uint32_t maskc = bpos*(b3&!b2);
+        uint32_t maskd = bpos*(b2&b3);
+        bits = (bits&3)<<3;
         for (size_t k=0; k<sizeof(secp256k1_ge_t); k++)
-            ((unsigned char*)(&add))[k] = c->prec[j][k][bits];
+        {
+            uint32_t *p;
+            p = (uint32_t *)c->prec[j][k];
+            uint32_t ra = maska & p[0];
+            uint32_t rb = maskb & p[1];
+            uint32_t rc = maskc & p[2];
+            uint32_t rd = maskd & p[3];
+            ((unsigned char*)(&add))[k] = (ra|rb|rc|rd)>>bits;
+        }
         secp256k1_gej_add_ge(r, r, &add);
     }
     bits = 0;
+    bpos = 0;
     secp256k1_ge_clear(&add);
 }
Is something like this something we’d like to do?