I suggest the following optimizations in the scalar_4x64_impl.h file:
CHANGE:
#define muladd_fast(a,b) { \
uint64_t tl, th; \
{ \
uint128_t t = (uint128_t)a * b; \
th = t >> 64; /* at most 0xFFFFFFFFFFFFFFFE */ \
tl = t; \
} \
c0 += tl; /* overflow is handled on the next line */ \
th += (c0 < tl) ? 1 : 0; /* at most 0xFFFFFFFFFFFFFFFF */ \
c1 += th; /* never overflows by contract (verified in the next line) */ \
VERIFY_CHECK(c1 >= th); \
}
TO:
/* Multiply the 64-bit values a and b and accumulate the 128-bit product into
 * the two-limb accumulator (c0,c1), folding c0 into the 128-bit add directly.
 * The sum (2^64-1)^2 + (2^64-1) = 2^128 - 2^64 fits in uint128_t, so the
 * combined multiply-add cannot overflow. By contract the carry into c1 cannot
 * overflow either (asserted via VERIFY_CHECK). */
#define muladd_fast(a,b) { \
    uint128_t t = (uint128_t)(a) * (b) + c0; \
    c0 = (uint64_t)t; \
    c1 += (uint64_t)(t >> 64); /* never overflows by contract (verified in the next line) */ \
    VERIFY_CHECK(c1 >= (uint64_t)(t >> 64)); \
}
The same carry-propagation pattern is repeated in several other macros in this file, so the equivalent simplification (folding c0 into the 128-bit multiply-add) applies to each of them as well.