I suggest the following optimization in the scalar_4x64_impl.h file:
CHANGE:
#define muladd_fast(a,b) { \
    uint64_t tl, th; \
    { \
        uint128_t t = (uint128_t)a * b; \
        th = t >> 64;         /* at most 0xFFFFFFFFFFFFFFFE */ \
        tl = t; \
    } \
    c0 += tl;                 /* overflow is handled on the next line */ \
    th += (c0 < tl) ? 1 : 0;  /* at most 0xFFFFFFFFFFFFFFFF */ \
    c1 += th;                 /* never overflows by contract (verified in the next line) */ \
    VERIFY_CHECK(c1 >= th); \
}
TO:
#define muladd_fast(a,b) { \
    uint128_t t = (uint128_t)a * b + c0;  /* cannot overflow: (2^64-1)^2 + (2^64-1) < 2^128 */ \
    c0 = (uint64_t)t; \
    c1 += (uint64_t)(t >> 64);  /* never overflows by contract (verified in the next line) */ \
    VERIFY_CHECK(c1 >= (uint64_t)(t >> 64)); \
}
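
Why this is equivalent: in the old sequence the total carry into c1 is the high 64 bits of a*b plus the carry out of c0 += tl, which together equal ((uint128_t)a * b + c0) >> 64, exactly what the new sequence adds. Below is a minimal standalone harness (my own sketch, not part of the library) that checks the two sequences agree on boundary values; it assumes a GCC/Clang target where uint128_t maps to unsigned __int128, and it substitutes assert for VERIFY_CHECK:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

typedef unsigned __int128 uint128_t; /* assumption: GCC/Clang extension */

/* Original carry sequence, wrapped in a function for testing. */
static void muladd_fast_old(uint64_t a, uint64_t b, uint64_t *c0, uint64_t *c1) {
    uint64_t tl, th;
    uint128_t t = (uint128_t)a * b;
    th = (uint64_t)(t >> 64);
    tl = (uint64_t)t;
    *c0 += tl;
    th += (*c0 < tl) ? 1 : 0;
    *c1 += th;
    assert(*c1 >= th);
}

/* Proposed carry sequence. */
static void muladd_fast_new(uint64_t a, uint64_t b, uint64_t *c0, uint64_t *c1) {
    uint128_t t = (uint128_t)a * b + *c0;
    uint64_t th = (uint64_t)(t >> 64);
    *c0 = (uint64_t)t;
    *c1 += th;
    assert(*c1 >= th);
}

int main(void) {
    const uint64_t v[] = { 0, 1, 2, 0xFFFFFFFFULL,
                           0xFFFFFFFFFFFFFFFEULL, 0xFFFFFFFFFFFFFFFFULL };
    const size_t n = sizeof(v) / sizeof(v[0]);
    size_t i, j, k, l;
    for (i = 0; i < n; i++)
    for (j = 0; j < n; j++)
    for (k = 0; k < n; k++)
    for (l = 0; l < n; l++) {
        uint64_t a = v[i], b = v[j];
        uint64_t x0 = v[k], x1 = v[l]; /* accumulator for the old form */
        uint64_t y0 = v[k], y1 = v[l]; /* accumulator for the new form */
        /* Skip inputs that violate the caller contract (c1 would overflow). */
        uint128_t carry = ((uint128_t)a * b + v[k]) >> 64;
        if (carry + v[l] > 0xFFFFFFFFFFFFFFFFULL) continue;
        muladd_fast_old(a, b, &x0, &x1);
        muladd_fast_new(a, b, &y0, &y1);
        assert(x0 == y0 && x1 == y1);
    }
    printf("old and new muladd_fast agree on all tested inputs\n");
    return 0;
}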
The same carry-propagation pattern is repeated in several other macros in the file and can be simplified the same way; a sketch for one of them follows below.
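For illustration, here is how the analogous rewrite could look for the plain muladd macro, which additionally propagates a carry into c2. This is a sketch of the transformation, assuming the same c0/c1/c2 accumulator contract, and is not tested against the library:

#define muladd(a,b) { \
    uint64_t th; \
    uint128_t t = (uint128_t)a * b + c0;  /* cannot overflow, as above */ \
    th = (uint64_t)(t >> 64);             /* at most 0xFFFFFFFFFFFFFFFF */ \
    c0 = (uint64_t)t; \
    c1 += th;                 /* overflow is handled on the next line */ \
    c2 += (c1 < th) ? 1 : 0;  /* never overflows by contract (verified in the next line) */ \
    VERIFY_CHECK((c1 >= th) || (c2 != 0)); \
}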