I’m a bit concerned this is not well-defined. We can’t guarantee that `chunk` has an alignment that is compatible with the `uint8x16_t` type, and even if it does, whether such a reinterpretation is permitted would be highly architecture-specific (of course, this is already architecture-specific code). Do you know of documentation that specifically permits this?
If not, I’d use this patch:
 0diff --git a/src/crypto/sha256_arm_shani.cpp b/src/crypto/sha256_arm_shani.cpp
 1index c051d87042..a783be9068 100644
 2--- a/src/crypto/sha256_arm_shani.cpp
 3+++ b/src/crypto/sha256_arm_shani.cpp
 4@@ -47,8 +47,6 @@ void Transform(uint32_t* s, const unsigned char* chunk, size_t blocks)
 5     STATE0 = vld1q_u32(&s[0]);
 6     STATE1 = vld1q_u32(&s[4]);
 7 
 8-    const uint8x16_t* input32 = reinterpret_cast<const uint8x16_t*>(chunk);
 9-
10     while (blocks--)
11     {
12         // Save state
13@@ -56,10 +54,14 @@ void Transform(uint32_t* s, const unsigned char* chunk, size_t blocks)
14         CDGH_SAVE = STATE1;
15 
16         // Load and convert input chunk to Big Endian
17-        MSG0 = vreinterpretq_u32_u8(vrev32q_u8(*input32++));
18-        MSG1 = vreinterpretq_u32_u8(vrev32q_u8(*input32++));
19-        MSG2 = vreinterpretq_u32_u8(vrev32q_u8(*input32++));
20-        MSG3 = vreinterpretq_u32_u8(vrev32q_u8(*input32++));
21+        MSG0 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(chunk + 0)));
22+        MSG1 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(chunk + 16)));
23+        MSG2 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(chunk + 32)));
24+        MSG3 = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(chunk + 48)));
25+        chunk += 64;
26 
27         // Original implemenation preloaded message and constant addition which was 1-3% slower.
28         // Now included as first step in quad round code saving one Q Neon register