File Coverage

blake3.c
Criterion Covered Total %
statement 174 177 98.3
branch 61 64 95.3
condition n/a
subroutine n/a
pod n/a
total 235 241 97.5


line stmt bran cond sub pod time code
1             /*
2             * blake3.c - BLAKE3 reference implementation, sequential, no SIMD.
3             *
4             * Implements the BLAKE3 hash function in hash mode with the default
5             * 32-byte (256-bit) output. Streaming via the spec's tree-stack
6             * algorithm: at any moment we hold one in-progress chunk plus up to
7             * BLAKE3_MAX_DEPTH chaining values waiting to be combined with their
8             * right siblings. Per-call memory is O(log N) over the input length.
9             */
10              
11             #include "blake3.h"
12             #include
13              
14             /* ---------------- constants ---------------- */
15              
16             static const uint32_t IV[8] = {
17             0x6A09E667u, 0xBB67AE85u, 0x3C6EF372u, 0xA54FF53Au,
18             0x510E527Fu, 0x9B05688Cu, 0x1F83D9ABu, 0x5BE0CD19u
19             };
20              
21             /* Domain-separation flags. */
22             #define CHUNK_START 0x01u
23             #define CHUNK_END 0x02u
24             #define PARENT 0x04u
25             #define ROOT 0x08u
26             /* KEYED_HASH / DERIVE_KEY_* unused in v0.01 (hash mode only). */
27              
28             /* Message-word permutation applied between rounds (rounds 1..6). */
29             static const uint8_t MSG_PERM[16] = {
30             2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8
31             };
32              
33             /* ---------------- bit/byte helpers ---------------- */
34              
35             #define ROR32(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
36              
37             static uint32_t
38 3168           load32_le(const unsigned char *p)
39             {
40 3168           return (uint32_t)p[0]
41 3168           | ((uint32_t)p[1] << 8)
42 3168           | ((uint32_t)p[2] << 16)
43 3168           | ((uint32_t)p[3] << 24);
44             }
45              
46             static void
47 88           store32_le(unsigned char *p, uint32_t v)
48             {
49 88           p[0] = (unsigned char)( v & 0xFFu);
50 88           p[1] = (unsigned char)((v >> 8) & 0xFFu);
51 88           p[2] = (unsigned char)((v >> 16) & 0xFFu);
52 88           p[3] = (unsigned char)((v >> 24) & 0xFFu);
53 88           }
54              
55             /* ---------------- compression ---------------- */
56              
57             static void
58 11592           g_mix(uint32_t state[16], int a, int b, int c, int d, uint32_t mx, uint32_t my)
59             {
60 11592           state[a] = state[a] + state[b] + mx;
61 11592           state[d] = ROR32(state[d] ^ state[a], 16);
62 11592           state[c] = state[c] + state[d];
63 11592           state[b] = ROR32(state[b] ^ state[c], 12);
64 11592           state[a] = state[a] + state[b] + my;
65 11592           state[d] = ROR32(state[d] ^ state[a], 8);
66 11592           state[c] = state[c] + state[d];
67 11592           state[b] = ROR32(state[b] ^ state[c], 7);
68 11592           }
69              
70             static void
71 1449           round_fn(uint32_t state[16], const uint32_t m[16])
72             {
73             /* Columns. */
74 1449           g_mix(state, 0, 4, 8, 12, m[0], m[1]);
75 1449           g_mix(state, 1, 5, 9, 13, m[2], m[3]);
76 1449           g_mix(state, 2, 6, 10, 14, m[4], m[5]);
77 1449           g_mix(state, 3, 7, 11, 15, m[6], m[7]);
78             /* Diagonals. */
79 1449           g_mix(state, 0, 5, 10, 15, m[ 8], m[ 9]);
80 1449           g_mix(state, 1, 6, 11, 12, m[10], m[11]);
81 1449           g_mix(state, 2, 7, 8, 13, m[12], m[13]);
82 1449           g_mix(state, 3, 4, 9, 14, m[14], m[15]);
83 1449           }
84              
85             static void
86 1242           permute(uint32_t m[16])
87             {
88             uint32_t tmp[16];
89             int i;
90 21114 100         for (i = 0; i < 16; i++) tmp[i] = m[MSG_PERM[i]];
91 21114 100         for (i = 0; i < 16; i++) m[i] = tmp[i];
92 1242           }
93              
94             /* compress: produces full 16-word output. Caller takes the first 8
95             * words for a chaining value, or all 16 for a root-output block. */
96             static void
97 207           compress(const uint32_t chaining_value[8],
98             const uint32_t block_words_in[16],
99             uint64_t counter,
100             uint32_t block_len,
101             uint32_t flags,
102             uint32_t out[16])
103             {
104             uint32_t state[16];
105             uint32_t block_words[16];
106             int i;
107              
108 207           state[ 0] = chaining_value[0];
109 207           state[ 1] = chaining_value[1];
110 207           state[ 2] = chaining_value[2];
111 207           state[ 3] = chaining_value[3];
112 207           state[ 4] = chaining_value[4];
113 207           state[ 5] = chaining_value[5];
114 207           state[ 6] = chaining_value[6];
115 207           state[ 7] = chaining_value[7];
116 207           state[ 8] = IV[0];
117 207           state[ 9] = IV[1];
118 207           state[10] = IV[2];
119 207           state[11] = IV[3];
120 207           state[12] = (uint32_t)(counter & 0xFFFFFFFFull);
121 207           state[13] = (uint32_t)((counter >> 32) & 0xFFFFFFFFull);
122 207           state[14] = block_len;
123 207           state[15] = flags;
124              
125 3519 100         for (i = 0; i < 16; i++) block_words[i] = block_words_in[i];
126              
127 207           round_fn(state, block_words); /* round 1 */
128 207           permute(block_words);
129 207           round_fn(state, block_words); /* round 2 */
130 207           permute(block_words);
131 207           round_fn(state, block_words); /* round 3 */
132 207           permute(block_words);
133 207           round_fn(state, block_words); /* round 4 */
134 207           permute(block_words);
135 207           round_fn(state, block_words); /* round 5 */
136 207           permute(block_words);
137 207           round_fn(state, block_words); /* round 6 */
138 207           permute(block_words);
139 207           round_fn(state, block_words); /* round 7 */
140              
141             /* XOR upper half into lower half, then upper into chaining value
142             * - this is the "feedforward" the spec describes. The full 16-word
143             * output is what the caller can consume. */
144 1863 100         for (i = 0; i < 8; i++) {
145 1656           state[i] ^= state[i + 8];
146 1656           state[i + 8] ^= chaining_value[i];
147             }
148              
149 3519 100         for (i = 0; i < 16; i++) out[i] = state[i];
150 207           }
151              
152             /* Convert a 64-byte block to 16 little-endian words. */
153             static void
154 198           block_to_words(const unsigned char *block, uint32_t out[16])
155             {
156             int i;
157 3366 100         for (i = 0; i < 16; i++) out[i] = load32_le(block + i * 4);
158 198           }
159              
160             /* ---------------- chunk machinery ---------------- */
161              
162             static uint32_t
163 198           chunk_start_flag(const blake3_chunk_state_t *cs)
164             {
165 198 100         return cs->blocks_compressed == 0 ? CHUNK_START : 0;
166             }
167              
168             static void
169 20           chunk_init(blake3_chunk_state_t *cs, const uint32_t key[8], uint64_t counter)
170             {
171             int i;
172 180 100         for (i = 0; i < 8; i++) cs->chaining_value[i] = key[i];
173 20           cs->chunk_counter = counter;
174 20           cs->buffer_len = 0;
175 20           cs->blocks_compressed = 0;
176             /* Zero the buffer so that a partial-or-empty last block is read as
177             * zero-padded message words. Without this an empty input would
178             * read stack garbage from the buffer in chunk_output_root. */
179 20           memset(cs->buffer, 0, BLAKE3_BLOCK_SIZE);
180 20           }
181              
182             static size_t
183 17           chunk_len(const blake3_chunk_state_t *cs)
184             {
185 17           return (size_t)cs->blocks_compressed * BLAKE3_BLOCK_SIZE + cs->buffer_len;
186             }
187              
188             static void
189 178           chunk_compress_buffer(blake3_chunk_state_t *cs, uint32_t flags)
190             {
191             uint32_t block_words[16];
192             uint32_t out[16];
193             int i;
194              
195 178           block_to_words(cs->buffer, block_words);
196 178           compress(cs->chaining_value, block_words, cs->chunk_counter,
197 178           (uint32_t)cs->buffer_len,
198 178           flags | chunk_start_flag(cs),
199             out);
200 1602 100         for (i = 0; i < 8; i++) cs->chaining_value[i] = out[i];
201 178           cs->blocks_compressed++;
202 178           cs->buffer_len = 0;
203 178           memset(cs->buffer, 0, BLAKE3_BLOCK_SIZE);
204 178           }
205              
206             /* Append `len` bytes (len > 0). Caller must ensure chunk_len + len
207             * <= BLAKE3_CHUNK_SIZE. */
208             static void
209 17           chunk_update(blake3_chunk_state_t *cs, const unsigned char *p, size_t len,
210             uint32_t flags)
211             {
212             /* Fill any partial block, compressing only when we know more bytes
213             * (or a chunk boundary) will follow. We never compress the LAST
214             * block of a chunk here - the caller does that with CHUNK_END. */
215 212 100         while (len > 0) {
216             size_t take;
217 195 100         if (cs->buffer_len == BLAKE3_BLOCK_SIZE) {
218 178           chunk_compress_buffer(cs, flags);
219             }
220 195           take = BLAKE3_BLOCK_SIZE - cs->buffer_len;
221 195 100         if (take > len) take = len;
222 195           memcpy(cs->buffer + cs->buffer_len, p, take);
223 195           cs->buffer_len += (uint8_t)take;
224 195           p += take;
225 195           len -= take;
226             }
227 17           }
228              
229             /* Finalise a chunk into an 8-word chaining value (used as a leaf in
230             * the tree). Caller passes `is_root` to add the ROOT flag, in which
231             * case `out16` (16 words) gets the full output for byte extraction. */
232             static void
233 12           chunk_output_cv(blake3_chunk_state_t *cs, uint32_t flags, uint32_t out_cv[8])
234             {
235             uint32_t block_words[16];
236             uint32_t out[16];
237             int i;
238              
239 12           block_to_words(cs->buffer, block_words);
240 12           compress(cs->chaining_value, block_words, cs->chunk_counter,
241 12           (uint32_t)cs->buffer_len,
242 12           flags | chunk_start_flag(cs) | CHUNK_END,
243             out);
244 108 100         for (i = 0; i < 8; i++) out_cv[i] = out[i];
245 12           }
246              
247             /* As above but produces the full 16-word root output. */
248             static void
249 8           chunk_output_root(blake3_chunk_state_t *cs, uint32_t flags,
250             unsigned char out_bytes[BLAKE3_DIGEST_SIZE])
251             {
252             uint32_t block_words[16];
253             uint32_t out[16];
254             int i;
255              
256 8           block_to_words(cs->buffer, block_words);
257 8           compress(cs->chaining_value, block_words, cs->chunk_counter,
258 8           (uint32_t)cs->buffer_len,
259 8           flags | chunk_start_flag(cs) | CHUNK_END | ROOT,
260             out);
261 72 100         for (i = 0; i < 8; i++) store32_le(out_bytes + i * 4, out[i]);
262 8           }
263              
264             /* ---------------- parent machinery ---------------- */
265              
266             /* Combine left+right CVs into a single parent CV. */
267             static void
268 6           parent_cv(const uint32_t left[8], const uint32_t right[8],
269             const uint32_t key[8], uint32_t flags, uint32_t out_cv[8])
270             {
271             uint32_t block_words[16];
272             uint32_t out[16];
273             int i;
274 54 100         for (i = 0; i < 8; i++) block_words[i] = left[i];
275 54 100         for (i = 0; i < 8; i++) block_words[i + 8] = right[i];
276              
277 6           compress(key, block_words, 0, BLAKE3_BLOCK_SIZE,
278             flags | PARENT, out);
279 54 100         for (i = 0; i < 8; i++) out_cv[i] = out[i];
280 6           }
281              
282             /* As above, root version producing the 32-byte output. */
283             static void
284 3           parent_output_root(const uint32_t left[8], const uint32_t right[8],
285             const uint32_t key[8], uint32_t flags,
286             unsigned char out_bytes[BLAKE3_DIGEST_SIZE])
287             {
288             uint32_t block_words[16];
289             uint32_t out[16];
290             int i;
291 27 100         for (i = 0; i < 8; i++) block_words[i] = left[i];
292 27 100         for (i = 0; i < 8; i++) block_words[i + 8] = right[i];
293              
294 3           compress(key, block_words, 0, BLAKE3_BLOCK_SIZE,
295             flags | PARENT | ROOT, out);
296 27 100         for (i = 0; i < 8; i++) store32_le(out_bytes + i * 4, out[i]);
297 3           }
298              
299             /* ---------------- public API ---------------- */
300              
301             void
302 11           blake3_init(blake3_ctx_t *ctx)
303             {
304             int i;
305 99 100         for (i = 0; i < 8; i++) ctx->key_words[i] = IV[i];
306 11           ctx->flags = 0; /* hash mode */
307 11           ctx->cv_stack_len = 0;
308 11           chunk_init(&ctx->chunk, ctx->key_words, 0);
309 11           }
310              
311             /* Push a CV onto the stack and collapse pairs of equal-level CVs into
312             * parents. The trick: every time we add chunk N+1 to the tree, we
313             * collapse pairs starting at the level matching the count of trailing
314             * zeros in N+1. We track the chunk index implicitly via the stack
315             * length; the "merge if total chunks is even at this level" check is
316             * encoded by counting trailing zeros in the post-add chunk count. */
317             static void
318 9           push_chunk_cv(blake3_ctx_t *ctx, uint32_t new_cv[8], uint64_t total_chunks)
319             {
320             /* `total_chunks` is the number of chunks ALREADY added (so the new
321             * one being pushed is chunk index `total_chunks`, and after push
322             * we'll have total_chunks+1 chunks total). The standard collapses
323             * stack pairs while the count of completed chunks is even at the
324             * relevant level - equivalently while the LSB-going-up bits of
325             * total_chunks (post-increment) match. */
326 9           uint64_t post = total_chunks; /* chunks completed so far before push */
327             uint32_t cv[8];
328             int i;
329 81 100         for (i = 0; i < 8; i++) cv[i] = new_cv[i];
330              
331             /* Merge while the bit at the current level of post is set; that
332             * indicates an unpaired CV at that level. */
333 15 100         while (post & 1) {
334             uint32_t left[8];
335 6           ctx->cv_stack_len--;
336 54 100         for (i = 0; i < 8; i++) left[i] = ctx->cv_stack[ctx->cv_stack_len][i];
337 6           parent_cv(left, cv, ctx->key_words, ctx->flags, cv);
338 6           post >>= 1;
339             }
340 81 100         for (i = 0; i < 8; i++) ctx->cv_stack[ctx->cv_stack_len][i] = cv[i];
341 9           ctx->cv_stack_len++;
342 9           }
343              
344             void
345 8           blake3_update(blake3_ctx_t *ctx, const void *data, size_t len)
346             {
347 8           const unsigned char *p = (const unsigned char *)data;
348              
349 25 100         while (len > 0) {
350             size_t want;
351 17           size_t cur = chunk_len(&ctx->chunk);
352              
353             /* If the current chunk is full, finalise it as a leaf and
354             * rotate to a fresh chunk. */
355 17 100         if (cur == BLAKE3_CHUNK_SIZE) {
356             uint32_t leaf_cv[8];
357 9           uint64_t this_idx = ctx->chunk.chunk_counter;
358 9           chunk_output_cv(&ctx->chunk, ctx->flags, leaf_cv);
359 9           push_chunk_cv(ctx, leaf_cv, this_idx);
360 9           chunk_init(&ctx->chunk, ctx->key_words, this_idx + 1);
361 9           cur = 0;
362             }
363              
364 17           want = BLAKE3_CHUNK_SIZE - cur;
365 17 100         if (want > len) want = len;
366 17           chunk_update(&ctx->chunk, p, want, ctx->flags);
367 17           p += want;
368 17           len -= want;
369             }
370 8           }
371              
372             void
373 11           blake3_final(blake3_ctx_t *ctx, unsigned char out[BLAKE3_DIGEST_SIZE])
374             {
375             int i;
376              
377             /* If we never compressed any leaf into the stack, this is a
378             * single-chunk hash: the chunk itself is the root. */
379 11 100         if (ctx->cv_stack_len == 0) {
380 8           chunk_output_root(&ctx->chunk, ctx->flags, out);
381 8           return;
382             }
383              
384             /* Otherwise we collapse the stack from the bottom up. The chunk
385             * we hold becomes the rightmost leaf at the bottom level; combine
386             * it with whatever sits at the top of the stack until we reach the
387             * single root parent that needs ROOT-flag output. */
388             {
389             uint32_t cv[8];
390 3           chunk_output_cv(&ctx->chunk, ctx->flags, cv);
391              
392             /* Walk the stack: every entry must be combined as the LEFT
393             * sibling of `cv`. When only one stack entry remains, the
394             * combination is the ROOT and produces output bytes. */
395 3 50         while (ctx->cv_stack_len > 1) {
396             uint32_t left[8];
397 0           ctx->cv_stack_len--;
398 0 0         for (i = 0; i < 8; i++) left[i] = ctx->cv_stack[ctx->cv_stack_len][i];
399 0           parent_cv(left, cv, ctx->key_words, ctx->flags, cv);
400             }
401             /* Final root parent. */
402             {
403             uint32_t left[8];
404 3           ctx->cv_stack_len--;
405 27 100         for (i = 0; i < 8; i++) left[i] = ctx->cv_stack[ctx->cv_stack_len][i];
406 3           parent_output_root(left, cv, ctx->key_words, ctx->flags, out);
407             }
408             }
409             }