File Coverage

src/jenkins_hash.c
Criterion      Covered    Total        %
statement           28       97     28.8
branch               3       10     30.0
condition          n/a
subroutine         n/a
pod                n/a
total               31      107     28.9


line stmt bran cond sub pod time code
1             /*
2             -------------------------------------------------------------------------------
3             lookup3.c, by Bob Jenkins, May 2006, Public Domain.
4              
5             These are functions for producing 32-bit hashes for hash table lookup.
6             hashword(), hashlittle(), hashlittle2(), hashbig(), mix(), and final()
7             are externally useful functions. Routines to test the hash are included
8             if SELF_TEST is defined. You can use this free for any purpose. It's in
9             the public domain. It has no warranty.
10              
11             You probably want to use hashlittle(). hashlittle() and hashbig()
12             hash byte arrays. hashlittle() is faster than hashbig() on
13             little-endian machines. Intel and AMD are little-endian machines.
14             On second thought, you probably want hashlittle2(), which is identical to
15             hashlittle() except it returns two 32-bit hashes for the price of one.
16             You could implement hashbig2() if you wanted but I haven't bothered here.
17              
18             If you want to find a hash of, say, exactly 7 integers, do
19             a = i1; b = i2; c = i3;
20             mix(a,b,c);
21             a += i4; b += i5; c += i6;
22             mix(a,b,c);
23             a += i7;
24             final(a,b,c);
25             then use c as the hash value. If you have a variable length array of
26             4-byte integers to hash, use hashword(). If you have a byte array (like
27             a character string), use hashlittle(). If you have several byte arrays, or
28             a mix of things, see the comments above hashlittle().
29              
30             Why is this so big? I read 12 bytes at a time into 3 4-byte integers,
31             then mix those integers. This is fast (you can do a lot more thorough
32             mixing with 12*3 instructions on 3 integers than you can with 3 instructions
33             on 1 byte), but shoehorning those bytes into integers efficiently is messy.
34             -------------------------------------------------------------------------------
35             */
36              
37             #ifdef linux
38             #include <endian.h>    /* attempt to define endianness */
39             #include <sys/param.h> /* attempt to define endianness */
40             #endif
41              
42             /*
43             * My best guess at if you are big-endian or little-endian. This may
44             * need adjustment.
45             */
46             #if (defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && \
47             __BYTE_ORDER == __LITTLE_ENDIAN) || \
48             (defined(i386) || defined(__i386__) || defined(__i486__) || \
49             defined(__i586__) || defined(__i686__) || defined(vax) || defined(MIPSEL))
50             # define HASH_LITTLE_ENDIAN 1
51             # define HASH_BIG_ENDIAN 0
52             #elif (defined(__BYTE_ORDER) && defined(__BIG_ENDIAN) && \
53             __BYTE_ORDER == __BIG_ENDIAN) || \
54             (defined(sparc) || defined(POWERPC) || defined(mc68000) || defined(sel))
55             # define HASH_LITTLE_ENDIAN 0
56             # define HASH_BIG_ENDIAN 1
57             #else
58             # define HASH_LITTLE_ENDIAN 0
59             # define HASH_BIG_ENDIAN 0
60             #endif
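
The block above is only a compile-time guess, and the comment notes it "may need adjustment." A small runtime probe (a sketch, not part of the original file) can confirm whether HASH_LITTLE_ENDIAN matches the machine the code actually runs on:

    #include <stdint.h>

    /* Sketch: returns 1 if the running machine stores the low-order byte
     * of a 32-bit word first, i.e. it is little-endian.  Compare the
     * result against HASH_LITTLE_ENDIAN at startup or in a unit test. */
    static int runtime_is_little_endian(void)
    {
        const uint32_t probe = 1u;
        return *(const uint8_t *)&probe == 1;
    }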
61              
62             #define hashsize(n) ((uint32_t)1<<(n))
63             #define hashmask(n) (hashsize(n)-1)
64             #define rot(x,k) (((x)<<(k)) | ((x)>>(32-(k))))
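
hashsize() and hashmask() implement the power-of-two table sizing recommended later in the hashlittle() comment, and rot() is a 32-bit left rotate. A minimal sketch of their use (the function name and the 10-bit table size are illustrative only; assumes <stdint.h> and the macros above):

    /* Sketch: reduce a 32-bit hash to a bucket index for a table of
     * hashsize(10) == 1024 slots, using the macros defined above. */
    static uint32_t bucket_for(uint32_t h)
    {
        return h & hashmask(10);   /* yields a value in 0 .. 1023 */
    }

    /* rot() example: rot(0x80000001u, 1) == 0x00000003u. */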
65              
66             /*
67             -------------------------------------------------------------------------------
68             mix -- mix 3 32-bit values reversibly.
69              
70             This is reversible, so any information in (a,b,c) before mix() is
71             still in (a,b,c) after mix().
72              
73             If four pairs of (a,b,c) inputs are run through mix(), or through
74             mix() in reverse, there are at least 32 bits of the output that
75             are sometimes the same for one pair and different for another pair.
76             This was tested for:
77             * pairs that differed by one bit, by two bits, in any combination
78             of top bits of (a,b,c), or in any combination of bottom bits of
79             (a,b,c).
80             * "differ" is defined as +, -, ^, or ~^. For + and -, I transformed
81             the output delta to a Gray code (a^(a>>1)) so a string of 1's (as
82             is commonly produced by subtraction) looks like a single 1-bit
83             difference.
84             * the base values were pseudorandom, all zero but one bit set, or
85             all zero plus a counter that starts at zero.
86              
87             Some k values for my "a-=c; a^=rot(c,k); c+=b;" arrangement that
88             satisfy this are
89             4 6 8 16 19 4
90             9 15 3 18 27 15
91             14 9 3 7 17 3
92             Well, "9 15 3 18 27 15" didn't quite get 32 bits diffing
93             for "differ" defined as + with a one-bit base and a two-bit delta. I
94             used http://burtleburtle.net/bob/hash/avalanche.html to choose
95             the operations, constants, and arrangements of the variables.
96              
97             This does not achieve avalanche. There are input bits of (a,b,c)
98             that fail to affect some output bits of (a,b,c), especially of a. The
99             most thoroughly mixed value is c, but it doesn't really even achieve
100             avalanche in c.
101              
102             This allows some parallelism. Read-after-writes are good at doubling
103             the number of bits affected, so the goal of mixing pulls in the opposite
104             direction as the goal of parallelism. I did what I could. Rotates
105             seem to cost as much as shifts on every machine I could lay my hands
106             on, and rotates are much kinder to the top and bottom bits, so I used
107             rotates.
108             -------------------------------------------------------------------------------
109             */
110             #define mix(a,b,c) \
111             { \
112             a -= c; a ^= rot(c, 4); c += b; \
113             b -= a; b ^= rot(a, 6); a += c; \
114             c -= b; c ^= rot(b, 8); b += a; \
115             a -= c; a ^= rot(c,16); c += b; \
116             b -= a; b ^= rot(a,19); a += c; \
117             c -= b; c ^= rot(b, 4); b += a; \
118             }
119              
120             /*
121             -------------------------------------------------------------------------------
122             final -- final mixing of 3 32-bit values (a,b,c) into c
123              
124             Pairs of (a,b,c) values differing in only a few bits will usually
125             produce values of c that look totally different. This was tested for
126             * pairs that differed by one bit, by two bits, in any combination
127             of top bits of (a,b,c), or in any combination of bottom bits of
128             (a,b,c).
129             * "differ" is defined as +, -, ^, or ~^. For + and -, I transformed
130             the output delta to a Gray code (a^(a>>1)) so a string of 1's (as
131             is commonly produced by subtraction) looks like a single 1-bit
132             difference.
133             * the base values were pseudorandom, all zero but one bit set, or
134             all zero plus a counter that starts at zero.
135              
136             These constants passed:
137             14 11 25 16 4 14 24
138             12 14 25 16 4 14 24
139             and these came close:
140             4 8 15 26 3 22 24
141             10 8 15 26 3 22 24
142             11 8 15 26 3 22 24
143             -------------------------------------------------------------------------------
144             */
145             #define final(a,b,c) \
146             { \
147             c ^= b; c -= rot(b,14); \
148             a ^= c; a -= rot(c,11); \
149             b ^= a; b -= rot(a,25); \
150             c ^= b; c -= rot(b,16); \
151             a ^= c; a -= rot(c,4); \
152             b ^= a; b -= rot(a,14); \
153             c ^= b; c -= rot(b,24); \
154             }
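
The header comment's recipe for hashing exactly seven 32-bit integers, spelled out as a function. This is a sketch that relies on the rot/mix/final macros above and <stdint.h>; the name hash7 is invented here:

    /* Hash exactly seven 32-bit integers, per the recipe in the
     * header comment; c is the resulting hash value. */
    static uint32_t hash7(uint32_t i1, uint32_t i2, uint32_t i3,
                          uint32_t i4, uint32_t i5, uint32_t i6,
                          uint32_t i7)
    {
        uint32_t a = i1, b = i2, c = i3;
        mix(a, b, c);
        a += i4; b += i5; c += i6;
        mix(a, b, c);
        a += i7;
        final(a, b, c);
        return c;    /* "then use c as the hash value" */
    }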
155              
156              
157             /*
158             -------------------------------------------------------------------------------
159             hashlittle() -- hash a variable-length key into a 32-bit value
160             k : the key (the unaligned variable-length array of bytes)
161             length : the length of the key, counting by bytes
162             initval : can be any 4-byte value
163             Returns a 32-bit value. Every bit of the key affects every bit of
164             the return value. Two keys differing by one or two bits will have
165             totally different hash values.
166              
167             The best hash table sizes are powers of 2. There is no need to do
168             mod a prime (mod is sooo slow!). If you need less than 32 bits,
169             use a bitmask. For example, if you need only 10 bits, do
170             h = (h & hashmask(10));
171             In which case, the hash table should have hashsize(10) elements.
172              
173             If you are hashing n strings (uint8_t **)k, do it like this:
174             for (i=0, h=0; i<n; ++i) h = hashlittle( k[i], len[i], h);
175              
176             By Bob Jenkins, 2006. bob_jenkins@burtleburtle.net. You may use this
177             code any way you wish, private, educational, or commercial. It's free.
178              
179             Use for hash table lookup, or anything where one collision in 2^^32 is
180             acceptable. Do NOT use for cryptographic purposes.
181             -------------------------------------------------------------------------------
182             */
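
Putting the advice in this comment together: hash one C string and mask the result down to 10 bits for a hashsize(10)-slot table, and chain hashlittle() over several strings as in the loop above. A sketch, assuming the hashmask() macro defined earlier and the hashlittle() signature that follows; bucket_of is a name invented here:

    #include <stdint.h>
    #include <string.h>

    uint32_t hashlittle(const void *key, size_t length, uint32_t initval);

    /* Sketch: bucket index for one NUL-terminated string. */
    static uint32_t bucket_of(const char *s)
    {
        uint32_t h = hashlittle(s, strlen(s), 0 /* any 4-byte seed */);
        return h & hashmask(10);          /* table of hashsize(10) slots */
    }

    /* Sketch: chain over n strings, exactly as in the comment above:
     *   for (i = 0, h = 0; i < n; ++i)
     *       h = hashlittle(k[i], len[i], h);
     */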
183              
184 188           uint32_t hashlittle( const void *key, size_t length, uint32_t initval)
185             {
186             uint32_t a,b,c; /* internal state */
187             union { const void *ptr; size_t i; } u; /* needed for Mac Powerbook G4 */
188              
189             /* Set up the internal state */
190 188           a = b = c = 0xdeadbeef + ((uint32_t)length) + initval;
191              
192 188           u.ptr = key;
193 188 50         if (HASH_LITTLE_ENDIAN && ((u.i & 0x3) == 0)) {
194 188           const uint32_t *k = (const uint32_t *)key; /* read 32-bit chunks */
195             const uint8_t *k8;
196              
197             /*------ all but last block: aligned reads and affect 32 bits of (a,b,c) */
198 1226 100         while (length > 12)
199             {
200 1038           a += k[0];
201 1038           b += k[1];
202 1038           c += k[2];
203 1038           mix(a,b,c);
204 1038           length -= 12;
205 1038           k += 3;
206             }
207              
208             /*----------------------------- handle the last (probably partial) block */
209             /*
210             * "k[2]&0xffffff" actually reads beyond the end of the string, but
211             * then masks off the part it's not allowed to read. Because the
212             * string is aligned, the masked-off tail is in the same word as the
213             * rest of the string. Every machine with memory protection I've seen
214             * does it on word boundaries, so is OK with this. But VALGRIND will
215             * still catch it and complain. The masking trick does make the hash
216             * noticeably faster for short strings (like English words).
217             */
218             #ifndef VALGRIND
219              
220 188           switch(length)
221             {
222 9           case 12: c+=k[2]; b+=k[1]; a+=k[0]; break;
223 16           case 11: c+=k[2]&0xffffff; b+=k[1]; a+=k[0]; break;
224 14           case 10: c+=k[2]&0xffff; b+=k[1]; a+=k[0]; break;
225 19           case 9 : c+=k[2]&0xff; b+=k[1]; a+=k[0]; break;
226 14           case 8 : b+=k[1]; a+=k[0]; break;
227 16           case 7 : b+=k[1]&0xffffff; a+=k[0]; break;
228 18           case 6 : b+=k[1]&0xffff; a+=k[0]; break;
229 20           case 5 : b+=k[1]&0xff; a+=k[0]; break;
230 23           case 4 : a+=k[0]; break;
231 13           case 3 : a+=k[0]&0xffffff; break;
232 16           case 2 : a+=k[0]&0xffff; break;
233 10           case 1 : a+=k[0]&0xff; break;
234 188           case 0 : return c; /* zero length strings require no mixing */
235             }
236              
237             #else /* make valgrind happy */
238              
239             k8 = (const uint8_t *)k;
240             switch(length)
241             {
242             case 12: c+=k[2]; b+=k[1]; a+=k[0]; break;
243             case 11: c+=((uint32_t)k8[10])<<16; /* fall through */
244             case 10: c+=((uint32_t)k8[9])<<8; /* fall through */
245             case 9 : c+=k8[8]; /* fall through */
246             case 8 : b+=k[1]; a+=k[0]; break;
247             case 7 : b+=((uint32_t)k8[6])<<16; /* fall through */
248             case 6 : b+=((uint32_t)k8[5])<<8; /* fall through */
249             case 5 : b+=k8[4]; /* fall through */
250             case 4 : a+=k[0]; break;
251             case 3 : a+=((uint32_t)k8[2])<<16; /* fall through */
252             case 2 : a+=((uint32_t)k8[1])<<8; /* fall through */
253             case 1 : a+=k8[0]; break;
254             case 0 : return c;
255             }
256              
257             #endif /* !valgrind */
258              
259 0 0         } else if (HASH_LITTLE_ENDIAN && ((u.i & 0x1) == 0)) {
260 0           const uint16_t *k = (const uint16_t *)key; /* read 16-bit chunks */
261             const uint8_t *k8;
262              
263             /*--------------- all but last block: aligned reads and different mixing */
264 0 0         while (length > 12)
265             {
266 0           a += k[0] + (((uint32_t)k[1])<<16);
267 0           b += k[2] + (((uint32_t)k[3])<<16);
268 0           c += k[4] + (((uint32_t)k[5])<<16);
269 0           mix(a,b,c);
270 0           length -= 12;
271 0           k += 6;
272             }
273              
274             /*----------------------------- handle the last (probably partial) block */
275 0           k8 = (const uint8_t *)k;
276 0           switch(length)
277             {
278 0           case 12: c+=k[4]+(((uint32_t)k[5])<<16);
279 0           b+=k[2]+(((uint32_t)k[3])<<16);
280 0           a+=k[0]+(((uint32_t)k[1])<<16);
281 0           break;
282 0           case 11: c+=((uint32_t)k8[10])<<16; /* fall through */
283 0           case 10: c+=k[4];
284 0           b+=k[2]+(((uint32_t)k[3])<<16);
285 0           a+=k[0]+(((uint32_t)k[1])<<16);
286 0           break;
287 0           case 9 : c+=k8[8]; /* fall through */
288 0           case 8 : b+=k[2]+(((uint32_t)k[3])<<16);
289 0           a+=k[0]+(((uint32_t)k[1])<<16);
290 0           break;
291 0           case 7 : b+=((uint32_t)k8[6])<<16; /* fall through */
292 0           case 6 : b+=k[2];
293 0           a+=k[0]+(((uint32_t)k[1])<<16);
294 0           break;
295 0           case 5 : b+=k8[4]; /* fall through */
296 0           case 4 : a+=k[0]+(((uint32_t)k[1])<<16);
297 0           break;
298 0           case 3 : a+=((uint32_t)k8[2])<<16; /* fall through */
299 0           case 2 : a+=k[0];
300 0           break;
301 0           case 1 : a+=k8[0];
302 0           break;
303 0           case 0 : return c; /* zero length requires no mixing */
304             }
305              
306             } else { /* need to read the key one byte at a time */
307 0           const uint8_t *k = (const uint8_t *)key;
308              
309             /*--------------- all but the last block: affect some 32 bits of (a,b,c) */
310 0 0         while (length > 12)
311             {
312 0           a += k[0];
313 0           a += ((uint32_t)k[1])<<8;
314 0           a += ((uint32_t)k[2])<<16;
315 0           a += ((uint32_t)k[3])<<24;
316 0           b += k[4];
317 0           b += ((uint32_t)k[5])<<8;
318 0           b += ((uint32_t)k[6])<<16;
319 0           b += ((uint32_t)k[7])<<24;
320 0           c += k[8];
321 0           c += ((uint32_t)k[9])<<8;
322 0           c += ((uint32_t)k[10])<<16;
323 0           c += ((uint32_t)k[11])<<24;
324 0           mix(a,b,c);
325 0           length -= 12;
326 0           k += 12;
327             }
328              
329             /*-------------------------------- last block: affect all 32 bits of (c) */
330 0           switch(length) /* all the case statements fall through */
331             {
332 0           case 12: c+=((uint32_t)k[11])<<24;
333 0           case 11: c+=((uint32_t)k[10])<<16;
334 0           case 10: c+=((uint32_t)k[9])<<8;
335 0           case 9 : c+=k[8];
336 0           case 8 : b+=((uint32_t)k[7])<<24;
337 0           case 7 : b+=((uint32_t)k[6])<<16;
338 0           case 6 : b+=((uint32_t)k[5])<<8;
339 0           case 5 : b+=k[4];
340 0           case 4 : a+=((uint32_t)k[3])<<24;
341 0           case 3 : a+=((uint32_t)k[2])<<16;
342 0           case 2 : a+=((uint32_t)k[1])<<8;
343 0           case 1 : a+=k[0];
344 0           break;
345 0           case 0 : return c;
346             }
347             }
348              
349 188           final(a,b,c);
350 188           return c;
351             }
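
The hit counts above show that only the 4-byte-aligned little-endian path was ever exercised; the 16-bit-aligned and byte-at-a-time branches have zero hits. A hedged sketch of test calls that would reach them (the buffer and offsets are invented for illustration; assumes hashlittle() as defined above and <stdint.h>):

    /* Sketch: force the unaligned code paths by hashing from offset
     * pointers inside a 4-byte-aligned buffer. */
    static void touch_unaligned_paths(void)
    {
        static const uint32_t words[8] = { 0 };   /* 4-byte aligned storage */
        const char *p = (const char *)words;

        (void)hashlittle(p + 2, 10, 0);   /* 2-byte aligned: 16-bit chunk path */
        (void)hashlittle(p + 1, 10, 0);   /* odd address: byte-at-a-time path  */
    }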