mirror of
https://gitlab.gnome.org/GNOME/glib.git
synced 2025-01-04 03:46:18 +01:00
298 lines
9.5 KiB
C
298 lines
9.5 KiB
C
#include "jenkins_hash.h"
|
|
#include <stdlib.h>
|
|
#ifdef WIN32
|
|
#define _USE_MATH_DEFINES //For M_LOG2E
|
|
#endif
|
|
#include <math.h>
|
|
#include <limits.h>
|
|
#include <string.h>
|
|
|
|
//#define DEBUG
|
|
#include "debug.h"
|
|
|
|
#define hashsize(n) ((cmph_uint32)1<<(n))
|
|
#define hashmask(n) (hashsize(n)-1)
|
|
|
|
|
|
|
|
//#define NM2 /* Define this if you do not want power of 2 table sizes*/
|
|
|
|
|
|
/*
|
|
--------------------------------------------------------------------
|
|
mix -- mix 3 32-bit values reversibly.
|
|
For every delta with one or two bits set, and the deltas of all three
|
|
high bits or all three low bits, whether the original value of a,b,c
|
|
is almost all zero or is uniformly distributed,
|
|
* If mix() is run forward or backward, at least 32 bits in a,b,c
|
|
have at least 1/4 probability of changing.
|
|
* If mix() is run forward, every bit of c will change between 1/3 and
|
|
2/3 of the time. (Well, 22/100 and 78/100 for some 2-bit deltas.)
|
|
mix() was built out of 36 single-cycle latency instructions in a
|
|
structure that could supported 2x parallelism, like so:
|
|
a -= b;
|
|
a -= c; x = (c>>13);
|
|
b -= c; a ^= x;
|
|
b -= a; x = (a<<8);
|
|
c -= a; b ^= x;
|
|
c -= b; x = (b>>13);
|
|
...
|
|
Unfortunately, superscalar Pentiums and Sparcs can't take advantage
|
|
of that parallelism. They've also turned some of those single-cycle
|
|
latency instructions into multi-cycle latency instructions. Still,
|
|
this is the fastest good hash I could find. There were about 2^^68
|
|
to choose from. I only looked at a billion or so.
|
|
--------------------------------------------------------------------
|
|
*/
|
|
#define mix(a,b,c) \
|
|
{ \
|
|
a -= b; a -= c; a ^= (c>>13); \
|
|
b -= c; b -= a; b ^= (a<<8); \
|
|
c -= a; c -= b; c ^= (b>>13); \
|
|
a -= b; a -= c; a ^= (c>>12); \
|
|
b -= c; b -= a; b ^= (a<<16); \
|
|
c -= a; c -= b; c ^= (b>>5); \
|
|
a -= b; a -= c; a ^= (c>>3); \
|
|
b -= c; b -= a; b ^= (a<<10); \
|
|
c -= a; c -= b; c ^= (b>>15); \
|
|
}
|
|
|
|
/*
|
|
--------------------------------------------------------------------
|
|
hash() -- hash a variable-length key into a 32-bit value
|
|
k : the key (the unaligned variable-length array of bytes)
|
|
len : the length of the key, counting by bytes
|
|
initval : can be any 4-byte value
|
|
Returns a 32-bit value. Every bit of the key affects every bit of
|
|
the return value. Every 1-bit and 2-bit delta achieves avalanche.
|
|
About 6*len+35 instructions.
|
|
|
|
The best hash table sizes are powers of 2. There is no need to do
|
|
mod a prime (mod is sooo slow!). If you need less than 32 bits,
|
|
use a bitmask. For example, if you need only 10 bits, do
|
|
h = (h & hashmask(10));
|
|
In which case, the hash table should have hashsize(10) elements.
|
|
|
|
If you are hashing n strings (cmph_uint8 **)k, do it like this:
|
|
for (i=0, h=0; i<n; ++i) h = hash( k[i], len[i], h);
|
|
|
|
By Bob Jenkins, 1996. bob_jenkins@burtleburtle.net. You may use this
|
|
code any way you wish, private, educational, or commercial. It's free.
|
|
|
|
See http://burtleburtle.net/bob/hash/evahash.html
|
|
Use for hash table lookup, or anything where one collision in 2^^32 is
|
|
acceptable. Do NOT use for cryptographic purposes.
|
|
--------------------------------------------------------------------
|
|
*/
|
|
jenkins_state_t *jenkins_state_new(cmph_uint32 size) //size of hash table
|
|
{
|
|
jenkins_state_t *state = (jenkins_state_t *)malloc(sizeof(jenkins_state_t));
|
|
DEBUGP("Initializing jenkins hash\n");
|
|
state->seed = ((cmph_uint32)rand() % size);
|
|
return state;
|
|
}
|
|
void jenkins_state_destroy(jenkins_state_t *state)
|
|
{
|
|
free(state);
|
|
}
|
|
|
|
|
|
static inline void __jenkins_hash_vector(cmph_uint32 seed, const char *k, cmph_uint32 keylen, cmph_uint32 * hashes)
|
|
{
|
|
register cmph_uint32 len, length;
|
|
|
|
/* Set up the internal state */
|
|
length = keylen;
|
|
len = length;
|
|
hashes[0] = hashes[1] = 0x9e3779b9; /* the golden ratio; an arbitrary value */
|
|
hashes[2] = seed; /* the previous hash value - seed in our case */
|
|
|
|
/*---------------------------------------- handle most of the key */
|
|
while (len >= 12)
|
|
{
|
|
hashes[0] += ((cmph_uint32)k[0] +((cmph_uint32)k[1]<<8) +((cmph_uint32)k[2]<<16) +((cmph_uint32)k[3]<<24));
|
|
hashes[1] += ((cmph_uint32)k[4] +((cmph_uint32)k[5]<<8) +((cmph_uint32)k[6]<<16) +((cmph_uint32)k[7]<<24));
|
|
hashes[2] += ((cmph_uint32)k[8] +((cmph_uint32)k[9]<<8) +((cmph_uint32)k[10]<<16)+((cmph_uint32)k[11]<<24));
|
|
mix(hashes[0],hashes[1],hashes[2]);
|
|
k += 12; len -= 12;
|
|
}
|
|
|
|
/*------------------------------------- handle the last 11 bytes */
|
|
hashes[2] += length;
|
|
switch(len) /* all the case statements fall through */
|
|
{
|
|
case 11:
|
|
hashes[2] +=((cmph_uint32)k[10]<<24);
|
|
case 10:
|
|
hashes[2] +=((cmph_uint32)k[9]<<16);
|
|
case 9 :
|
|
hashes[2] +=((cmph_uint32)k[8]<<8);
|
|
/* the first byte of hashes[2] is reserved for the length */
|
|
case 8 :
|
|
hashes[1] +=((cmph_uint32)k[7]<<24);
|
|
case 7 :
|
|
hashes[1] +=((cmph_uint32)k[6]<<16);
|
|
case 6 :
|
|
hashes[1] +=((cmph_uint32)k[5]<<8);
|
|
case 5 :
|
|
hashes[1] +=(cmph_uint8) k[4];
|
|
case 4 :
|
|
hashes[0] +=((cmph_uint32)k[3]<<24);
|
|
case 3 :
|
|
hashes[0] +=((cmph_uint32)k[2]<<16);
|
|
case 2 :
|
|
hashes[0] +=((cmph_uint32)k[1]<<8);
|
|
case 1 :
|
|
hashes[0] +=(cmph_uint8)k[0];
|
|
/* case 0: nothing left to add */
|
|
}
|
|
|
|
mix(hashes[0],hashes[1],hashes[2]);
|
|
}
|
|
|
|
cmph_uint32 jenkins_hash(jenkins_state_t *state, const char *k, cmph_uint32 keylen)
|
|
{
|
|
cmph_uint32 hashes[3];
|
|
__jenkins_hash_vector(state->seed, k, keylen, hashes);
|
|
return hashes[2];
|
|
/* cmph_uint32 a, b, c;
|
|
cmph_uint32 len, length;
|
|
|
|
// Set up the internal state
|
|
length = keylen;
|
|
len = length;
|
|
a = b = 0x9e3779b9; // the golden ratio; an arbitrary value
|
|
c = state->seed; // the previous hash value - seed in our case
|
|
|
|
// handle most of the key
|
|
while (len >= 12)
|
|
{
|
|
a += (k[0] +((cmph_uint32)k[1]<<8) +((cmph_uint32)k[2]<<16) +((cmph_uint32)k[3]<<24));
|
|
b += (k[4] +((cmph_uint32)k[5]<<8) +((cmph_uint32)k[6]<<16) +((cmph_uint32)k[7]<<24));
|
|
c += (k[8] +((cmph_uint32)k[9]<<8) +((cmph_uint32)k[10]<<16)+((cmph_uint32)k[11]<<24));
|
|
mix(a,b,c);
|
|
k += 12; len -= 12;
|
|
}
|
|
|
|
// handle the last 11 bytes
|
|
c += length;
|
|
switch(len) /// all the case statements fall through
|
|
{
|
|
case 11:
|
|
c +=((cmph_uint32)k[10]<<24);
|
|
case 10:
|
|
c +=((cmph_uint32)k[9]<<16);
|
|
case 9 :
|
|
c +=((cmph_uint32)k[8]<<8);
|
|
// the first byte of c is reserved for the length
|
|
case 8 :
|
|
b +=((cmph_uint32)k[7]<<24);
|
|
case 7 :
|
|
b +=((cmph_uint32)k[6]<<16);
|
|
case 6 :
|
|
b +=((cmph_uint32)k[5]<<8);
|
|
case 5 :
|
|
b +=k[4];
|
|
case 4 :
|
|
a +=((cmph_uint32)k[3]<<24);
|
|
case 3 :
|
|
a +=((cmph_uint32)k[2]<<16);
|
|
case 2 :
|
|
a +=((cmph_uint32)k[1]<<8);
|
|
case 1 :
|
|
a +=k[0];
|
|
// case 0: nothing left to add
|
|
}
|
|
|
|
mix(a,b,c);
|
|
|
|
/// report the result
|
|
|
|
return c;
|
|
*/
|
|
}
|
|
|
|
void jenkins_hash_vector_(jenkins_state_t *state, const char *k, cmph_uint32 keylen, cmph_uint32 * hashes)
|
|
{
|
|
__jenkins_hash_vector(state->seed, k, keylen, hashes);
|
|
}
|
|
|
|
void jenkins_state_dump(jenkins_state_t *state, char **buf, cmph_uint32 *buflen)
|
|
{
|
|
*buflen = sizeof(cmph_uint32);
|
|
*buf = (char *)malloc(sizeof(cmph_uint32));
|
|
if (!*buf)
|
|
{
|
|
*buflen = UINT_MAX;
|
|
return;
|
|
}
|
|
memcpy(*buf, &(state->seed), sizeof(cmph_uint32));
|
|
DEBUGP("Dumped jenkins state with seed %u\n", state->seed);
|
|
return;
|
|
}
|
|
|
|
jenkins_state_t *jenkins_state_copy(jenkins_state_t *src_state)
|
|
{
|
|
jenkins_state_t *dest_state = (jenkins_state_t *)malloc(sizeof(jenkins_state_t));
|
|
dest_state->hashfunc = src_state->hashfunc;
|
|
dest_state->seed = src_state->seed;
|
|
return dest_state;
|
|
}
|
|
|
|
jenkins_state_t *jenkins_state_load(const char *buf, cmph_uint32 buflen)
|
|
{
|
|
jenkins_state_t *state = (jenkins_state_t *)malloc(sizeof(jenkins_state_t));
|
|
state->seed = *(cmph_uint32 *)buf;
|
|
state->hashfunc = CMPH_HASH_JENKINS;
|
|
DEBUGP("Loaded jenkins state with seed %u\n", state->seed);
|
|
return state;
|
|
}
|
|
|
|
|
|
/** \fn void jenkins_state_pack(jenkins_state_t *state, void *jenkins_packed);
|
|
* \brief Support the ability to pack a jenkins function into a preallocated contiguous memory space pointed by jenkins_packed.
|
|
* \param state points to the jenkins function
|
|
* \param jenkins_packed pointer to the contiguous memory area used to store the jenkins function. The size of jenkins_packed must be at least jenkins_state_packed_size()
|
|
*/
|
|
void jenkins_state_pack(jenkins_state_t *state, void *jenkins_packed)
|
|
{
|
|
if (state && jenkins_packed)
|
|
{
|
|
memcpy(jenkins_packed, &(state->seed), sizeof(cmph_uint32));
|
|
}
|
|
}
|
|
|
|
/** \fn cmph_uint32 jenkins_state_packed_size(jenkins_state_t *state);
|
|
* \brief Return the amount of space needed to pack a jenkins function.
|
|
* \return the size of the packed function or zero for failures
|
|
*/
|
|
cmph_uint32 jenkins_state_packed_size(void)
|
|
{
|
|
return sizeof(cmph_uint32);
|
|
}
|
|
|
|
|
|
/** \fn cmph_uint32 jenkins_hash_packed(void *jenkins_packed, const char *k, cmph_uint32 keylen);
|
|
* \param jenkins_packed is a pointer to a contiguous memory area
|
|
* \param key is a pointer to a key
|
|
* \param keylen is the key length
|
|
* \return an integer that represents a hash value of 32 bits.
|
|
*/
|
|
cmph_uint32 jenkins_hash_packed(void *jenkins_packed, const char *k, cmph_uint32 keylen)
|
|
{
|
|
cmph_uint32 hashes[3];
|
|
__jenkins_hash_vector(*((cmph_uint32 *)jenkins_packed), k, keylen, hashes);
|
|
return hashes[2];
|
|
}
|
|
|
|
/** \fn jenkins_hash_vector_packed(void *jenkins_packed, const char *k, cmph_uint32 keylen, cmph_uint32 * hashes);
|
|
* \param jenkins_packed is a pointer to a contiguous memory area
|
|
* \param key is a pointer to a key
|
|
* \param keylen is the key length
|
|
* \param hashes is a pointer to a memory large enough to fit three 32-bit integers.
|
|
*/
|
|
void jenkins_hash_vector_packed(void *jenkins_packed, const char *k, cmph_uint32 keylen, cmph_uint32 * hashes)
|
|
{
|
|
__jenkins_hash_vector(*((cmph_uint32 *)jenkins_packed), k, keylen, hashes);
|
|
}
|