mirror of
https://gitlab.gnome.org/GNOME/glib.git
synced 2024-12-29 08:56:15 +01:00
86b7d7cc99
...So that it will compile on non-C99 compilers. The changes are mainly moving the variable declarations to the start of the resecptive blocks. Also, replace the use of buflen in chd.c as it might not be defined for all platforms, instead using packed_cr_size as it seems to represent the value that is to be printed/displayed by the debugging output. https://bugzilla.gnome.org/show_bug.cgi?id=681820
1000 lines
27 KiB
C
1000 lines
27 KiB
C
#include<stdio.h>
|
|
#include<stdlib.h>
|
|
#include<string.h>
|
|
#include<math.h>
|
|
#include<time.h>
|
|
#include<assert.h>
|
|
#include<limits.h>
|
|
#include<errno.h>
|
|
|
|
#include "cmph_structs.h"
|
|
#include "chd_structs_ph.h"
|
|
#include "chd_ph.h"
|
|
#include"miller_rabin.h"
|
|
#include"bitbool.h"
|
|
|
|
|
|
//#define DEBUG
|
|
#include "debug.h"
|
|
|
|
// NO_ELEMENT is equivalent to null pointer
|
|
#ifndef NO_ELEMENT
|
|
#define NO_ELEMENT UINT_MAX
|
|
#endif
|
|
|
|
// struct used to represent items at mapping, ordering and searching phases
|
|
struct _chd_ph_item_t
|
|
{
|
|
cmph_uint32 f;
|
|
cmph_uint32 h;
|
|
};
|
|
typedef struct _chd_ph_item_t chd_ph_item_t;
|
|
|
|
// struct to represent the items at mapping phase only.
|
|
struct _chd_ph_map_item_t
|
|
{
|
|
cmph_uint32 f;
|
|
cmph_uint32 h;
|
|
cmph_uint32 bucket_num;
|
|
};
|
|
typedef struct _chd_ph_map_item_t chd_ph_map_item_t;
|
|
|
|
// struct to represent a bucket
|
|
struct _chd_ph_bucket_t
|
|
{
|
|
cmph_uint32 items_list; // offset
|
|
union
|
|
{
|
|
cmph_uint32 size;
|
|
cmph_uint32 bucket_id;
|
|
};
|
|
};
|
|
|
|
typedef struct _chd_ph_bucket_t chd_ph_bucket_t;
|
|
|
|
struct _chd_ph_sorted_list_t
|
|
{
|
|
cmph_uint32 buckets_list;
|
|
cmph_uint32 size;
|
|
};
|
|
|
|
typedef struct _chd_ph_sorted_list_t chd_ph_sorted_list_t;
|
|
|
|
|
|
static inline chd_ph_bucket_t * chd_ph_bucket_new(cmph_uint32 nbuckets);
|
|
static inline void chd_ph_bucket_clean(chd_ph_bucket_t * buckets, cmph_uint32 nbuckets);
|
|
static inline void chd_ph_bucket_destroy(chd_ph_bucket_t * buckets);
|
|
|
|
chd_ph_bucket_t * chd_ph_bucket_new(cmph_uint32 nbuckets)
|
|
{
|
|
chd_ph_bucket_t * buckets = (chd_ph_bucket_t *) calloc(nbuckets, sizeof(chd_ph_bucket_t));
|
|
return buckets;
|
|
}
|
|
|
|
void chd_ph_bucket_clean(chd_ph_bucket_t * buckets, cmph_uint32 nbuckets)
|
|
{
|
|
register cmph_uint32 i = 0;
|
|
assert(buckets);
|
|
for(i = 0; i < nbuckets; i++)
|
|
buckets[i].size = 0;
|
|
}
|
|
static cmph_uint8 chd_ph_bucket_insert(chd_ph_bucket_t * buckets,chd_ph_map_item_t * map_items, chd_ph_item_t * items,
|
|
cmph_uint32 nbuckets,cmph_uint32 item_idx)
|
|
{
|
|
register cmph_uint32 i = 0;
|
|
register chd_ph_item_t * tmp_item;
|
|
register chd_ph_map_item_t * tmp_map_item = map_items + item_idx;
|
|
register chd_ph_bucket_t * bucket = buckets + tmp_map_item->bucket_num;
|
|
tmp_item = items + bucket->items_list;
|
|
|
|
for(i = 0; i < bucket->size; i++)
|
|
{
|
|
if(tmp_item->f == tmp_map_item->f && tmp_item->h == tmp_map_item->h)
|
|
{
|
|
DEBUGP("Item not added\n");
|
|
return 0;
|
|
};
|
|
tmp_item++;
|
|
};
|
|
tmp_item->f = tmp_map_item->f;
|
|
tmp_item->h = tmp_map_item->h;
|
|
bucket->size++;
|
|
return 1;
|
|
};
|
|
void chd_ph_bucket_destroy(chd_ph_bucket_t * buckets)
|
|
{
|
|
free(buckets);
|
|
}
|
|
|
|
static inline cmph_uint8 chd_ph_mapping(cmph_config_t *mph, chd_ph_bucket_t * buckets, chd_ph_item_t * items,
|
|
cmph_uint32 *max_bucket_size);
|
|
|
|
static chd_ph_sorted_list_t * chd_ph_ordering(chd_ph_bucket_t ** _buckets,chd_ph_item_t ** items,
|
|
cmph_uint32 nbuckets,cmph_uint32 nitems, cmph_uint32 max_bucket_size);
|
|
|
|
static cmph_uint8 chd_ph_searching(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t *buckets, chd_ph_item_t *items ,
|
|
cmph_uint32 max_bucket_size, chd_ph_sorted_list_t *sorted_lists, cmph_uint32 max_probes, cmph_uint32 * disp_table);
|
|
|
|
static inline double chd_ph_space_lower_bound(cmph_uint32 _n, cmph_uint32 _r)
|
|
{
|
|
double r = _r, n = _n;
|
|
return (1 + (r/n - 1.0 + 1.0/(2.0*n))*log(1 - n/r))/log(2);
|
|
};
|
|
|
|
/* computes the entropy of non empty buckets.*/
|
|
static inline double chd_ph_get_entropy(cmph_uint32 * disp_table, cmph_uint32 n, cmph_uint32 max_probes)
|
|
{
|
|
register cmph_uint32 * probe_counts = (cmph_uint32 *) calloc(max_probes, sizeof(cmph_uint32));
|
|
register cmph_uint32 i;
|
|
register double entropy = 0;
|
|
|
|
for(i = 0; i < n; i++)
|
|
{
|
|
probe_counts[disp_table[i]]++;
|
|
};
|
|
|
|
for(i = 0; i < max_probes; i++)
|
|
{
|
|
if(probe_counts[i] > 0)
|
|
entropy -= probe_counts[i]*log((double)probe_counts[i]/(double)n)/log(2);
|
|
};
|
|
free(probe_counts);
|
|
return entropy;
|
|
};
|
|
|
|
chd_ph_config_data_t *chd_ph_config_new(void)
|
|
{
|
|
chd_ph_config_data_t *chd_ph;
|
|
chd_ph = (chd_ph_config_data_t *)malloc(sizeof(chd_ph_config_data_t));
|
|
assert(chd_ph);
|
|
memset(chd_ph, 0, sizeof(chd_ph_config_data_t));
|
|
|
|
chd_ph->hashfunc = CMPH_HASH_JENKINS;
|
|
chd_ph->cs = NULL;
|
|
chd_ph->nbuckets = 0;
|
|
chd_ph->n = 0;
|
|
chd_ph->hl = NULL;
|
|
|
|
chd_ph->m = 0;
|
|
chd_ph->use_h = 1;
|
|
chd_ph->keys_per_bin = 1;
|
|
chd_ph->keys_per_bucket = 4;
|
|
chd_ph->occup_table = 0;
|
|
|
|
return chd_ph;
|
|
}
|
|
|
|
void chd_ph_config_destroy(cmph_config_t *mph)
|
|
{
|
|
chd_ph_config_data_t *data = (chd_ph_config_data_t *) mph->data;
|
|
DEBUGP("Destroying algorithm dependent data\n");
|
|
if(data->occup_table)
|
|
{
|
|
free(data->occup_table);
|
|
data->occup_table = NULL;
|
|
}
|
|
free(data);
|
|
}
|
|
|
|
|
|
void chd_ph_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs)
|
|
{
|
|
chd_ph_config_data_t *chd_ph = (chd_ph_config_data_t *)mph->data;
|
|
CMPH_HASH *hashptr = hashfuncs;
|
|
cmph_uint32 i = 0;
|
|
while(*hashptr != CMPH_HASH_COUNT)
|
|
{
|
|
if (i >= 1) break; //chd_ph only uses one linear hash function
|
|
chd_ph->hashfunc = *hashptr;
|
|
++i, ++hashptr;
|
|
}
|
|
}
|
|
|
|
|
|
void chd_ph_config_set_b(cmph_config_t *mph, cmph_uint32 keys_per_bucket)
|
|
{
|
|
chd_ph_config_data_t *chd_ph;
|
|
assert(mph);
|
|
chd_ph = (chd_ph_config_data_t *)mph->data;
|
|
if(keys_per_bucket < 1 || keys_per_bucket >= 15)
|
|
{
|
|
keys_per_bucket = 4;
|
|
}
|
|
chd_ph->keys_per_bucket = keys_per_bucket;
|
|
}
|
|
|
|
|
|
void chd_ph_config_set_keys_per_bin(cmph_config_t *mph, cmph_uint32 keys_per_bin)
|
|
{
|
|
chd_ph_config_data_t *chd_ph;
|
|
assert(mph);
|
|
chd_ph = (chd_ph_config_data_t *)mph->data;
|
|
if(keys_per_bin <= 1 || keys_per_bin >= 128)
|
|
{
|
|
keys_per_bin = 1;
|
|
}
|
|
chd_ph->keys_per_bin = keys_per_bin;
|
|
}
|
|
|
|
cmph_uint8 chd_ph_mapping(cmph_config_t *mph, chd_ph_bucket_t * buckets, chd_ph_item_t * items, cmph_uint32 *max_bucket_size)
|
|
{
|
|
register cmph_uint32 i = 0, g = 0;
|
|
cmph_uint32 hl[3];
|
|
chd_ph_config_data_t *chd_ph = (chd_ph_config_data_t *)mph->data;
|
|
char * key = NULL;
|
|
cmph_uint32 keylen = 0;
|
|
chd_ph_map_item_t * map_item;
|
|
chd_ph_map_item_t * map_items = malloc(chd_ph->m*sizeof(chd_ph_map_item_t));
|
|
register cmph_uint32 mapping_iterations = 1000;
|
|
*max_bucket_size = 0;
|
|
while(1)
|
|
{
|
|
mapping_iterations--;
|
|
if (chd_ph->hl) hash_state_destroy(chd_ph->hl);
|
|
chd_ph->hl = hash_state_new(chd_ph->hashfunc, chd_ph->m);
|
|
|
|
chd_ph_bucket_clean(buckets, chd_ph->nbuckets);
|
|
|
|
mph->key_source->rewind(mph->key_source->data);
|
|
|
|
for(i = 0; i < chd_ph->m; i++)
|
|
{
|
|
mph->key_source->read(mph->key_source->data, &key, &keylen);
|
|
hash_vector(chd_ph->hl, key, keylen, hl);
|
|
|
|
map_item = (map_items + i);
|
|
|
|
g = hl[0] % chd_ph->nbuckets;
|
|
map_item->f = hl[1] % chd_ph->n;
|
|
map_item->h = hl[2] % (chd_ph->n - 1) + 1;
|
|
map_item->bucket_num=g;
|
|
mph->key_source->dispose(mph->key_source->data, key, keylen);
|
|
// if(buckets[g].size == (chd_ph->keys_per_bucket << 2))
|
|
// {
|
|
// DEBUGP("BUCKET = %u -- SIZE = %u -- MAXIMUM SIZE = %u\n", g, buckets[g].size, (chd_ph->keys_per_bucket << 2));
|
|
// goto error;
|
|
// }
|
|
buckets[g].size++;
|
|
if(buckets[g].size > *max_bucket_size)
|
|
{
|
|
*max_bucket_size = buckets[g].size;
|
|
}
|
|
}
|
|
buckets[0].items_list = 0;
|
|
for(i = 1; i < chd_ph->nbuckets; i++)
|
|
{
|
|
buckets[i].items_list = buckets[i-1].items_list + buckets[i - 1].size;
|
|
buckets[i - 1].size = 0;
|
|
};
|
|
buckets[i - 1].size = 0;
|
|
for(i = 0; i < chd_ph->m; i++)
|
|
{
|
|
map_item = (map_items + i);
|
|
if(!chd_ph_bucket_insert(buckets, map_items, items, chd_ph->nbuckets, i))
|
|
break;
|
|
}
|
|
if(i == chd_ph->m)
|
|
{
|
|
free(map_items);
|
|
return 1; // SUCCESS
|
|
}
|
|
|
|
if(mapping_iterations == 0)
|
|
{
|
|
goto error;
|
|
}
|
|
}
|
|
error:
|
|
free(map_items);
|
|
hash_state_destroy(chd_ph->hl);
|
|
chd_ph->hl = NULL;
|
|
return 0; // FAILURE
|
|
}
|
|
|
|
chd_ph_sorted_list_t * chd_ph_ordering(chd_ph_bucket_t ** _buckets, chd_ph_item_t ** _items,
|
|
cmph_uint32 nbuckets, cmph_uint32 nitems, cmph_uint32 max_bucket_size)
|
|
{
|
|
chd_ph_sorted_list_t * sorted_lists = (chd_ph_sorted_list_t *) calloc(max_bucket_size + 1, sizeof(chd_ph_sorted_list_t));
|
|
|
|
chd_ph_bucket_t * input_buckets = (*_buckets);
|
|
chd_ph_bucket_t * output_buckets;
|
|
chd_ph_item_t * input_items = (*_items);
|
|
chd_ph_item_t * output_items;
|
|
register cmph_uint32 i, j, bucket_size, position, position2;
|
|
// cmph_uint32 non_empty_buckets;
|
|
DEBUGP("MAX BUCKET SIZE = %u\n", max_bucket_size);
|
|
// Determine size of each list of buckets
|
|
for(i = 0; i < nbuckets; i++)
|
|
{
|
|
bucket_size = input_buckets[i].size;
|
|
if(bucket_size == 0)
|
|
continue;
|
|
sorted_lists[bucket_size].size++;
|
|
};
|
|
sorted_lists[1].buckets_list = 0;
|
|
// Determine final position of list of buckets into the contiguous array that will store all the buckets
|
|
for(i = 2; i <= max_bucket_size; i++)
|
|
{
|
|
sorted_lists[i].buckets_list = sorted_lists[i-1].buckets_list + sorted_lists[i-1].size;
|
|
sorted_lists[i-1].size = 0;
|
|
};
|
|
sorted_lists[i-1].size = 0;
|
|
// Store the buckets in a new array which is sorted by bucket sizes
|
|
output_buckets = calloc(nbuckets, sizeof(chd_ph_bucket_t)); // everything is initialized with zero
|
|
// non_empty_buckets = nbuckets;
|
|
|
|
for(i = 0; i < nbuckets; i++)
|
|
{
|
|
bucket_size = input_buckets[i].size;
|
|
if(bucket_size == 0)
|
|
{
|
|
// non_empty_buckets--;
|
|
continue;
|
|
};
|
|
position = sorted_lists[bucket_size].buckets_list + sorted_lists[bucket_size].size;
|
|
output_buckets[position].bucket_id = i;
|
|
output_buckets[position].items_list = input_buckets[i].items_list;
|
|
sorted_lists[bucket_size].size++;
|
|
};
|
|
/* for(i = non_empty_buckets; i < nbuckets; i++)
|
|
output_buckets[i].size=0;*/
|
|
// Return the buckets sorted in new order and free the old buckets sorted in old order
|
|
free(input_buckets);
|
|
(*_buckets) = output_buckets;
|
|
|
|
|
|
// Store the items according to the new order of buckets.
|
|
output_items = (chd_ph_item_t*)calloc(nitems, sizeof(chd_ph_item_t));
|
|
position = 0;
|
|
i = 0;
|
|
for(bucket_size = 1; bucket_size <= max_bucket_size; bucket_size++)
|
|
{
|
|
for(i = sorted_lists[bucket_size].buckets_list; i < sorted_lists[bucket_size].size + sorted_lists[bucket_size].buckets_list; i++)
|
|
{
|
|
position2 = output_buckets[i].items_list;
|
|
output_buckets[i].items_list = position;
|
|
for(j = 0; j < bucket_size; j++)
|
|
{
|
|
output_items[position].f = input_items[position2].f;
|
|
output_items[position].h = input_items[position2].h;
|
|
position++;
|
|
position2++;
|
|
};
|
|
};
|
|
};
|
|
//Return the items sorted in new order and free the old items sorted in old order
|
|
free(input_items);
|
|
(*_items) = output_items;
|
|
return sorted_lists;
|
|
};
|
|
|
|
static inline cmph_uint8 place_bucket_probe(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t *buckets,
|
|
chd_ph_item_t *items, cmph_uint32 probe0_num, cmph_uint32 probe1_num,
|
|
cmph_uint32 bucket_num, cmph_uint32 size)
|
|
{
|
|
register cmph_uint32 i;
|
|
register chd_ph_item_t * item;
|
|
register cmph_uint32 position;
|
|
|
|
item = items + buckets[bucket_num].items_list;
|
|
// try place bucket with probe_num
|
|
if(chd_ph->keys_per_bin > 1)
|
|
{
|
|
for(i = 0; i < size; i++) // placement
|
|
{
|
|
position = (cmph_uint32)((item->f + ((cmph_uint64)item->h)*probe0_num + probe1_num) % chd_ph->n);
|
|
if(chd_ph->occup_table[position] >= chd_ph->keys_per_bin)
|
|
{
|
|
break;
|
|
}
|
|
(chd_ph->occup_table[position])++;
|
|
item++;
|
|
};
|
|
} else
|
|
{
|
|
for(i = 0; i < size; i++) // placement
|
|
{
|
|
position = (cmph_uint32)((item->f + ((cmph_uint64)item->h)*probe0_num + probe1_num) % chd_ph->n);
|
|
if(GETBIT32(((cmph_uint32 *)chd_ph->occup_table), position))
|
|
{
|
|
break;
|
|
}
|
|
SETBIT32(((cmph_uint32*)chd_ph->occup_table), position);
|
|
item++;
|
|
};
|
|
};
|
|
if(i != size) // Undo the placement
|
|
{
|
|
item = items + buckets[bucket_num].items_list;
|
|
if(chd_ph->keys_per_bin > 1)
|
|
{
|
|
while(1)
|
|
{
|
|
if(i == 0)
|
|
{
|
|
break;
|
|
}
|
|
position = (cmph_uint32)((item->f + ((cmph_uint64 )item->h) * probe0_num + probe1_num) % chd_ph->n);
|
|
(chd_ph->occup_table[position])--;
|
|
item++;
|
|
i--;
|
|
};
|
|
} else
|
|
{
|
|
while(1)
|
|
{
|
|
if(i == 0)
|
|
{
|
|
break;
|
|
}
|
|
position = (cmph_uint32)((item->f + ((cmph_uint64 )item->h) * probe0_num + probe1_num) % chd_ph->n);
|
|
UNSETBIT32(((cmph_uint32*)chd_ph->occup_table), position);
|
|
|
|
// ([position/32]^=(1<<(position%32));
|
|
item++;
|
|
i--;
|
|
};
|
|
};
|
|
return 0;
|
|
}
|
|
return 1;
|
|
};
|
|
|
|
static inline cmph_uint8 place_bucket(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t *buckets, chd_ph_item_t * items, cmph_uint32 max_probes,
|
|
cmph_uint32 * disp_table, cmph_uint32 bucket_num, cmph_uint32 size)
|
|
|
|
{
|
|
register cmph_uint32 probe0_num, probe1_num, probe_num;
|
|
probe0_num = 0;
|
|
probe1_num = 0;
|
|
probe_num = 0;
|
|
|
|
while(1)
|
|
{
|
|
if(place_bucket_probe(chd_ph, buckets, items, probe0_num, probe1_num, bucket_num,size))
|
|
{
|
|
disp_table[buckets[bucket_num].bucket_id] = probe0_num + probe1_num * chd_ph->n;
|
|
return 1;
|
|
}
|
|
probe0_num++;
|
|
if(probe0_num >= chd_ph->n)
|
|
{
|
|
probe0_num -= chd_ph->n;
|
|
probe1_num++;
|
|
};
|
|
probe_num++;
|
|
if(probe_num >= max_probes || probe1_num >= chd_ph->n)
|
|
{
|
|
return 0;
|
|
};
|
|
};
|
|
return 0;
|
|
};
|
|
|
|
static inline cmph_uint8 place_buckets1(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t * buckets, chd_ph_item_t *items,
|
|
cmph_uint32 max_bucket_size, chd_ph_sorted_list_t *sorted_lists, cmph_uint32 max_probes,
|
|
cmph_uint32 * disp_table)
|
|
{
|
|
register cmph_uint32 i = 0;
|
|
register cmph_uint32 curr_bucket = 0;
|
|
|
|
for(i = max_bucket_size; i > 0; i--)
|
|
{
|
|
curr_bucket = sorted_lists[i].buckets_list;
|
|
while(curr_bucket < sorted_lists[i].size + sorted_lists[i].buckets_list)
|
|
{
|
|
if(!place_bucket(chd_ph, buckets, items, max_probes, disp_table, curr_bucket, i))
|
|
{
|
|
return 0;
|
|
}
|
|
curr_bucket++;
|
|
};
|
|
};
|
|
return 1;
|
|
};
|
|
|
|
static inline cmph_uint8 place_buckets2(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t *buckets, chd_ph_item_t * items,
|
|
cmph_uint32 max_bucket_size, chd_ph_sorted_list_t *sorted_lists, cmph_uint32 max_probes,
|
|
cmph_uint32 * disp_table)
|
|
{
|
|
register cmph_uint32 i,j, non_placed_bucket;
|
|
register cmph_uint32 curr_bucket;
|
|
register cmph_uint32 probe_num, probe0_num, probe1_num;
|
|
cmph_uint32 sorted_list_size;
|
|
#ifdef DEBUG
|
|
cmph_uint32 items_list;
|
|
cmph_uint32 bucket_id;
|
|
#endif
|
|
DEBUGP("USING HEURISTIC TO PLACE BUCKETS\n");
|
|
for(i = max_bucket_size; i > 0; i--)
|
|
{
|
|
probe_num = 0;
|
|
probe0_num = 0;
|
|
probe1_num = 0;
|
|
sorted_list_size = sorted_lists[i].size;
|
|
while(sorted_lists[i].size != 0)
|
|
{
|
|
curr_bucket = sorted_lists[i].buckets_list;
|
|
for(j = 0, non_placed_bucket = 0; j < sorted_lists[i].size; j++)
|
|
{
|
|
// if bucket is successfully placed remove it from list
|
|
if(place_bucket_probe(chd_ph, buckets, items, probe0_num, probe1_num, curr_bucket, i))
|
|
{
|
|
disp_table[buckets[curr_bucket].bucket_id] = probe0_num + probe1_num * chd_ph->n;
|
|
// DEBUGP("BUCKET %u PLACED --- DISPLACEMENT = %u\n", curr_bucket, disp_table[curr_bucket]);
|
|
}
|
|
else
|
|
{
|
|
// DEBUGP("BUCKET %u NOT PLACED\n", curr_bucket);
|
|
#ifdef DEBUG
|
|
items_list = buckets[non_placed_bucket + sorted_lists[i].buckets_list].items_list;
|
|
bucket_id = buckets[non_placed_bucket + sorted_lists[i].buckets_list].bucket_id;
|
|
#endif
|
|
buckets[non_placed_bucket + sorted_lists[i].buckets_list].items_list = buckets[curr_bucket].items_list;
|
|
buckets[non_placed_bucket + sorted_lists[i].buckets_list].bucket_id = buckets[curr_bucket].bucket_id;
|
|
#ifdef DEBUG
|
|
buckets[curr_bucket].items_list=items_list;
|
|
buckets[curr_bucket].bucket_id=bucket_id;
|
|
#endif
|
|
non_placed_bucket++;
|
|
}
|
|
curr_bucket++;
|
|
};
|
|
sorted_lists[i].size = non_placed_bucket;
|
|
probe0_num++;
|
|
if(probe0_num >= chd_ph->n)
|
|
{
|
|
probe0_num -= chd_ph->n;
|
|
probe1_num++;
|
|
};
|
|
probe_num++;
|
|
if(probe_num >= max_probes || probe1_num >= chd_ph->n)
|
|
{
|
|
sorted_lists[i].size = sorted_list_size;
|
|
return 0;
|
|
};
|
|
};
|
|
sorted_lists[i].size = sorted_list_size;
|
|
};
|
|
return 1;
|
|
};
|
|
|
|
cmph_uint8 chd_ph_searching(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t *buckets, chd_ph_item_t *items ,
|
|
cmph_uint32 max_bucket_size, chd_ph_sorted_list_t *sorted_lists, cmph_uint32 max_probes,
|
|
cmph_uint32 * disp_table)
|
|
{
|
|
if(chd_ph->use_h)
|
|
{
|
|
return place_buckets2(chd_ph, buckets, items, max_bucket_size, sorted_lists, max_probes, disp_table);
|
|
}
|
|
else
|
|
{
|
|
return place_buckets1(chd_ph, buckets, items, max_bucket_size, sorted_lists, max_probes, disp_table);
|
|
}
|
|
|
|
}
|
|
|
|
static inline cmph_uint8 chd_ph_check_bin_hashing(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t *buckets, chd_ph_item_t *items,
|
|
cmph_uint32 * disp_table, chd_ph_sorted_list_t * sorted_lists,cmph_uint32 max_bucket_size)
|
|
{
|
|
register cmph_uint32 bucket_size, i, j;
|
|
register cmph_uint32 position, probe0_num, probe1_num;
|
|
register cmph_uint32 m = 0;
|
|
register chd_ph_item_t * item;
|
|
if(chd_ph->keys_per_bin > 1)
|
|
memset(chd_ph->occup_table, 0, chd_ph->n);
|
|
else
|
|
memset(chd_ph->occup_table, 0, ((chd_ph->n + 31)/32) * sizeof(cmph_uint32));
|
|
|
|
for(bucket_size = 1; bucket_size <= max_bucket_size; bucket_size++)
|
|
for(i = sorted_lists[bucket_size].buckets_list; i < sorted_lists[bucket_size].size +
|
|
sorted_lists[bucket_size].buckets_list; i++)
|
|
{
|
|
j = bucket_size;
|
|
item = items + buckets[i].items_list;
|
|
probe0_num = disp_table[buckets[i].bucket_id] % chd_ph->n;
|
|
probe1_num = disp_table[buckets[i].bucket_id] / chd_ph->n;
|
|
for(; j > 0; j--)
|
|
{
|
|
m++;
|
|
position = (cmph_uint32)((item->f + ((cmph_uint64 )item->h) * probe0_num + probe1_num) % chd_ph->n);
|
|
if(chd_ph->keys_per_bin > 1)
|
|
{
|
|
if(chd_ph->occup_table[position] >= chd_ph->keys_per_bin)
|
|
{
|
|
return 0;
|
|
}
|
|
(chd_ph->occup_table[position])++;
|
|
}
|
|
else
|
|
{
|
|
if(GETBIT32(((cmph_uint32*)chd_ph->occup_table), position))
|
|
{
|
|
return 0;
|
|
}
|
|
SETBIT32(((cmph_uint32*)chd_ph->occup_table), position);
|
|
};
|
|
item++;
|
|
};
|
|
};
|
|
DEBUGP("We were able to place m = %u keys\n", m);
|
|
return 1;
|
|
};
|
|
|
|
|
|
cmph_t *chd_ph_new(cmph_config_t *mph, double c)
|
|
{
|
|
cmph_t *mphf = NULL;
|
|
chd_ph_data_t *chd_phf = NULL;
|
|
chd_ph_config_data_t *chd_ph = (chd_ph_config_data_t *)mph->data;
|
|
|
|
register double load_factor = c;
|
|
register cmph_uint8 searching_success = 0;
|
|
register cmph_uint32 max_probes = 1 << 20; // default value for max_probes
|
|
register cmph_uint32 iterations = 100;
|
|
chd_ph_bucket_t * buckets = NULL;
|
|
chd_ph_item_t * items = NULL;
|
|
register cmph_uint8 failure = 0;
|
|
cmph_uint32 max_bucket_size = 0;
|
|
chd_ph_sorted_list_t * sorted_lists = NULL;
|
|
cmph_uint32 * disp_table = NULL;
|
|
register double space_lower_bound = 0;
|
|
#ifdef CMPH_TIMING
|
|
double construction_time_begin = 0.0;
|
|
double construction_time = 0.0;
|
|
ELAPSED_TIME_IN_SECONDS(&construction_time_begin);
|
|
#endif
|
|
|
|
|
|
chd_ph->m = mph->key_source->nkeys;
|
|
DEBUGP("m = %u\n", chd_ph->m);
|
|
|
|
chd_ph->nbuckets = (cmph_uint32)(chd_ph->m/chd_ph->keys_per_bucket) + 1;
|
|
DEBUGP("nbuckets = %u\n", chd_ph->nbuckets);
|
|
|
|
if(load_factor < 0.5 )
|
|
{
|
|
load_factor = 0.5;
|
|
}
|
|
|
|
if(load_factor >= 0.99)
|
|
{
|
|
load_factor = 0.99;
|
|
}
|
|
|
|
DEBUGP("load_factor = %.3f\n", load_factor);
|
|
|
|
chd_ph->n = (cmph_uint32)(chd_ph->m/(chd_ph->keys_per_bin * load_factor)) + 1;
|
|
|
|
//Round the number of bins to the prime immediately above
|
|
if(chd_ph->n % 2 == 0) chd_ph->n++;
|
|
for(;;)
|
|
{
|
|
if(check_primality(chd_ph->n) == 1)
|
|
break;
|
|
chd_ph->n += 2; // just odd numbers can be primes for n > 2
|
|
|
|
};
|
|
|
|
DEBUGP("n = %u \n", chd_ph->n);
|
|
if(chd_ph->keys_per_bin == 1)
|
|
{
|
|
space_lower_bound = chd_ph_space_lower_bound(chd_ph->m, chd_ph->n);
|
|
}
|
|
|
|
if(mph->verbosity)
|
|
{
|
|
fprintf(stderr, "space lower bound is %.3f bits per key\n", space_lower_bound);
|
|
}
|
|
|
|
// We allocate the working tables
|
|
buckets = chd_ph_bucket_new(chd_ph->nbuckets);
|
|
items = (chd_ph_item_t *) calloc(chd_ph->m, sizeof(chd_ph_item_t));
|
|
|
|
max_probes = (cmph_uint32)(((log(chd_ph->m)/log(2))/20) * max_probes);
|
|
|
|
if(chd_ph->keys_per_bin == 1)
|
|
chd_ph->occup_table = (cmph_uint8 *) calloc(((chd_ph->n + 31)/32), sizeof(cmph_uint32));
|
|
else
|
|
chd_ph->occup_table = (cmph_uint8 *) calloc(chd_ph->n, sizeof(cmph_uint8));
|
|
|
|
disp_table = (cmph_uint32 *) calloc(chd_ph->nbuckets, sizeof(cmph_uint32));
|
|
//
|
|
// init_genrand(time(0));
|
|
|
|
while(1)
|
|
{
|
|
iterations --;
|
|
if (mph->verbosity)
|
|
{
|
|
fprintf(stderr, "Starting mapping step for mph creation of %u keys with %u bins\n", chd_ph->m, chd_ph->n);
|
|
}
|
|
|
|
if(!chd_ph_mapping(mph, buckets, items, &max_bucket_size))
|
|
{
|
|
if (mph->verbosity)
|
|
{
|
|
fprintf(stderr, "Failure in mapping step\n");
|
|
}
|
|
failure = 1;
|
|
goto cleanup;
|
|
}
|
|
|
|
if (mph->verbosity)
|
|
{
|
|
fprintf(stderr, "Starting ordering step\n");
|
|
}
|
|
if(sorted_lists)
|
|
{
|
|
free(sorted_lists);
|
|
}
|
|
|
|
sorted_lists = chd_ph_ordering(&buckets, &items, chd_ph->nbuckets, chd_ph->m, max_bucket_size);
|
|
|
|
if (mph->verbosity)
|
|
{
|
|
fprintf(stderr, "Starting searching step\n");
|
|
}
|
|
|
|
searching_success = chd_ph_searching(chd_ph, buckets, items, max_bucket_size, sorted_lists, max_probes, disp_table);
|
|
if(searching_success) break;
|
|
|
|
// reset occup_table
|
|
if(chd_ph->keys_per_bin > 1)
|
|
memset(chd_ph->occup_table, 0, chd_ph->n);
|
|
else
|
|
memset(chd_ph->occup_table, 0, ((chd_ph->n + 31)/32) * sizeof(cmph_uint32));
|
|
if(iterations == 0)
|
|
{
|
|
// Cleanup memory
|
|
if (mph->verbosity)
|
|
{
|
|
fprintf(stderr, "Failure because the max trials was exceeded\n");
|
|
}
|
|
failure = 1;
|
|
goto cleanup;
|
|
};
|
|
}
|
|
|
|
#ifdef DEBUG
|
|
{
|
|
if(!chd_ph_check_bin_hashing(chd_ph, buckets, items, disp_table,sorted_lists,max_bucket_size))
|
|
{
|
|
|
|
DEBUGP("Error for bin packing generation");
|
|
failure = 1;
|
|
goto cleanup;
|
|
}
|
|
}
|
|
#endif
|
|
|
|
if (mph->verbosity)
|
|
{
|
|
fprintf(stderr, "Starting compressing step\n");
|
|
}
|
|
|
|
if(chd_ph->cs)
|
|
{
|
|
free(chd_ph->cs);
|
|
}
|
|
chd_ph->cs = (compressed_seq_t *) calloc(1, sizeof(compressed_seq_t));
|
|
compressed_seq_init(chd_ph->cs);
|
|
compressed_seq_generate(chd_ph->cs, disp_table, chd_ph->nbuckets);
|
|
|
|
#ifdef CMPH_TIMING
|
|
ELAPSED_TIME_IN_SECONDS(&construction_time);
|
|
register double entropy = chd_ph_get_entropy(disp_table, chd_ph->nbuckets, max_probes);
|
|
DEBUGP("Entropy = %.4f\n", entropy/chd_ph->m);
|
|
#endif
|
|
|
|
cleanup:
|
|
chd_ph_bucket_destroy(buckets);
|
|
free(items);
|
|
free(sorted_lists);
|
|
free(disp_table);
|
|
if(failure)
|
|
{
|
|
if(chd_ph->hl)
|
|
{
|
|
hash_state_destroy(chd_ph->hl);
|
|
}
|
|
chd_ph->hl = NULL;
|
|
return NULL;
|
|
}
|
|
|
|
mphf = (cmph_t *)malloc(sizeof(cmph_t));
|
|
mphf->algo = mph->algo;
|
|
chd_phf = (chd_ph_data_t *)malloc(sizeof(chd_ph_data_t));
|
|
|
|
chd_phf->cs = chd_ph->cs;
|
|
chd_ph->cs = NULL; //transfer memory ownership
|
|
chd_phf->hl = chd_ph->hl;
|
|
chd_ph->hl = NULL; //transfer memory ownership
|
|
chd_phf->n = chd_ph->n;
|
|
chd_phf->nbuckets = chd_ph->nbuckets;
|
|
|
|
mphf->data = chd_phf;
|
|
mphf->size = chd_ph->n;
|
|
|
|
DEBUGP("Successfully generated minimal perfect hash\n");
|
|
if (mph->verbosity)
|
|
{
|
|
fprintf(stderr, "Successfully generated minimal perfect hash function\n");
|
|
}
|
|
|
|
#ifdef CMPH_TIMING
|
|
register cmph_uint32 space_usage = chd_ph_packed_size(mphf)*8;
|
|
construction_time = construction_time - construction_time_begin;
|
|
fprintf(stdout, "%u\t%.2f\t%u\t%.4f\t%.4f\t%.4f\t%.4f\n", chd_ph->m, load_factor, chd_ph->keys_per_bucket, construction_time, space_usage/(double)chd_ph->m, space_lower_bound, entropy/chd_ph->m);
|
|
#endif
|
|
|
|
return mphf;
|
|
}
|
|
|
|
|
|
|
|
void chd_ph_load(FILE *fd, cmph_t *mphf)
|
|
{
|
|
char *buf = NULL;
|
|
cmph_uint32 buflen;
|
|
register size_t nbytes;
|
|
chd_ph_data_t *chd_ph = (chd_ph_data_t *)malloc(sizeof(chd_ph_data_t));
|
|
|
|
DEBUGP("Loading chd_ph mphf\n");
|
|
mphf->data = chd_ph;
|
|
|
|
nbytes = fread(&buflen, sizeof(cmph_uint32), (size_t)1, fd);
|
|
DEBUGP("Hash state has %u bytes\n", buflen);
|
|
buf = (char *)malloc((size_t)buflen);
|
|
nbytes = fread(buf, (size_t)buflen, (size_t)1, fd);
|
|
chd_ph->hl = hash_state_load(buf, buflen);
|
|
free(buf);
|
|
|
|
nbytes = fread(&buflen, sizeof(cmph_uint32), (size_t)1, fd);
|
|
DEBUGP("Compressed sequence structure has %u bytes\n", buflen);
|
|
buf = (char *)malloc((size_t)buflen);
|
|
nbytes = fread(buf, (size_t)buflen, (size_t)1, fd);
|
|
chd_ph->cs = (compressed_seq_t *) calloc(1, sizeof(compressed_seq_t));
|
|
compressed_seq_load(chd_ph->cs, buf, buflen);
|
|
free(buf);
|
|
|
|
// loading n and nbuckets
|
|
DEBUGP("Reading n and nbuckets\n");
|
|
nbytes = fread(&(chd_ph->n), sizeof(cmph_uint32), (size_t)1, fd);
|
|
nbytes = fread(&(chd_ph->nbuckets), sizeof(cmph_uint32), (size_t)1, fd);
|
|
if (nbytes == 0 && ferror(fd)) {
|
|
fprintf(stderr, "ERROR: %s\n", strerror(errno));
|
|
}
|
|
|
|
}
|
|
|
|
int chd_ph_dump(cmph_t *mphf, FILE *fd)
|
|
{
|
|
char *buf = NULL;
|
|
cmph_uint32 buflen;
|
|
register size_t nbytes;
|
|
chd_ph_data_t *data = (chd_ph_data_t *)mphf->data;
|
|
|
|
__cmph_dump(mphf, fd);
|
|
|
|
hash_state_dump(data->hl, &buf, &buflen);
|
|
DEBUGP("Dumping hash state with %u bytes to disk\n", buflen);
|
|
nbytes = fwrite(&buflen, sizeof(cmph_uint32), (size_t)1, fd);
|
|
nbytes = fwrite(buf, (size_t)buflen, (size_t)1, fd);
|
|
free(buf);
|
|
|
|
compressed_seq_dump(data->cs, &buf, &buflen);
|
|
DEBUGP("Dumping compressed sequence structure with %u bytes to disk\n", buflen);
|
|
nbytes = fwrite(&buflen, sizeof(cmph_uint32), (size_t)1, fd);
|
|
nbytes = fwrite(buf, (size_t)buflen, (size_t)1, fd);
|
|
free(buf);
|
|
|
|
// dumping n and nbuckets
|
|
nbytes = fwrite(&(data->n), sizeof(cmph_uint32), (size_t)1, fd);
|
|
nbytes = fwrite(&(data->nbuckets), sizeof(cmph_uint32), (size_t)1, fd);
|
|
if (nbytes == 0 && ferror(fd)) {
|
|
fprintf(stderr, "ERROR: %s\n", strerror(errno));
|
|
return 0;
|
|
}
|
|
return 1;
|
|
}
|
|
|
|
void chd_ph_destroy(cmph_t *mphf)
|
|
{
|
|
chd_ph_data_t *data = (chd_ph_data_t *)mphf->data;
|
|
compressed_seq_destroy(data->cs);
|
|
free(data->cs);
|
|
hash_state_destroy(data->hl);
|
|
free(data);
|
|
free(mphf);
|
|
|
|
}
|
|
|
|
cmph_uint32 chd_ph_search(cmph_t *mphf, const char *key, cmph_uint32 keylen)
|
|
{
|
|
register chd_ph_data_t * chd_ph = mphf->data;
|
|
cmph_uint32 hl[3];
|
|
register cmph_uint32 disp,position;
|
|
register cmph_uint32 probe0_num,probe1_num;
|
|
register cmph_uint32 f,g,h;
|
|
hash_vector(chd_ph->hl, key, keylen, hl);
|
|
g = hl[0] % chd_ph->nbuckets;
|
|
f = hl[1] % chd_ph->n;
|
|
h = hl[2] % (chd_ph->n-1) + 1;
|
|
|
|
disp = compressed_seq_query(chd_ph->cs, g);
|
|
probe0_num = disp % chd_ph->n;
|
|
probe1_num = disp/chd_ph->n;
|
|
position = (cmph_uint32)((f + ((cmph_uint64 )h)*probe0_num + probe1_num) % chd_ph->n);
|
|
return position;
|
|
}
|
|
|
|
void chd_ph_pack(cmph_t *mphf, void *packed_mphf)
|
|
{
|
|
chd_ph_data_t *data = (chd_ph_data_t *)mphf->data;
|
|
cmph_uint8 * ptr = packed_mphf;
|
|
|
|
// packing hl type
|
|
CMPH_HASH hl_type = hash_get_type(data->hl);
|
|
*((cmph_uint32 *) ptr) = hl_type;
|
|
ptr += sizeof(cmph_uint32);
|
|
|
|
// packing hl
|
|
hash_state_pack(data->hl, ptr);
|
|
ptr += hash_state_packed_size(hl_type);
|
|
|
|
// packing n
|
|
*((cmph_uint32 *) ptr) = data->n;
|
|
ptr += sizeof(data->n);
|
|
|
|
// packing nbuckets
|
|
*((cmph_uint32 *) ptr) = data->nbuckets;
|
|
ptr += sizeof(data->nbuckets);
|
|
|
|
// packing cs
|
|
compressed_seq_pack(data->cs, ptr);
|
|
//ptr += compressed_seq_packed_size(data->cs);
|
|
|
|
}
|
|
|
|
cmph_uint32 chd_ph_packed_size(cmph_t *mphf)
|
|
{
|
|
register chd_ph_data_t *data = (chd_ph_data_t *)mphf->data;
|
|
register CMPH_HASH hl_type = hash_get_type(data->hl);
|
|
register cmph_uint32 hash_state_pack_size = hash_state_packed_size(hl_type);
|
|
register cmph_uint32 cs_pack_size = compressed_seq_packed_size(data->cs);
|
|
|
|
return (cmph_uint32)(sizeof(CMPH_ALGO) + hash_state_pack_size + cs_pack_size + 3*sizeof(cmph_uint32));
|
|
|
|
}
|
|
|
|
cmph_uint32 chd_ph_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen)
|
|
{
|
|
register CMPH_HASH hl_type = *(cmph_uint32 *)packed_mphf;
|
|
register cmph_uint8 *hl_ptr = (cmph_uint8 *)(packed_mphf) + 4;
|
|
|
|
register cmph_uint32 * ptr = (cmph_uint32 *)(hl_ptr + hash_state_packed_size(hl_type));
|
|
register cmph_uint32 n = *ptr++;
|
|
register cmph_uint32 nbuckets = *ptr++;
|
|
cmph_uint32 hl[3];
|
|
|
|
register cmph_uint32 disp,position;
|
|
register cmph_uint32 probe0_num,probe1_num;
|
|
register cmph_uint32 f,g,h;
|
|
|
|
hash_vector_packed(hl_ptr, hl_type, key, keylen, hl);
|
|
|
|
g = hl[0] % nbuckets;
|
|
f = hl[1] % n;
|
|
h = hl[2] % (n-1) + 1;
|
|
|
|
disp = compressed_seq_query_packed(ptr, g);
|
|
probe0_num = disp % n;
|
|
probe1_num = disp/n;
|
|
position = (cmph_uint32)((f + ((cmph_uint64 )h)*probe0_num + probe1_num) % n);
|
|
return position;
|
|
}
|
|
|
|
|
|
|