From 6178293a83b77635fddfe0ba8da1626877edb0c0 Mon Sep 17 00:00:00 2001 From: Colin Walters Date: Thu, 11 Nov 2010 15:01:07 -0500 Subject: [PATCH] Import CMPH 1.0 This will be used for typelib indexing. See README-CMPH-IMPORT.txt for more information. --- cmph/README-CMPH-IMPORT.txt | 5 + cmph/bdz.c | 703 +++++++++++++++++++++++++ cmph/bdz.h | 43 ++ cmph/bdz_gen_lookup_table.c | 33 ++ cmph/bdz_ph.c | 621 ++++++++++++++++++++++ cmph/bdz_ph.h | 42 ++ cmph/bdz_structs.h | 36 ++ cmph/bdz_structs_ph.h | 26 + cmph/bitbool.h | 179 +++++++ cmph/bmz.c | 620 ++++++++++++++++++++++ cmph/bmz.h | 42 ++ cmph/bmz8.c | 632 +++++++++++++++++++++++ cmph/bmz8.h | 42 ++ cmph/bmz8_structs.h | 25 + cmph/bmz_structs.h | 25 + cmph/brz.c | 985 +++++++++++++++++++++++++++++++++++ cmph/brz.h | 47 ++ cmph/brz_structs.h | 39 ++ cmph/buffer_entry.c | 103 ++++ cmph/buffer_entry.h | 14 + cmph/buffer_manage.c | 66 +++ cmph/buffer_manage.h | 12 + cmph/buffer_manager.c | 64 +++ cmph/buffer_manager.h | 12 + cmph/chd.c | 271 ++++++++++ cmph/chd.h | 59 +++ cmph/chd_ph.c | 988 ++++++++++++++++++++++++++++++++++++ cmph/chd_ph.h | 59 +++ cmph/chd_structs.h | 21 + cmph/chd_structs_ph.h | 29 ++ cmph/chm.c | 381 ++++++++++++++ cmph/chm.h | 42 ++ cmph/chm_structs.h | 24 + cmph/cmph.c | 845 ++++++++++++++++++++++++++++++ cmph/cmph.h | 112 ++++ cmph/cmph_structs.c | 69 +++ cmph/cmph_structs.h | 33 ++ cmph/cmph_time.h | 62 +++ cmph/cmph_types.h | 42 ++ cmph/compressed_rank.c | 321 ++++++++++++ cmph/compressed_rank.h | 55 ++ cmph/compressed_seq.c | 378 ++++++++++++++ cmph/compressed_seq.h | 84 +++ cmph/debug.h | 53 ++ cmph/djb2_hash.c | 49 ++ cmph/djb2_hash.h | 18 + cmph/fch.c | 517 +++++++++++++++++++ cmph/fch.h | 48 ++ cmph/fch_buckets.c | 214 ++++++++ cmph/fch_buckets.h | 30 ++ cmph/fch_structs.h | 30 ++ cmph/fnv_hash.c | 53 ++ cmph/fnv_hash.h | 18 + cmph/graph.c | 338 ++++++++++++ cmph/graph.h | 40 ++ cmph/hash.c | 216 ++++++++ cmph/hash.h | 76 +++ cmph/hash_state.h | 12 + cmph/hashtree.c | 289 +++++++++++ cmph/hashtree.h | 19 + cmph/hashtree_structs.h | 32 ++ cmph/jenkins_hash.c | 297 +++++++++++ cmph/jenkins_hash.h | 65 +++ cmph/main.c | 342 +++++++++++++ cmph/miller_rabin.c | 67 +++ cmph/miller_rabin.h | 5 + cmph/sdbm_hash.c | 49 ++ cmph/sdbm_hash.h | 18 + cmph/select.c | 337 ++++++++++++ cmph/select.h | 61 +++ cmph/select_lookup_tables.h | 170 +++++++ cmph/vqueue.c | 51 ++ cmph/vqueue.h | 18 + cmph/vstack.c | 79 +++ cmph/vstack.h | 18 + cmph/wingetopt.c | 179 +++++++ cmph/wingetopt.h | 25 + 77 files changed, 12124 insertions(+) create mode 100644 cmph/README-CMPH-IMPORT.txt create mode 100755 cmph/bdz.c create mode 100755 cmph/bdz.h create mode 100755 cmph/bdz_gen_lookup_table.c create mode 100755 cmph/bdz_ph.c create mode 100755 cmph/bdz_ph.h create mode 100755 cmph/bdz_structs.h create mode 100755 cmph/bdz_structs_ph.h create mode 100644 cmph/bitbool.h create mode 100644 cmph/bmz.c create mode 100644 cmph/bmz.h create mode 100644 cmph/bmz8.c create mode 100644 cmph/bmz8.h create mode 100644 cmph/bmz8_structs.h create mode 100644 cmph/bmz_structs.h create mode 100755 cmph/brz.c create mode 100644 cmph/brz.h create mode 100755 cmph/brz_structs.h create mode 100644 cmph/buffer_entry.c create mode 100644 cmph/buffer_entry.h create mode 100644 cmph/buffer_manage.c create mode 100644 cmph/buffer_manage.h create mode 100644 cmph/buffer_manager.c create mode 100644 cmph/buffer_manager.h create mode 100644 cmph/chd.c create mode 100644 cmph/chd.h create mode 100644 cmph/chd_ph.c create mode 100644 cmph/chd_ph.h create mode 100644 cmph/chd_structs.h create mode 100644 cmph/chd_structs_ph.h create mode 100644 cmph/chm.c create mode 100644 cmph/chm.h create mode 100644 cmph/chm_structs.h create mode 100644 cmph/cmph.c create mode 100644 cmph/cmph.h create mode 100644 cmph/cmph_structs.c create mode 100644 cmph/cmph_structs.h create mode 100644 cmph/cmph_time.h create mode 100644 cmph/cmph_types.h create mode 100644 cmph/compressed_rank.c create mode 100644 cmph/compressed_rank.h create mode 100644 cmph/compressed_seq.c create mode 100644 cmph/compressed_seq.h create mode 100644 cmph/debug.h create mode 100644 cmph/djb2_hash.c create mode 100644 cmph/djb2_hash.h create mode 100644 cmph/fch.c create mode 100644 cmph/fch.h create mode 100644 cmph/fch_buckets.c create mode 100644 cmph/fch_buckets.h create mode 100755 cmph/fch_structs.h create mode 100644 cmph/fnv_hash.c create mode 100644 cmph/fnv_hash.h create mode 100644 cmph/graph.c create mode 100644 cmph/graph.h create mode 100644 cmph/hash.c create mode 100644 cmph/hash.h create mode 100644 cmph/hash_state.h create mode 100644 cmph/hashtree.c create mode 100644 cmph/hashtree.h create mode 100644 cmph/hashtree_structs.h create mode 100644 cmph/jenkins_hash.c create mode 100644 cmph/jenkins_hash.h create mode 100644 cmph/main.c create mode 100644 cmph/miller_rabin.c create mode 100644 cmph/miller_rabin.h create mode 100644 cmph/sdbm_hash.c create mode 100644 cmph/sdbm_hash.h create mode 100644 cmph/select.c create mode 100644 cmph/select.h create mode 100644 cmph/select_lookup_tables.h create mode 100644 cmph/vqueue.c create mode 100644 cmph/vqueue.h create mode 100644 cmph/vstack.c create mode 100644 cmph/vstack.h create mode 100644 cmph/wingetopt.c create mode 100644 cmph/wingetopt.h diff --git a/cmph/README-CMPH-IMPORT.txt b/cmph/README-CMPH-IMPORT.txt new file mode 100644 index 000000000..a1c23c246 --- /dev/null +++ b/cmph/README-CMPH-IMPORT.txt @@ -0,0 +1,5 @@ +This import of CMPH was made from revision bfdcc3a3a18dfb9 of +git://cmph.git.sourceforge.net/gitroot/cmph/cmph + +Only the following files were taken, and everything else deleted: +COPYING src/*.[ch] diff --git a/cmph/bdz.c b/cmph/bdz.c new file mode 100755 index 000000000..f422c8f91 --- /dev/null +++ b/cmph/bdz.c @@ -0,0 +1,703 @@ +#include "bdz.h" +#include "cmph_structs.h" +#include "bdz_structs.h" +#include "hash.h" +#include "bitbool.h" + +#include +#include +#include +#include +#include +//#define DEBUG +#include "debug.h" +#define UNASSIGNED 3U +#define NULL_EDGE 0xffffffff + +//cmph_uint32 ngrafos = 0; +//cmph_uint32 ngrafos_aciclicos = 0; +// table used for looking up the number of assigned vertices a 8-bit integer +const cmph_uint8 bdz_lookup_table[] = +{ +4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2, +4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2, +4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2, +3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 2, 2, 2, 1, +4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2, +4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2, +4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2, +3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 2, 2, 2, 1, +4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2, +4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2, +4, 4, 4, 3, 4, 4, 4, 3, 4, 4, 4, 3, 3, 3, 3, 2, +3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 2, 2, 2, 1, +3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 2, 2, 2, 1, +3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 2, 2, 2, 1, +3, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 2, 2, 2, 1, +2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 1, 1, 1, 1, 0 +}; + +typedef struct +{ + cmph_uint32 vertices[3]; + cmph_uint32 next_edges[3]; +}bdz_edge_t; + +typedef cmph_uint32 * bdz_queue_t; + +static void bdz_alloc_queue(bdz_queue_t * queuep, cmph_uint32 nedges) +{ + (*queuep)=malloc(nedges*sizeof(cmph_uint32)); +}; +static void bdz_free_queue(bdz_queue_t * queue) +{ + free(*queue); +}; + +typedef struct +{ + cmph_uint32 nedges; + bdz_edge_t * edges; + cmph_uint32 * first_edge; + cmph_uint8 * vert_degree; +}bdz_graph3_t; + + +static void bdz_alloc_graph3(bdz_graph3_t * graph3, cmph_uint32 nedges, cmph_uint32 nvertices) +{ + graph3->edges=malloc(nedges*sizeof(bdz_edge_t)); + graph3->first_edge=malloc(nvertices*sizeof(cmph_uint32)); + graph3->vert_degree=malloc((size_t)nvertices); +}; +static void bdz_init_graph3(bdz_graph3_t * graph3, cmph_uint32 nedges, cmph_uint32 nvertices) +{ + memset(graph3->first_edge,0xff,nvertices*sizeof(cmph_uint32)); + memset(graph3->vert_degree,0,(size_t)nvertices); + graph3->nedges=0; +}; +static void bdz_free_graph3(bdz_graph3_t *graph3) +{ + free(graph3->edges); + free(graph3->first_edge); + free(graph3->vert_degree); +}; + +static void bdz_partial_free_graph3(bdz_graph3_t *graph3) +{ + free(graph3->first_edge); + free(graph3->vert_degree); + graph3->first_edge = NULL; + graph3->vert_degree = NULL; +}; + +static void bdz_add_edge(bdz_graph3_t * graph3, cmph_uint32 v0, cmph_uint32 v1, cmph_uint32 v2) +{ + graph3->edges[graph3->nedges].vertices[0]=v0; + graph3->edges[graph3->nedges].vertices[1]=v1; + graph3->edges[graph3->nedges].vertices[2]=v2; + graph3->edges[graph3->nedges].next_edges[0]=graph3->first_edge[v0]; + graph3->edges[graph3->nedges].next_edges[1]=graph3->first_edge[v1]; + graph3->edges[graph3->nedges].next_edges[2]=graph3->first_edge[v2]; + graph3->first_edge[v0]=graph3->first_edge[v1]=graph3->first_edge[v2]=graph3->nedges; + graph3->vert_degree[v0]++; + graph3->vert_degree[v1]++; + graph3->vert_degree[v2]++; + graph3->nedges++; +}; + +static void bdz_dump_graph(bdz_graph3_t* graph3, cmph_uint32 nedges, cmph_uint32 nvertices) +{ + int i; + for(i=0;iedges[i].vertices[0], + graph3->edges[i].vertices[1],graph3->edges[i].vertices[2]); + printf(" nexts %d %d %d",graph3->edges[i].next_edges[0], + graph3->edges[i].next_edges[1],graph3->edges[i].next_edges[2]); + }; + + for(i=0;ifirst_edge[i]); + + }; +}; + +static void bdz_remove_edge(bdz_graph3_t * graph3, cmph_uint32 curr_edge) +{ + cmph_uint32 i,j=0,vert,edge1,edge2; + for(i=0;i<3;i++){ + vert=graph3->edges[curr_edge].vertices[i]; + edge1=graph3->first_edge[vert]; + edge2=NULL_EDGE; + while(edge1!=curr_edge&&edge1!=NULL_EDGE){ + edge2=edge1; + if(graph3->edges[edge1].vertices[0]==vert){ + j=0; + } else if(graph3->edges[edge1].vertices[1]==vert){ + j=1; + } else + j=2; + edge1=graph3->edges[edge1].next_edges[j]; + }; + if(edge1==NULL_EDGE){ + printf("\nerror remove edge %d dump graph",curr_edge); + bdz_dump_graph(graph3,graph3->nedges,graph3->nedges+graph3->nedges/4); + exit(-1); + }; + + if(edge2!=NULL_EDGE){ + graph3->edges[edge2].next_edges[j] = + graph3->edges[edge1].next_edges[i]; + } else + graph3->first_edge[vert]= + graph3->edges[edge1].next_edges[i]; + graph3->vert_degree[vert]--; + }; + +}; + +static int bdz_generate_queue(cmph_uint32 nedges, cmph_uint32 nvertices, bdz_queue_t queue, bdz_graph3_t* graph3) +{ + cmph_uint32 i,v0,v1,v2; + cmph_uint32 queue_head=0,queue_tail=0; + cmph_uint32 curr_edge; + cmph_uint32 tmp_edge; + cmph_uint8 * marked_edge =malloc((size_t)(nedges >> 3) + 1); + memset(marked_edge, 0, (size_t)(nedges >> 3) + 1); + + for(i=0;iedges[i].vertices[0]; + v1=graph3->edges[i].vertices[1]; + v2=graph3->edges[i].vertices[2]; + if(graph3->vert_degree[v0]==1 || + graph3->vert_degree[v1]==1 || + graph3->vert_degree[v2]==1){ + if(!GETBIT(marked_edge,i)) { + queue[queue_head++]=i; + SETBIT(marked_edge,i); + } + }; + }; + while(queue_tail!=queue_head){ + curr_edge=queue[queue_tail++]; + bdz_remove_edge(graph3,curr_edge); + v0=graph3->edges[curr_edge].vertices[0]; + v1=graph3->edges[curr_edge].vertices[1]; + v2=graph3->edges[curr_edge].vertices[2]; + if(graph3->vert_degree[v0]==1 ) { + tmp_edge=graph3->first_edge[v0]; + if(!GETBIT(marked_edge,tmp_edge)) { + queue[queue_head++]=tmp_edge; + SETBIT(marked_edge,tmp_edge); + }; + + }; + if(graph3->vert_degree[v1]==1) { + tmp_edge=graph3->first_edge[v1]; + if(!GETBIT(marked_edge,tmp_edge)){ + queue[queue_head++]=tmp_edge; + SETBIT(marked_edge,tmp_edge); + }; + + }; + if(graph3->vert_degree[v2]==1){ + tmp_edge=graph3->first_edge[v2]; + if(!GETBIT(marked_edge,tmp_edge)){ + queue[queue_head++]=tmp_edge; + SETBIT(marked_edge,tmp_edge); + }; + }; + }; + free(marked_edge); + return (int)(queue_head-nedges);/* returns 0 if successful otherwies return negative number*/ +}; + +static int bdz_mapping(cmph_config_t *mph, bdz_graph3_t* graph3, bdz_queue_t queue); +static void assigning(bdz_config_data_t *bdz, bdz_graph3_t* graph3, bdz_queue_t queue); +static void ranking(bdz_config_data_t *bdz); +static cmph_uint32 rank(cmph_uint32 b, cmph_uint32 * ranktable, cmph_uint8 * g, cmph_uint32 vertex); + +bdz_config_data_t *bdz_config_new() +{ + bdz_config_data_t *bdz; + bdz = (bdz_config_data_t *)malloc(sizeof(bdz_config_data_t)); + assert(bdz); + memset(bdz, 0, sizeof(bdz_config_data_t)); + bdz->hashfunc = CMPH_HASH_JENKINS; + bdz->g = NULL; + bdz->hl = NULL; + bdz->k = 0; //kth index in ranktable, $k = log_2(n=3r)/\varepsilon$ + bdz->b = 7; // number of bits of k + bdz->ranktablesize = 0; //number of entries in ranktable, $n/k +1$ + bdz->ranktable = NULL; // rank table + return bdz; +} + +void bdz_config_destroy(cmph_config_t *mph) +{ + bdz_config_data_t *data = (bdz_config_data_t *)mph->data; + DEBUGP("Destroying algorithm dependent data\n"); + free(data); +} + +void bdz_config_set_b(cmph_config_t *mph, cmph_uint32 b) +{ + bdz_config_data_t *bdz = (bdz_config_data_t *)mph->data; + if (b <= 2 || b > 10) b = 7; // validating restrictions over parameter b. + bdz->b = (cmph_uint8)b; + DEBUGP("b: %u\n", b); + +} + +void bdz_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs) +{ + bdz_config_data_t *bdz = (bdz_config_data_t *)mph->data; + CMPH_HASH *hashptr = hashfuncs; + cmph_uint32 i = 0; + while(*hashptr != CMPH_HASH_COUNT) + { + if (i >= 1) break; //bdz only uses one linear hash function + bdz->hashfunc = *hashptr; + ++i, ++hashptr; + } +} + +cmph_t *bdz_new(cmph_config_t *mph, double c) +{ + cmph_t *mphf = NULL; + bdz_data_t *bdzf = NULL; + cmph_uint32 iterations; + bdz_queue_t edges; + bdz_graph3_t graph3; + bdz_config_data_t *bdz = (bdz_config_data_t *)mph->data; + #ifdef CMPH_TIMING + double construction_time_begin = 0.0; + double construction_time = 0.0; + ELAPSED_TIME_IN_SECONDS(&construction_time_begin); + #endif + + + if (c == 0) c = 1.23; // validating restrictions over parameter c. + DEBUGP("c: %f\n", c); + bdz->m = mph->key_source->nkeys; + bdz->r = (cmph_uint32)ceil((c * mph->key_source->nkeys)/3); + if ((bdz->r % 2) == 0) bdz->r+=1; + bdz->n = 3*bdz->r; + + bdz->k = (1U << bdz->b); + DEBUGP("b: %u -- k: %u\n", bdz->b, bdz->k); + + bdz->ranktablesize = (cmph_uint32)ceil(bdz->n/(double)bdz->k); + DEBUGP("ranktablesize: %u\n", bdz->ranktablesize); + + + bdz_alloc_graph3(&graph3, bdz->m, bdz->n); + bdz_alloc_queue(&edges,bdz->m); + DEBUGP("Created hypergraph\n"); + + DEBUGP("m (edges): %u n (vertices): %u r: %u c: %f \n", bdz->m, bdz->n, bdz->r, c); + + // Mapping step + iterations = 1000; + if (mph->verbosity) + { + fprintf(stderr, "Entering mapping step for mph creation of %u keys with graph sized %u\n", bdz->m, bdz->n); + } + while(1) + { + int ok; + DEBUGP("linear hash function \n"); + bdz->hl = hash_state_new(bdz->hashfunc, 15); + + ok = bdz_mapping(mph, &graph3, edges); + //ok = 0; + if (!ok) + { + --iterations; + hash_state_destroy(bdz->hl); + bdz->hl = NULL; + DEBUGP("%u iterations remaining\n", iterations); + if (mph->verbosity) + { + fprintf(stderr, "acyclic graph creation failure - %u iterations remaining\n", iterations); + } + if (iterations == 0) break; + } + else break; + } + + if (iterations == 0) + { + bdz_free_queue(&edges); + bdz_free_graph3(&graph3); + return NULL; + } + bdz_partial_free_graph3(&graph3); + // Assigning step + if (mph->verbosity) + { + fprintf(stderr, "Entering assigning step for mph creation of %u keys with graph sized %u\n", bdz->m, bdz->n); + } + assigning(bdz, &graph3, edges); + + bdz_free_queue(&edges); + bdz_free_graph3(&graph3); + if (mph->verbosity) + { + fprintf(stderr, "Entering ranking step for mph creation of %u keys with graph sized %u\n", bdz->m, bdz->n); + } + ranking(bdz); + #ifdef CMPH_TIMING + ELAPSED_TIME_IN_SECONDS(&construction_time); + #endif + mphf = (cmph_t *)malloc(sizeof(cmph_t)); + mphf->algo = mph->algo; + bdzf = (bdz_data_t *)malloc(sizeof(bdz_data_t)); + bdzf->g = bdz->g; + bdz->g = NULL; //transfer memory ownership + bdzf->hl = bdz->hl; + bdz->hl = NULL; //transfer memory ownership + bdzf->ranktable = bdz->ranktable; + bdz->ranktable = NULL; //transfer memory ownership + bdzf->ranktablesize = bdz->ranktablesize; + bdzf->k = bdz->k; + bdzf->b = bdz->b; + bdzf->n = bdz->n; + bdzf->m = bdz->m; + bdzf->r = bdz->r; + mphf->data = bdzf; + mphf->size = bdz->m; + + DEBUGP("Successfully generated minimal perfect hash\n"); + if (mph->verbosity) + { + fprintf(stderr, "Successfully generated minimal perfect hash function\n"); + } + + + #ifdef CMPH_TIMING + register cmph_uint32 space_usage = bdz_packed_size(mphf)*8; + register cmph_uint32 keys_per_bucket = 1; + construction_time = construction_time - construction_time_begin; + fprintf(stdout, "%u\t%.2f\t%u\t%.4f\t%.4f\n", bdz->m, bdz->m/(double)bdz->n, keys_per_bucket, construction_time, space_usage/(double)bdz->m); + #endif + + return mphf; +} + + +static int bdz_mapping(cmph_config_t *mph, bdz_graph3_t* graph3, bdz_queue_t queue) +{ + cmph_uint32 e; + int cycles = 0; + cmph_uint32 hl[3]; + bdz_config_data_t *bdz = (bdz_config_data_t *)mph->data; + bdz_init_graph3(graph3, bdz->m, bdz->n); + mph->key_source->rewind(mph->key_source->data); + for (e = 0; e < mph->key_source->nkeys; ++e) + { + cmph_uint32 h0, h1, h2; + cmph_uint32 keylen; + char *key = NULL; + mph->key_source->read(mph->key_source->data, &key, &keylen); + hash_vector(bdz->hl, key, keylen,hl); + h0 = hl[0] % bdz->r; + h1 = hl[1] % bdz->r + bdz->r; + h2 = hl[2] % bdz->r + (bdz->r << 1); + mph->key_source->dispose(mph->key_source->data, key, keylen); + bdz_add_edge(graph3,h0,h1,h2); + } + cycles = bdz_generate_queue(bdz->m, bdz->n, queue, graph3); + return (cycles == 0); +} + +static void assigning(bdz_config_data_t *bdz, bdz_graph3_t* graph3, bdz_queue_t queue) +{ + cmph_uint32 i; + cmph_uint32 nedges=graph3->nedges; + cmph_uint32 curr_edge; + cmph_uint32 v0,v1,v2; + cmph_uint8 * marked_vertices =malloc((size_t)(bdz->n >> 3) + 1); + cmph_uint32 sizeg = (cmph_uint32)ceil(bdz->n/4.0); + bdz->g = (cmph_uint8 *)calloc((size_t)(sizeg), sizeof(cmph_uint8)); + memset(marked_vertices, 0, (size_t)(bdz->n >> 3) + 1); + memset(bdz->g, 0xff, (size_t)(sizeg)); + + for(i=nedges-1;i+1>=1;i--){ + curr_edge=queue[i]; + v0=graph3->edges[curr_edge].vertices[0]; + v1=graph3->edges[curr_edge].vertices[1]; + v2=graph3->edges[curr_edge].vertices[2]; + DEBUGP("B:%u %u %u -- %u %u %u\n", v0, v1, v2, GETVALUE(bdz->g, v0), GETVALUE(bdz->g, v1), GETVALUE(bdz->g, v2)); + if(!GETBIT(marked_vertices, v0)){ + if(!GETBIT(marked_vertices,v1)) + { + SETVALUE1(bdz->g, v1, UNASSIGNED); + SETBIT(marked_vertices, v1); + } + if(!GETBIT(marked_vertices,v2)) + { + SETVALUE1(bdz->g, v2, UNASSIGNED); + SETBIT(marked_vertices, v2); + } + SETVALUE1(bdz->g, v0, (6-(GETVALUE(bdz->g, v1) + GETVALUE(bdz->g,v2)))%3); + SETBIT(marked_vertices, v0); + } else if(!GETBIT(marked_vertices, v1)) { + if(!GETBIT(marked_vertices, v2)) + { + SETVALUE1(bdz->g, v2, UNASSIGNED); + SETBIT(marked_vertices, v2); + } + SETVALUE1(bdz->g, v1, (7-(GETVALUE(bdz->g, v0)+GETVALUE(bdz->g, v2)))%3); + SETBIT(marked_vertices, v1); + }else { + SETVALUE1(bdz->g, v2, (8-(GETVALUE(bdz->g,v0)+GETVALUE(bdz->g, v1)))%3); + SETBIT(marked_vertices, v2); + } + DEBUGP("A:%u %u %u -- %u %u %u\n", v0, v1, v2, GETVALUE(bdz->g, v0), GETVALUE(bdz->g, v1), GETVALUE(bdz->g, v2)); + }; + free(marked_vertices); +} + + +static void ranking(bdz_config_data_t *bdz) +{ + cmph_uint32 i, j, offset = 0U, count = 0U, size = (bdz->k >> 2U), nbytes_total = (cmph_uint32)ceil(bdz->n/4.0), nbytes; + bdz->ranktable = (cmph_uint32 *)calloc((size_t)bdz->ranktablesize, sizeof(cmph_uint32)); + // ranktable computation + bdz->ranktable[0] = 0; + i = 1; + while(1) + { + if(i == bdz->ranktablesize) break; + nbytes = size < nbytes_total? size : nbytes_total; + for(j = 0; j < nbytes; j++) + { + count += bdz_lookup_table[*(bdz->g + offset + j)]; + } + bdz->ranktable[i] = count; + offset += nbytes; + nbytes_total -= size; + i++; + } +} + + +int bdz_dump(cmph_t *mphf, FILE *fd) +{ + char *buf = NULL; + cmph_uint32 buflen; + register size_t nbytes; + bdz_data_t *data = (bdz_data_t *)mphf->data; + __cmph_dump(mphf, fd); + + hash_state_dump(data->hl, &buf, &buflen); + DEBUGP("Dumping hash state with %u bytes to disk\n", buflen); + nbytes = fwrite(&buflen, sizeof(cmph_uint32), (size_t)1, fd); + nbytes = fwrite(buf, (size_t)buflen, (size_t)1, fd); + free(buf); + + nbytes = fwrite(&(data->n), sizeof(cmph_uint32), (size_t)1, fd); + nbytes = fwrite(&(data->m), sizeof(cmph_uint32), (size_t)1, fd); + nbytes = fwrite(&(data->r), sizeof(cmph_uint32), (size_t)1, fd); + + cmph_uint32 sizeg = (cmph_uint32)ceil(data->n/4.0); + nbytes = fwrite(data->g, sizeof(cmph_uint8)*sizeg, (size_t)1, fd); + + nbytes = fwrite(&(data->k), sizeof(cmph_uint32), (size_t)1, fd); + nbytes = fwrite(&(data->b), sizeof(cmph_uint8), (size_t)1, fd); + nbytes = fwrite(&(data->ranktablesize), sizeof(cmph_uint32), (size_t)1, fd); + + nbytes = fwrite(data->ranktable, sizeof(cmph_uint32)*(data->ranktablesize), (size_t)1, fd); + #ifdef DEBUG + cmph_uint32 i; + fprintf(stderr, "G: "); + for (i = 0; i < data->n; ++i) fprintf(stderr, "%u ", GETVALUE(data->g, i)); + fprintf(stderr, "\n"); + #endif + return 1; +} + +void bdz_load(FILE *f, cmph_t *mphf) +{ + char *buf = NULL; + cmph_uint32 buflen, sizeg; + register size_t nbytes; + bdz_data_t *bdz = (bdz_data_t *)malloc(sizeof(bdz_data_t)); + + DEBUGP("Loading bdz mphf\n"); + mphf->data = bdz; + + nbytes = fread(&buflen, sizeof(cmph_uint32), (size_t)1, f); + DEBUGP("Hash state has %u bytes\n", buflen); + buf = (char *)malloc((size_t)buflen); + nbytes = fread(buf, (size_t)buflen, (size_t)1, f); + bdz->hl = hash_state_load(buf, buflen); + free(buf); + + + DEBUGP("Reading m and n\n"); + nbytes = fread(&(bdz->n), sizeof(cmph_uint32), (size_t)1, f); + nbytes = fread(&(bdz->m), sizeof(cmph_uint32), (size_t)1, f); + nbytes = fread(&(bdz->r), sizeof(cmph_uint32), (size_t)1, f); + sizeg = (cmph_uint32)ceil(bdz->n/4.0); + bdz->g = (cmph_uint8 *)calloc((size_t)(sizeg), sizeof(cmph_uint8)); + nbytes = fread(bdz->g, sizeg*sizeof(cmph_uint8), (size_t)1, f); + + nbytes = fread(&(bdz->k), sizeof(cmph_uint32), (size_t)1, f); + nbytes = fread(&(bdz->b), sizeof(cmph_uint8), (size_t)1, f); + nbytes = fread(&(bdz->ranktablesize), sizeof(cmph_uint32), (size_t)1, f); + + bdz->ranktable = (cmph_uint32 *)calloc((size_t)bdz->ranktablesize, sizeof(cmph_uint32)); + nbytes = fread(bdz->ranktable, sizeof(cmph_uint32)*(bdz->ranktablesize), (size_t)1, f); + + #ifdef DEBUG + cmph_uint32 i = 0; + fprintf(stderr, "G: "); + for (i = 0; i < bdz->n; ++i) fprintf(stderr, "%u ", GETVALUE(bdz->g,i)); + fprintf(stderr, "\n"); + #endif + return; +} + + +cmph_uint32 bdz_search_ph(cmph_t *mphf, const char *key, cmph_uint32 keylen) +{ + bdz_data_t *bdz = mphf->data; + cmph_uint32 hl[3]; + hash_vector(bdz->hl, key, keylen, hl); + cmph_uint32 vertex; + hl[0] = hl[0] % bdz->r; + hl[1] = hl[1] % bdz->r + bdz->r; + hl[2] = hl[2] % bdz->r + (bdz->r << 1); + vertex = hl[(GETVALUE(bdz->g, hl[0]) + GETVALUE(bdz->g, hl[1]) + GETVALUE(bdz->g, hl[2])) % 3]; + return vertex; +} + +static inline cmph_uint32 rank(cmph_uint32 b, cmph_uint32 * ranktable, cmph_uint8 * g, cmph_uint32 vertex) +{ + register cmph_uint32 index = vertex >> b; + register cmph_uint32 base_rank = ranktable[index]; + register cmph_uint32 beg_idx_v = index << b; + register cmph_uint32 beg_idx_b = beg_idx_v >> 2; + register cmph_uint32 end_idx_b = vertex >> 2; + while(beg_idx_b < end_idx_b) + { + base_rank += bdz_lookup_table[*(g + beg_idx_b++)]; + + } + beg_idx_v = beg_idx_b << 2; + while(beg_idx_v < vertex) + { + if(GETVALUE(g, beg_idx_v) != UNASSIGNED) base_rank++; + beg_idx_v++; + } + + return base_rank; +} + +cmph_uint32 bdz_search(cmph_t *mphf, const char *key, cmph_uint32 keylen) +{ + register cmph_uint32 vertex; + register bdz_data_t *bdz = mphf->data; + cmph_uint32 hl[3]; + hash_vector(bdz->hl, key, keylen, hl); + hl[0] = hl[0] % bdz->r; + hl[1] = hl[1] % bdz->r + bdz->r; + hl[2] = hl[2] % bdz->r + (bdz->r << 1); + vertex = hl[(GETVALUE(bdz->g, hl[0]) + GETVALUE(bdz->g, hl[1]) + GETVALUE(bdz->g, hl[2])) % 3]; + return rank(bdz->b, bdz->ranktable, bdz->g, vertex); +} + + +void bdz_destroy(cmph_t *mphf) +{ + bdz_data_t *data = (bdz_data_t *)mphf->data; + free(data->g); + hash_state_destroy(data->hl); + free(data->ranktable); + free(data); + free(mphf); +} + +/** \fn void bdz_pack(cmph_t *mphf, void *packed_mphf); + * \brief Support the ability to pack a perfect hash function into a preallocated contiguous memory space pointed by packed_mphf. + * \param mphf pointer to the resulting mphf + * \param packed_mphf pointer to the contiguous memory area used to store the resulting mphf. The size of packed_mphf must be at least cmph_packed_size() + */ +void bdz_pack(cmph_t *mphf, void *packed_mphf) +{ + bdz_data_t *data = (bdz_data_t *)mphf->data; + cmph_uint8 * ptr = packed_mphf; + + // packing hl type + CMPH_HASH hl_type = hash_get_type(data->hl); + *((cmph_uint32 *) ptr) = hl_type; + ptr += sizeof(cmph_uint32); + + // packing hl + hash_state_pack(data->hl, ptr); + ptr += hash_state_packed_size(hl_type); + + // packing r + *((cmph_uint32 *) ptr) = data->r; + ptr += sizeof(data->r); + + // packing ranktablesize + *((cmph_uint32 *) ptr) = data->ranktablesize; + ptr += sizeof(data->ranktablesize); + + // packing ranktable + memcpy(ptr, data->ranktable, sizeof(cmph_uint32)*(data->ranktablesize)); + ptr += sizeof(cmph_uint32)*(data->ranktablesize); + + // packing b + *ptr++ = data->b; + + // packing g + cmph_uint32 sizeg = (cmph_uint32)ceil(data->n/4.0); + memcpy(ptr, data->g, sizeof(cmph_uint8)*sizeg); +} + +/** \fn cmph_uint32 bdz_packed_size(cmph_t *mphf); + * \brief Return the amount of space needed to pack mphf. + * \param mphf pointer to a mphf + * \return the size of the packed function or zero for failures + */ +cmph_uint32 bdz_packed_size(cmph_t *mphf) +{ + bdz_data_t *data = (bdz_data_t *)mphf->data; + + CMPH_HASH hl_type = hash_get_type(data->hl); + + return (cmph_uint32)(sizeof(CMPH_ALGO) + hash_state_packed_size(hl_type) + 3*sizeof(cmph_uint32) + sizeof(cmph_uint32)*(data->ranktablesize) + sizeof(cmph_uint8) + sizeof(cmph_uint8)* (cmph_uint32)(ceil(data->n/4.0))); +} + +/** cmph_uint32 bdz_search(void *packed_mphf, const char *key, cmph_uint32 keylen); + * \brief Use the packed mphf to do a search. + * \param packed_mphf pointer to the packed mphf + * \param key key to be hashed + * \param keylen key legth in bytes + * \return The mphf value + */ +cmph_uint32 bdz_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen) +{ + + register cmph_uint32 vertex; + register CMPH_HASH hl_type = *(cmph_uint32 *)packed_mphf; + register cmph_uint8 *hl_ptr = (cmph_uint8 *)(packed_mphf) + 4; + + register cmph_uint32 *ranktable = (cmph_uint32*)(hl_ptr + hash_state_packed_size(hl_type)); + + register cmph_uint32 r = *ranktable++; + register cmph_uint32 ranktablesize = *ranktable++; + register cmph_uint8 * g = (cmph_uint8 *)(ranktable + ranktablesize); + register cmph_uint8 b = *g++; + + cmph_uint32 hl[3]; + hash_vector_packed(hl_ptr, hl_type, key, keylen, hl); + hl[0] = hl[0] % r; + hl[1] = hl[1] % r + r; + hl[2] = hl[2] % r + (r << 1); + vertex = hl[(GETVALUE(g, hl[0]) + GETVALUE(g, hl[1]) + GETVALUE(g, hl[2])) % 3]; + return rank(b, ranktable, g, vertex); +} diff --git a/cmph/bdz.h b/cmph/bdz.h new file mode 100755 index 000000000..f2b7b89c2 --- /dev/null +++ b/cmph/bdz.h @@ -0,0 +1,43 @@ +#ifndef __CMPH_BDZ_H__ +#define __CMPH_BDZ_H__ + +#include "cmph.h" + +typedef struct __bdz_data_t bdz_data_t; +typedef struct __bdz_config_data_t bdz_config_data_t; + +bdz_config_data_t *bdz_config_new(); +void bdz_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs); +void bdz_config_destroy(cmph_config_t *mph); +void bdz_config_set_b(cmph_config_t *mph, cmph_uint32 b); +cmph_t *bdz_new(cmph_config_t *mph, double c); + +void bdz_load(FILE *f, cmph_t *mphf); +int bdz_dump(cmph_t *mphf, FILE *f); +void bdz_destroy(cmph_t *mphf); +cmph_uint32 bdz_search(cmph_t *mphf, const char *key, cmph_uint32 keylen); + +/** \fn void bdz_pack(cmph_t *mphf, void *packed_mphf); + * \brief Support the ability to pack a perfect hash function into a preallocated contiguous memory space pointed by packed_mphf. + * \param mphf pointer to the resulting mphf + * \param packed_mphf pointer to the contiguous memory area used to store the resulting mphf. The size of packed_mphf must be at least cmph_packed_size() + */ +void bdz_pack(cmph_t *mphf, void *packed_mphf); + +/** \fn cmph_uint32 bdz_packed_size(cmph_t *mphf); + * \brief Return the amount of space needed to pack mphf. + * \param mphf pointer to a mphf + * \return the size of the packed function or zero for failures + */ +cmph_uint32 bdz_packed_size(cmph_t *mphf); + +/** cmph_uint32 bdz_search(void *packed_mphf, const char *key, cmph_uint32 keylen); + * \brief Use the packed mphf to do a search. + * \param packed_mphf pointer to the packed mphf + * \param key key to be hashed + * \param keylen key legth in bytes + * \return The mphf value + */ +cmph_uint32 bdz_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen); + +#endif diff --git a/cmph/bdz_gen_lookup_table.c b/cmph/bdz_gen_lookup_table.c new file mode 100755 index 000000000..b8f66068d --- /dev/null +++ b/cmph/bdz_gen_lookup_table.c @@ -0,0 +1,33 @@ +#include +#include +#include +void help(char * prname) +{ + fprintf(stderr, "USE: %s \n", prname); + exit(1); +} + +int main(int argc, char ** argv) +{ + if(argc != 3) help(argv[0]); + int n = atoi(argv[1]); + int wordsize = (atoi(argv[2]) >> 1); + int i, j, n_assigned; + for(i = 0; i < n; i++) + { + int num = i; + n_assigned = 0; + for(j = 0; j < wordsize; j++) + { + if ((num & 0x0003) != 3) + { + n_assigned++; + //fprintf(stderr, "num:%d\n", num); + } + num = num >> 2; + } + if(i%16 == 0) fprintf(stderr, "\n"); + fprintf(stderr, "%d, ", n_assigned); + } + fprintf(stderr, "\n"); +} diff --git a/cmph/bdz_ph.c b/cmph/bdz_ph.c new file mode 100755 index 000000000..49cf56463 --- /dev/null +++ b/cmph/bdz_ph.c @@ -0,0 +1,621 @@ +#include "bdz_ph.h" +#include "cmph_structs.h" +#include "bdz_structs_ph.h" +#include "hash.h" +#include "bitbool.h" + +#include +#include +#include +#include +#include +//#define DEBUG +#include "debug.h" +#define UNASSIGNED 3 +#define NULL_EDGE 0xffffffff + + +static cmph_uint8 pow3_table[5] = {1,3,9,27,81}; +static cmph_uint8 lookup_table[5][256] = { + {0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0}, + {0, 0, 0, 1, 1, 1, 2, 2, 2, 0, 0, 0, 1, 1, 1, 2, 2, 2, 0, 0, 0, 1, 1, 1, 2, 2, 2, 0, 0, 0, 1, 1, 1, 2, 2, 2, 0, 0, 0, 1, 1, 1, 2, 2, 2, 0, 0, 0, 1, 1, 1, 2, 2, 2, 0, 0, 0, 1, 1, 1, 2, 2, 2, 0, 0, 0, 1, 1, 1, 2, 2, 2, 0, 0, 0, 1, 1, 1, 2, 2, 2, 0, 0, 0, 1, 1, 1, 2, 2, 2, 0, 0, 0, 1, 1, 1, 2, 2, 2, 0, 0, 0, 1, 1, 1, 2, 2, 2, 0, 0, 0, 1, 1, 1, 2, 2, 2, 0, 0, 0, 1, 1, 1, 2, 2, 2, 0, 0, 0, 1, 1, 1, 2, 2, 2, 0, 0, 0, 1, 1, 1, 2, 2, 2, 0, 0, 0, 1, 1, 1, 2, 2, 2, 0, 0, 0, 1, 1, 1, 2, 2, 2, 0, 0, 0, 1, 1, 1, 2, 2, 2, 0, 0, 0, 1, 1, 1, 2, 2, 2, 0, 0, 0, 1, 1, 1, 2, 2, 2, 0, 0, 0, 1, 1, 1, 2, 2, 2, 0, 0, 0, 1, 1, 1, 2, 2, 2, 0, 0, 0, 1, 1, 1, 2, 2, 2, 0, 0, 0, 1, 1, 1, 2, 2, 2, 0, 0, 0, 1, 1, 1, 2, 2, 2, 0, 0, 0, 1, 1, 1, 2, 2, 2, 0, 0, 0, 1, 1, 1, 2, 2, 2, 0, 0, 0, 1}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, +}; + +typedef struct +{ + cmph_uint32 vertices[3]; + cmph_uint32 next_edges[3]; +}bdz_ph_edge_t; + +typedef cmph_uint32 * bdz_ph_queue_t; + +static void bdz_ph_alloc_queue(bdz_ph_queue_t * queuep, cmph_uint32 nedges) +{ + (*queuep)=malloc(nedges*sizeof(cmph_uint32)); +}; +static void bdz_ph_free_queue(bdz_ph_queue_t * queue) +{ + free(*queue); +}; + +typedef struct +{ + cmph_uint32 nedges; + bdz_ph_edge_t * edges; + cmph_uint32 * first_edge; + cmph_uint8 * vert_degree; +}bdz_ph_graph3_t; + + +static void bdz_ph_alloc_graph3(bdz_ph_graph3_t * graph3, cmph_uint32 nedges, cmph_uint32 nvertices) +{ + graph3->edges=malloc(nedges*sizeof(bdz_ph_edge_t)); + graph3->first_edge=malloc(nvertices*sizeof(cmph_uint32)); + graph3->vert_degree=malloc((size_t)nvertices); +}; +static void bdz_ph_init_graph3(bdz_ph_graph3_t * graph3, cmph_uint32 nedges, cmph_uint32 nvertices) +{ + memset(graph3->first_edge,0xff,nvertices*sizeof(cmph_uint32)); + memset(graph3->vert_degree,0,(size_t)nvertices); + graph3->nedges=0; +}; +static void bdz_ph_free_graph3(bdz_ph_graph3_t *graph3) +{ + free(graph3->edges); + free(graph3->first_edge); + free(graph3->vert_degree); +}; + +static void bdz_ph_partial_free_graph3(bdz_ph_graph3_t *graph3) +{ + free(graph3->first_edge); + free(graph3->vert_degree); + graph3->first_edge = NULL; + graph3->vert_degree = NULL; +}; + +static void bdz_ph_add_edge(bdz_ph_graph3_t * graph3, cmph_uint32 v0, cmph_uint32 v1, cmph_uint32 v2) +{ + graph3->edges[graph3->nedges].vertices[0]=v0; + graph3->edges[graph3->nedges].vertices[1]=v1; + graph3->edges[graph3->nedges].vertices[2]=v2; + graph3->edges[graph3->nedges].next_edges[0]=graph3->first_edge[v0]; + graph3->edges[graph3->nedges].next_edges[1]=graph3->first_edge[v1]; + graph3->edges[graph3->nedges].next_edges[2]=graph3->first_edge[v2]; + graph3->first_edge[v0]=graph3->first_edge[v1]=graph3->first_edge[v2]=graph3->nedges; + graph3->vert_degree[v0]++; + graph3->vert_degree[v1]++; + graph3->vert_degree[v2]++; + graph3->nedges++; +}; + +static void bdz_ph_dump_graph(bdz_ph_graph3_t* graph3, cmph_uint32 nedges, cmph_uint32 nvertices) +{ + int i; + for(i=0;iedges[i].vertices[0], + graph3->edges[i].vertices[1],graph3->edges[i].vertices[2]); + printf(" nexts %d %d %d",graph3->edges[i].next_edges[0], + graph3->edges[i].next_edges[1],graph3->edges[i].next_edges[2]); + }; + + for(i=0;ifirst_edge[i]); + + }; +}; + +static void bdz_ph_remove_edge(bdz_ph_graph3_t * graph3, cmph_uint32 curr_edge) +{ + cmph_uint32 i,j=0,vert,edge1,edge2; + for(i=0;i<3;i++){ + vert=graph3->edges[curr_edge].vertices[i]; + edge1=graph3->first_edge[vert]; + edge2=NULL_EDGE; + while(edge1!=curr_edge&&edge1!=NULL_EDGE){ + edge2=edge1; + if(graph3->edges[edge1].vertices[0]==vert){ + j=0; + } else if(graph3->edges[edge1].vertices[1]==vert){ + j=1; + } else + j=2; + edge1=graph3->edges[edge1].next_edges[j]; + }; + if(edge1==NULL_EDGE){ + printf("\nerror remove edge %d dump graph",curr_edge); + bdz_ph_dump_graph(graph3,graph3->nedges,graph3->nedges+graph3->nedges/4); + exit(-1); + }; + + if(edge2!=NULL_EDGE){ + graph3->edges[edge2].next_edges[j] = + graph3->edges[edge1].next_edges[i]; + } else + graph3->first_edge[vert]= + graph3->edges[edge1].next_edges[i]; + graph3->vert_degree[vert]--; + }; + +}; + +static int bdz_ph_generate_queue(cmph_uint32 nedges, cmph_uint32 nvertices, bdz_ph_queue_t queue, bdz_ph_graph3_t* graph3) +{ + cmph_uint32 i,v0,v1,v2; + cmph_uint32 queue_head=0,queue_tail=0; + cmph_uint32 curr_edge; + cmph_uint32 tmp_edge; + cmph_uint8 * marked_edge =malloc((size_t)(nedges >> 3) + 1); + memset(marked_edge, 0, (size_t)(nedges >> 3) + 1); + + for(i=0;iedges[i].vertices[0]; + v1=graph3->edges[i].vertices[1]; + v2=graph3->edges[i].vertices[2]; + if(graph3->vert_degree[v0]==1 || + graph3->vert_degree[v1]==1 || + graph3->vert_degree[v2]==1){ + if(!GETBIT(marked_edge,i)) { + queue[queue_head++]=i; + SETBIT(marked_edge,i); + } + }; + }; + while(queue_tail!=queue_head){ + curr_edge=queue[queue_tail++]; + bdz_ph_remove_edge(graph3,curr_edge); + v0=graph3->edges[curr_edge].vertices[0]; + v1=graph3->edges[curr_edge].vertices[1]; + v2=graph3->edges[curr_edge].vertices[2]; + if(graph3->vert_degree[v0]==1 ) { + tmp_edge=graph3->first_edge[v0]; + if(!GETBIT(marked_edge,tmp_edge)) { + queue[queue_head++]=tmp_edge; + SETBIT(marked_edge,tmp_edge); + }; + + }; + if(graph3->vert_degree[v1]==1) { + tmp_edge=graph3->first_edge[v1]; + if(!GETBIT(marked_edge,tmp_edge)){ + queue[queue_head++]=tmp_edge; + SETBIT(marked_edge,tmp_edge); + }; + + }; + if(graph3->vert_degree[v2]==1){ + tmp_edge=graph3->first_edge[v2]; + if(!GETBIT(marked_edge,tmp_edge)){ + queue[queue_head++]=tmp_edge; + SETBIT(marked_edge,tmp_edge); + }; + }; + }; + free(marked_edge); + return (int)queue_head - (int)nedges;/* returns 0 if successful otherwies return negative number*/ +}; + +static int bdz_ph_mapping(cmph_config_t *mph, bdz_ph_graph3_t* graph3, bdz_ph_queue_t queue); +static void assigning(bdz_ph_config_data_t *bdz_ph, bdz_ph_graph3_t* graph3, bdz_ph_queue_t queue); +static void bdz_ph_optimization(bdz_ph_config_data_t *bdz_ph); + +bdz_ph_config_data_t *bdz_ph_config_new() +{ + bdz_ph_config_data_t *bdz_ph; + bdz_ph = (bdz_ph_config_data_t *)malloc(sizeof(bdz_ph_config_data_t)); + assert(bdz_ph); + memset(bdz_ph, 0, sizeof(bdz_ph_config_data_t)); + bdz_ph->hashfunc = CMPH_HASH_JENKINS; + bdz_ph->g = NULL; + bdz_ph->hl = NULL; + return bdz_ph; +} + +void bdz_ph_config_destroy(cmph_config_t *mph) +{ + bdz_ph_config_data_t *data = (bdz_ph_config_data_t *)mph->data; + DEBUGP("Destroying algorithm dependent data\n"); + free(data); +} + +void bdz_ph_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs) +{ + bdz_ph_config_data_t *bdz_ph = (bdz_ph_config_data_t *)mph->data; + CMPH_HASH *hashptr = hashfuncs; + cmph_uint32 i = 0; + while(*hashptr != CMPH_HASH_COUNT) + { + if (i >= 1) break; //bdz_ph only uses one linear hash function + bdz_ph->hashfunc = *hashptr; + ++i, ++hashptr; + } +} + +cmph_t *bdz_ph_new(cmph_config_t *mph, double c) +{ + cmph_t *mphf = NULL; + bdz_ph_data_t *bdz_phf = NULL; + cmph_uint32 iterations; + bdz_ph_queue_t edges; + bdz_ph_graph3_t graph3; + bdz_ph_config_data_t *bdz_ph = (bdz_ph_config_data_t *)mph->data; + #ifdef CMPH_TIMING + double construction_time_begin = 0.0; + double construction_time = 0.0; + ELAPSED_TIME_IN_SECONDS(&construction_time_begin); + #endif + + + if (c == 0) c = 1.23; // validating restrictions over parameter c. + DEBUGP("c: %f\n", c); + bdz_ph->m = mph->key_source->nkeys; + bdz_ph->r = (cmph_uint32)ceil((c * mph->key_source->nkeys)/3); + if ((bdz_ph->r % 2) == 0) bdz_ph->r += 1; + bdz_ph->n = 3*bdz_ph->r; + + + bdz_ph_alloc_graph3(&graph3, bdz_ph->m, bdz_ph->n); + bdz_ph_alloc_queue(&edges,bdz_ph->m); + DEBUGP("Created hypergraph\n"); + + DEBUGP("m (edges): %u n (vertices): %u r: %u c: %f \n", bdz_ph->m, bdz_ph->n, bdz_ph->r, c); + + // Mapping step + iterations = 100; + if (mph->verbosity) + { + fprintf(stderr, "Entering mapping step for mph creation of %u keys with graph sized %u\n", bdz_ph->m, bdz_ph->n); + } + while(1) + { + int ok; + DEBUGP("linear hash function \n"); + bdz_ph->hl = hash_state_new(bdz_ph->hashfunc, 15); + + ok = bdz_ph_mapping(mph, &graph3, edges); + if (!ok) + { + --iterations; + hash_state_destroy(bdz_ph->hl); + bdz_ph->hl = NULL; + DEBUGP("%u iterations remaining\n", iterations); + if (mph->verbosity) + { + fprintf(stderr, "acyclic graph creation failure - %u iterations remaining\n", iterations); + } + if (iterations == 0) break; + } + else break; + } + + if (iterations == 0) + { +// free(bdz_ph->g); + bdz_ph_free_queue(&edges); + bdz_ph_free_graph3(&graph3); + return NULL; + } + bdz_ph_partial_free_graph3(&graph3); + // Assigning step + if (mph->verbosity) + { + fprintf(stderr, "Entering assigning step for mph creation of %u keys with graph sized %u\n", bdz_ph->m, bdz_ph->n); + } + assigning(bdz_ph, &graph3, edges); + + bdz_ph_free_queue(&edges); + bdz_ph_free_graph3(&graph3); + + if (mph->verbosity) + { + fprintf(stderr, "Starting optimization step\n"); + } + + bdz_ph_optimization(bdz_ph); + + #ifdef CMPH_TIMING + ELAPSED_TIME_IN_SECONDS(&construction_time); + #endif + mphf = (cmph_t *)malloc(sizeof(cmph_t)); + mphf->algo = mph->algo; + bdz_phf = (bdz_ph_data_t *)malloc(sizeof(bdz_ph_data_t)); + bdz_phf->g = bdz_ph->g; + bdz_ph->g = NULL; //transfer memory ownership + bdz_phf->hl = bdz_ph->hl; + bdz_ph->hl = NULL; //transfer memory ownership + bdz_phf->n = bdz_ph->n; + bdz_phf->m = bdz_ph->m; + bdz_phf->r = bdz_ph->r; + mphf->data = bdz_phf; + mphf->size = bdz_ph->n; + + DEBUGP("Successfully generated minimal perfect hash\n"); + if (mph->verbosity) + { + fprintf(stderr, "Successfully generated minimal perfect hash function\n"); + } + + #ifdef CMPH_TIMING + register cmph_uint32 space_usage = bdz_ph_packed_size(mphf)*8; + register cmph_uint32 keys_per_bucket = 1; + construction_time = construction_time - construction_time_begin; + fprintf(stdout, "%u\t%.2f\t%u\t%.4f\t%.4f\n", bdz_ph->m, bdz_ph->m/(double)bdz_ph->n, keys_per_bucket, construction_time, space_usage/(double)bdz_ph->m); + #endif + + return mphf; +} + + +static int bdz_ph_mapping(cmph_config_t *mph, bdz_ph_graph3_t* graph3, bdz_ph_queue_t queue) +{ + cmph_uint32 e; + int cycles = 0; + cmph_uint32 hl[3]; + + bdz_ph_config_data_t *bdz_ph = (bdz_ph_config_data_t *)mph->data; + bdz_ph_init_graph3(graph3, bdz_ph->m, bdz_ph->n); + mph->key_source->rewind(mph->key_source->data); + for (e = 0; e < mph->key_source->nkeys; ++e) + { + cmph_uint32 h0, h1, h2; + cmph_uint32 keylen; + char *key = NULL; + mph->key_source->read(mph->key_source->data, &key, &keylen); + hash_vector(bdz_ph->hl, key, keylen, hl); + h0 = hl[0] % bdz_ph->r; + h1 = hl[1] % bdz_ph->r + bdz_ph->r; + h2 = hl[2] % bdz_ph->r + (bdz_ph->r << 1); + mph->key_source->dispose(mph->key_source->data, key, keylen); + bdz_ph_add_edge(graph3,h0,h1,h2); + } + cycles = bdz_ph_generate_queue(bdz_ph->m, bdz_ph->n, queue, graph3); + return (cycles == 0); +} + +static void assigning(bdz_ph_config_data_t *bdz_ph, bdz_ph_graph3_t* graph3, bdz_ph_queue_t queue) +{ + cmph_uint32 i; + cmph_uint32 nedges=graph3->nedges; + cmph_uint32 curr_edge; + cmph_uint32 v0,v1,v2; + cmph_uint8 * marked_vertices =malloc((size_t)(bdz_ph->n >> 3) + 1); + cmph_uint32 sizeg = (cmph_uint32)ceil(bdz_ph->n/4.0); + bdz_ph->g = (cmph_uint8 *)calloc((size_t)sizeg, sizeof(cmph_uint8)); + memset(marked_vertices, 0, (size_t)(bdz_ph->n >> 3) + 1); + //memset(bdz_ph->g, 0xff, sizeg); + + for(i=nedges-1;i+1>=1;i--){ + curr_edge=queue[i]; + v0=graph3->edges[curr_edge].vertices[0]; + v1=graph3->edges[curr_edge].vertices[1]; + v2=graph3->edges[curr_edge].vertices[2]; + DEBUGP("B:%u %u %u -- %u %u %u\n", v0, v1, v2, GETVALUE(bdz_ph->g, v0), GETVALUE(bdz_ph->g, v1), GETVALUE(bdz_ph->g, v2)); + if(!GETBIT(marked_vertices, v0)){ + if(!GETBIT(marked_vertices,v1)) + { + //SETVALUE(bdz_ph->g, v1, UNASSIGNED); + SETBIT(marked_vertices, v1); + } + if(!GETBIT(marked_vertices,v2)) + { + //SETVALUE(bdz_ph->g, v2, UNASSIGNED); + SETBIT(marked_vertices, v2); + } + SETVALUE0(bdz_ph->g, v0, (6-(GETVALUE(bdz_ph->g, v1) + GETVALUE(bdz_ph->g,v2)))%3); + SETBIT(marked_vertices, v0); + } else if(!GETBIT(marked_vertices, v1)) { + if(!GETBIT(marked_vertices, v2)) + { + //SETVALUE(bdz_ph->g, v2, UNASSIGNED); + SETBIT(marked_vertices, v2); + } + SETVALUE0(bdz_ph->g, v1, (7 - (GETVALUE(bdz_ph->g, v0)+GETVALUE(bdz_ph->g, v2)))%3); + SETBIT(marked_vertices, v1); + }else { + SETVALUE0(bdz_ph->g, v2, (8-(GETVALUE(bdz_ph->g,v0)+GETVALUE(bdz_ph->g, v1)))%3); + SETBIT(marked_vertices, v2); + } + DEBUGP("A:%u %u %u -- %u %u %u\n", v0, v1, v2, GETVALUE(bdz_ph->g, v0), GETVALUE(bdz_ph->g, v1), GETVALUE(bdz_ph->g, v2)); + }; + free(marked_vertices); +} + +static void bdz_ph_optimization(bdz_ph_config_data_t *bdz_ph) +{ + cmph_uint32 i; + cmph_uint8 byte = 0; + cmph_uint32 sizeg = (cmph_uint32)ceil(bdz_ph->n/5.0); + cmph_uint8 * new_g = (cmph_uint8 *)calloc((size_t)sizeg, sizeof(cmph_uint8)); + cmph_uint8 value; + cmph_uint32 idx; + for(i = 0; i < bdz_ph->n; i++) + { + idx = i/5; + byte = new_g[idx]; + value = GETVALUE(bdz_ph->g, i); + byte = (cmph_uint8) (byte + value*pow3_table[i%5U]); + new_g[idx] = byte; + } + free(bdz_ph->g); + bdz_ph->g = new_g; +} + + +int bdz_ph_dump(cmph_t *mphf, FILE *fd) +{ + char *buf = NULL; + cmph_uint32 buflen; + cmph_uint32 sizeg = 0; + register size_t nbytes; + bdz_ph_data_t *data = (bdz_ph_data_t *)mphf->data; + __cmph_dump(mphf, fd); + + hash_state_dump(data->hl, &buf, &buflen); + DEBUGP("Dumping hash state with %u bytes to disk\n", buflen); + nbytes = fwrite(&buflen, sizeof(cmph_uint32), (size_t)1, fd); + nbytes = fwrite(buf, (size_t)buflen, (size_t)1, fd); + free(buf); + + nbytes = fwrite(&(data->n), sizeof(cmph_uint32), (size_t)1, fd); + nbytes = fwrite(&(data->m), sizeof(cmph_uint32), (size_t)1, fd); + nbytes = fwrite(&(data->r), sizeof(cmph_uint32), (size_t)1, fd); + sizeg = (cmph_uint32)ceil(data->n/5.0); + nbytes = fwrite(data->g, sizeof(cmph_uint8)*sizeg, (size_t)1, fd); + + #ifdef DEBUG + cmph_uint32 i; + fprintf(stderr, "G: "); + for (i = 0; i < data->n; ++i) fprintf(stderr, "%u ", GETVALUE(data->g, i)); + fprintf(stderr, "\n"); + #endif + return 1; +} + +void bdz_ph_load(FILE *f, cmph_t *mphf) +{ + char *buf = NULL; + cmph_uint32 buflen; + cmph_uint32 sizeg = 0; + register size_t nbytes; + bdz_ph_data_t *bdz_ph = (bdz_ph_data_t *)malloc(sizeof(bdz_ph_data_t)); + + DEBUGP("Loading bdz_ph mphf\n"); + mphf->data = bdz_ph; + + nbytes = fread(&buflen, sizeof(cmph_uint32), (size_t)1, f); + DEBUGP("Hash state has %u bytes\n", buflen); + buf = (char *)malloc((size_t)buflen); + nbytes = fread(buf, (size_t)buflen, (size_t)1, f); + bdz_ph->hl = hash_state_load(buf, buflen); + free(buf); + + + DEBUGP("Reading m and n\n"); + nbytes = fread(&(bdz_ph->n), sizeof(cmph_uint32), (size_t)1, f); + nbytes = fread(&(bdz_ph->m), sizeof(cmph_uint32), (size_t)1, f); + nbytes = fread(&(bdz_ph->r), sizeof(cmph_uint32), (size_t)1, f); + sizeg = (cmph_uint32)ceil(bdz_ph->n/5.0); + bdz_ph->g = (cmph_uint8 *)calloc((size_t)sizeg, sizeof(cmph_uint8)); + nbytes = fread(bdz_ph->g, sizeg*sizeof(cmph_uint8), (size_t)1, f); + + return; +} + + +cmph_uint32 bdz_ph_search(cmph_t *mphf, const char *key, cmph_uint32 keylen) +{ + register bdz_ph_data_t *bdz_ph = mphf->data; + cmph_uint32 hl[3]; + register cmph_uint8 byte0, byte1, byte2; + register cmph_uint32 vertex; + + hash_vector(bdz_ph->hl, key, keylen,hl); + hl[0] = hl[0] % bdz_ph->r; + hl[1] = hl[1] % bdz_ph->r + bdz_ph->r; + hl[2] = hl[2] % bdz_ph->r + (bdz_ph->r << 1); + + byte0 = bdz_ph->g[hl[0]/5]; + byte1 = bdz_ph->g[hl[1]/5]; + byte2 = bdz_ph->g[hl[2]/5]; + + byte0 = lookup_table[hl[0]%5U][byte0]; + byte1 = lookup_table[hl[1]%5U][byte1]; + byte2 = lookup_table[hl[2]%5U][byte2]; + vertex = hl[(byte0 + byte1 + byte2)%3]; + + return vertex; +} + + +void bdz_ph_destroy(cmph_t *mphf) +{ + bdz_ph_data_t *data = (bdz_ph_data_t *)mphf->data; + free(data->g); + hash_state_destroy(data->hl); + free(data); + free(mphf); +} + +/** \fn void bdz_ph_pack(cmph_t *mphf, void *packed_mphf); + * \brief Support the ability to pack a perfect hash function into a preallocated contiguous memory space pointed by packed_mphf. + * \param mphf pointer to the resulting mphf + * \param packed_mphf pointer to the contiguous memory area used to store the resulting mphf. The size of packed_mphf must be at least cmph_packed_size() + */ +void bdz_ph_pack(cmph_t *mphf, void *packed_mphf) +{ + bdz_ph_data_t *data = (bdz_ph_data_t *)mphf->data; + cmph_uint8 * ptr = packed_mphf; + + // packing hl type + CMPH_HASH hl_type = hash_get_type(data->hl); + *((cmph_uint32 *) ptr) = hl_type; + ptr += sizeof(cmph_uint32); + + // packing hl + hash_state_pack(data->hl, ptr); + ptr += hash_state_packed_size(hl_type); + + // packing r + *((cmph_uint32 *) ptr) = data->r; + ptr += sizeof(data->r); + + // packing g + cmph_uint32 sizeg = (cmph_uint32)ceil(data->n/5.0); + memcpy(ptr, data->g, sizeof(cmph_uint8)*sizeg); +} + +/** \fn cmph_uint32 bdz_ph_packed_size(cmph_t *mphf); + * \brief Return the amount of space needed to pack mphf. + * \param mphf pointer to a mphf + * \return the size of the packed function or zero for failures + */ +cmph_uint32 bdz_ph_packed_size(cmph_t *mphf) +{ + bdz_ph_data_t *data = (bdz_ph_data_t *)mphf->data; + CMPH_HASH hl_type = hash_get_type(data->hl); + cmph_uint32 sizeg = (cmph_uint32)ceil(data->n/5.0); + return (cmph_uint32) (sizeof(CMPH_ALGO) + hash_state_packed_size(hl_type) + 2*sizeof(cmph_uint32) + sizeof(cmph_uint8)*sizeg); +} + +/** cmph_uint32 bdz_ph_search(void *packed_mphf, const char *key, cmph_uint32 keylen); + * \brief Use the packed mphf to do a search. + * \param packed_mphf pointer to the packed mphf + * \param key key to be hashed + * \param keylen key legth in bytes + * \return The mphf value + */ +cmph_uint32 bdz_ph_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen) +{ + + register CMPH_HASH hl_type = *(cmph_uint32 *)packed_mphf; + register cmph_uint8 *hl_ptr = (cmph_uint8 *)(packed_mphf) + 4; + + register cmph_uint8 * ptr = hl_ptr + hash_state_packed_size(hl_type); + + register cmph_uint32 r = *((cmph_uint32*) ptr); + register cmph_uint8 * g = ptr + 4; + + cmph_uint32 hl[3]; + register cmph_uint8 byte0, byte1, byte2; + register cmph_uint32 vertex; + + hash_vector_packed(hl_ptr, hl_type, key, keylen, hl); + + hl[0] = hl[0] % r; + hl[1] = hl[1] % r + r; + hl[2] = hl[2] % r + (r << 1); + + byte0 = g[hl[0]/5]; + byte1 = g[hl[1]/5]; + byte2 = g[hl[2]/5]; + + byte0 = lookup_table[hl[0]%5][byte0]; + byte1 = lookup_table[hl[1]%5][byte1]; + byte2 = lookup_table[hl[2]%5][byte2]; + vertex = hl[(byte0 + byte1 + byte2)%3]; + + return vertex; +} diff --git a/cmph/bdz_ph.h b/cmph/bdz_ph.h new file mode 100755 index 000000000..73cce2ed1 --- /dev/null +++ b/cmph/bdz_ph.h @@ -0,0 +1,42 @@ +#ifndef __CMPH_BDZ_PH_H__ +#define __CMPH_BDZ_PH_H__ + +#include "cmph.h" + +typedef struct __bdz_ph_data_t bdz_ph_data_t; +typedef struct __bdz_ph_config_data_t bdz_ph_config_data_t; + +bdz_ph_config_data_t *bdz_ph_config_new(); +void bdz_ph_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs); +void bdz_ph_config_destroy(cmph_config_t *mph); +cmph_t *bdz_ph_new(cmph_config_t *mph, double c); + +void bdz_ph_load(FILE *f, cmph_t *mphf); +int bdz_ph_dump(cmph_t *mphf, FILE *f); +void bdz_ph_destroy(cmph_t *mphf); +cmph_uint32 bdz_ph_search(cmph_t *mphf, const char *key, cmph_uint32 keylen); + +/** \fn void bdz_ph_pack(cmph_t *mphf, void *packed_mphf); + * \brief Support the ability to pack a perfect hash function into a preallocated contiguous memory space pointed by packed_mphf. + * \param mphf pointer to the resulting mphf + * \param packed_mphf pointer to the contiguous memory area used to store the resulting mphf. The size of packed_mphf must be at least cmph_packed_size() + */ +void bdz_ph_pack(cmph_t *mphf, void *packed_mphf); + +/** \fn cmph_uint32 bdz_ph_packed_size(cmph_t *mphf); + * \brief Return the amount of space needed to pack mphf. + * \param mphf pointer to a mphf + * \return the size of the packed function or zero for failures + */ +cmph_uint32 bdz_ph_packed_size(cmph_t *mphf); + +/** cmph_uint32 bdz_ph_search(void *packed_mphf, const char *key, cmph_uint32 keylen); + * \brief Use the packed mphf to do a search. + * \param packed_mphf pointer to the packed mphf + * \param key key to be hashed + * \param keylen key legth in bytes + * \return The mphf value + */ +cmph_uint32 bdz_ph_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen); + +#endif diff --git a/cmph/bdz_structs.h b/cmph/bdz_structs.h new file mode 100755 index 000000000..ba7dc3c66 --- /dev/null +++ b/cmph/bdz_structs.h @@ -0,0 +1,36 @@ +#ifndef __CMPH_BDZ_STRUCTS_H__ +#define __CMPH_BDZ_STRUCTS_H__ + +#include "hash_state.h" + +struct __bdz_data_t +{ + cmph_uint32 m; //edges (words) count + cmph_uint32 n; //vertex count + cmph_uint32 r; //partition vertex count + cmph_uint8 *g; + hash_state_t *hl; // linear hashing + + cmph_uint32 k; //kth index in ranktable, $k = log_2(n=3r)/\varepsilon$ + cmph_uint8 b; // number of bits of k + cmph_uint32 ranktablesize; //number of entries in ranktable, $n/k +1$ + cmph_uint32 *ranktable; // rank table +}; + + +struct __bdz_config_data_t +{ + cmph_uint32 m; //edges (words) count + cmph_uint32 n; //vertex count + cmph_uint32 r; //partition vertex count + cmph_uint8 *g; + hash_state_t *hl; // linear hashing + + cmph_uint32 k; //kth index in ranktable, $k = log_2(n=3r)/\varepsilon$ + cmph_uint8 b; // number of bits of k + cmph_uint32 ranktablesize; //number of entries in ranktable, $n/k +1$ + cmph_uint32 *ranktable; // rank table + CMPH_HASH hashfunc; +}; + +#endif diff --git a/cmph/bdz_structs_ph.h b/cmph/bdz_structs_ph.h new file mode 100755 index 000000000..5874a26df --- /dev/null +++ b/cmph/bdz_structs_ph.h @@ -0,0 +1,26 @@ +#ifndef __CMPH_BDZ_STRUCTS_PH_H__ +#define __CMPH_BDZ_STRUCTS_PH_H__ + +#include "hash_state.h" + +struct __bdz_ph_data_t +{ + cmph_uint32 m; //edges (words) count + cmph_uint32 n; //vertex count + cmph_uint32 r; //partition vertex count + cmph_uint8 *g; + hash_state_t *hl; // linear hashing +}; + + +struct __bdz_ph_config_data_t +{ + CMPH_HASH hashfunc; + cmph_uint32 m; //edges (words) count + cmph_uint32 n; //vertex count + cmph_uint32 r; //partition vertex count + cmph_uint8 *g; + hash_state_t *hl; // linear hashing +}; + +#endif diff --git a/cmph/bitbool.h b/cmph/bitbool.h new file mode 100644 index 000000000..a3286c3c9 --- /dev/null +++ b/cmph/bitbool.h @@ -0,0 +1,179 @@ +#ifndef _CMPH_BITBOOL_H__ +#define _CMPH_BITBOOL_H__ +#include "cmph_types.h" + +static const cmph_uint8 bitmask[] = { 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 }; + +static const cmph_uint32 bitmask32[] = { 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7, + 1 << 8, 1 << 9, 1 << 10, 1 << 11, 1 << 12, 1 << 13, 1 << 14, 1 << 15, + 1 << 16, 1 << 17, 1 << 18, 1 << 19, 1 << 20, 1 << 21, 1 << 22, 1 << 23, + 1 << 24, 1 << 25, 1 << 26, 1 << 27, 1 << 28, 1 << 29, 1 << 30, 1U << 31 + }; + +static const cmph_uint8 valuemask[] = { 0xfc, 0xf3, 0xcf, 0x3f}; + + +/** \def GETBIT(array, i) + * \brief get the value of an 1-bit integer stored in an array. + * \param array to get 1-bit integer values from + * \param i is the index in array to get the 1-bit integer value from + * + * GETBIT(array, i) is a macro that gets the value of an 1-bit integer stored in array. + */ +#define GETBIT(array, i) ((array[i >> 3] & bitmask[i & 0x00000007]) >> (i & 0x00000007)) + +/** \def SETBIT(array, i) + * \brief set 1 to an 1-bit integer stored in an array. + * \param array to store 1-bit integer values + * \param i is the index in array to set the the bit to 1 + * + * SETBIT(array, i) is a macro that sets 1 to an 1-bit integer stored in an array. + */ +#define SETBIT(array, i) (array[i >> 3] |= bitmask[i & 0x00000007]) + +/** \def UNSETBIT(array, i) + * \brief set 0 to an 1-bit integer stored in an array. + * \param array to store 1-bit integer values + * \param i is the index in array to set the the bit to 0 + * + * UNSETBIT(array, i) is a macro that sets 0 to an 1-bit integer stored in an array. + */ +#define UNSETBIT(array, i) (array[i >> 3] ^= ((bitmask[i & 0x00000007]))) + +//#define GETBIT(array, i) (array[(i) / 8] & bitmask[(i) % 8]) +//#define SETBIT(array, i) (array[(i) / 8] |= bitmask[(i) % 8]) +//#define UNSETBIT(array, i) (array[(i) / 8] ^= ((bitmask[(i) % 8]))) + + +/** \def SETVALUE1(array, i, v) + * \brief set a value for a 2-bit integer stored in an array initialized with 1s. + * \param array to store 2-bit integer values + * \param i is the index in array to set the value v + * \param v is the value to be set + * + * SETVALUE1(array, i, v) is a macro that set a value for a 2-bit integer stored in an array. + * The array should be initialized with all bits set to 1. For example: + * memset(array, 0xff, arraySize); + */ +#define SETVALUE1(array, i, v) (array[i >> 2] &= (cmph_uint8)((v << ((i & 0x00000003) << 1)) | valuemask[i & 0x00000003])) + +/** \def SETVALUE0(array, i, v) + * \brief set a value for a 2-bit integer stored in an array initialized with 0s. + * \param array to store 2-bit integer values + * \param i is the index in array to set the value v + * \param v is the value to be set + * + * SETVALUE0(array, i, v) is a macro that set a value for a 2-bit integer stored in an array. + * The array should be initialized with all bits set to 0. For example: + * memset(array, 0, arraySize); + */ +#define SETVALUE0(array, i, v) (array[i >> 2] |= (cmph_uint8)(v << ((i & 0x00000003) << 1))) + + +/** \def GETVALUE(array, i) + * \brief get a value for a 2-bit integer stored in an array. + * \param array to get 2-bit integer values from + * \param i is the index in array to get the value from + * + * GETVALUE(array, i) is a macro that get a value for a 2-bit integer stored in an array. + */ +#define GETVALUE(array, i) ((cmph_uint8)((array[i >> 2] >> ((i & 0x00000003U) << 1U)) & 0x00000003U)) + + + +/** \def SETBIT32(array, i) + * \brief set 1 to an 1-bit integer stored in an array of 32-bit words. + * \param array to store 1-bit integer values. The entries are 32-bit words. + * \param i is the index in array to set the the bit to 1 + * + * SETBIT32(array, i) is a macro that sets 1 to an 1-bit integer stored in an array of 32-bit words. + */ +#define SETBIT32(array, i) (array[i >> 5] |= bitmask32[i & 0x0000001f]) + +/** \def GETBIT32(array, i) + * \brief get the value of an 1-bit integer stored in an array of 32-bit words. + * \param array to get 1-bit integer values from. The entries are 32-bit words. + * \param i is the index in array to get the 1-bit integer value from + * + * GETBIT32(array, i) is a macro that gets the value of an 1-bit integer stored in an array of 32-bit words. + */ +#define GETBIT32(array, i) (array[i >> 5] & bitmask32[i & 0x0000001f]) + +/** \def UNSETBIT32(array, i) + * \brief set 0 to an 1-bit integer stored in an array of 32-bit words. + * \param array to store 1-bit integer values. The entries ar 32-bit words + * \param i is the index in array to set the the bit to 0 + * + * UNSETBIT32(array, i) is a macro that sets 0 to an 1-bit integer stored in an array of 32-bit words. + */ +#define UNSETBIT32(array, i) (array[i >> 5] ^= ((bitmask32[i & 0x0000001f]))) + +#define BITS_TABLE_SIZE(n, bits_length) ((n * bits_length + 31) >> 5) + +static inline void set_bits_value(cmph_uint32 * bits_table, cmph_uint32 index, cmph_uint32 bits_string, + cmph_uint32 string_length, cmph_uint32 string_mask) +{ + register cmph_uint32 bit_idx = index * string_length; + register cmph_uint32 word_idx = bit_idx >> 5; + register cmph_uint32 shift1 = bit_idx & 0x0000001f; + register cmph_uint32 shift2 = 32 - shift1; + + bits_table[word_idx] &= ~((string_mask) << shift1); + bits_table[word_idx] |= bits_string << shift1; + + if(shift2 < string_length) + { + bits_table[word_idx+1] &= ~((string_mask) >> shift2); + bits_table[word_idx+1] |= bits_string >> shift2; + }; +}; + +static inline cmph_uint32 get_bits_value(cmph_uint32 * bits_table,cmph_uint32 index, cmph_uint32 string_length, cmph_uint32 string_mask) +{ + register cmph_uint32 bit_idx = index * string_length; + register cmph_uint32 word_idx = bit_idx >> 5; + register cmph_uint32 shift1 = bit_idx & 0x0000001f; + register cmph_uint32 shift2 = 32-shift1; + register cmph_uint32 bits_string; + + bits_string = (bits_table[word_idx] >> shift1) & string_mask; + + if(shift2 < string_length) + bits_string |= (bits_table[word_idx+1] << shift2) & string_mask; + + return bits_string; +}; + +static inline void set_bits_at_pos(cmph_uint32 * bits_table, cmph_uint32 pos, cmph_uint32 bits_string, cmph_uint32 string_length) +{ + register cmph_uint32 word_idx = pos >> 5; + register cmph_uint32 shift1 = pos & 0x0000001f; + register cmph_uint32 shift2 = 32-shift1; + register cmph_uint32 string_mask = (1U << string_length) - 1; + + bits_table[word_idx] &= ~((string_mask) << shift1); + bits_table[word_idx] |= bits_string << shift1; + if(shift2 < string_length) + { + bits_table[word_idx+1] &= ~((string_mask) >> shift2); + bits_table[word_idx+1] |= bits_string >> shift2; + } +}; + +static inline cmph_uint32 get_bits_at_pos(cmph_uint32 * bits_table,cmph_uint32 pos,cmph_uint32 string_length) +{ + register cmph_uint32 word_idx = pos >> 5; + register cmph_uint32 shift1 = pos & 0x0000001f; + register cmph_uint32 shift2 = 32 - shift1; + register cmph_uint32 string_mask = (1U << string_length) - 1; + register cmph_uint32 bits_string; + + bits_string = (bits_table[word_idx] >> shift1) & string_mask; + + if(shift2 < string_length) + bits_string |= (bits_table[word_idx+1] << shift2) & string_mask; + return bits_string; +} + + +#endif diff --git a/cmph/bmz.c b/cmph/bmz.c new file mode 100644 index 000000000..51798a189 --- /dev/null +++ b/cmph/bmz.c @@ -0,0 +1,620 @@ +#include "graph.h" +#include "bmz.h" +#include "cmph_structs.h" +#include "bmz_structs.h" +#include "hash.h" +#include "vqueue.h" +#include "bitbool.h" + +#include +#include +#include +#include +#include + +//#define DEBUG +#include "debug.h" + +static int bmz_gen_edges(cmph_config_t *mph); +static cmph_uint8 bmz_traverse_critical_nodes(bmz_config_data_t *bmz, cmph_uint32 v, cmph_uint32 * biggest_g_value, cmph_uint32 * biggest_edge_value, cmph_uint8 * used_edges, cmph_uint8 * visited); +static cmph_uint8 bmz_traverse_critical_nodes_heuristic(bmz_config_data_t *bmz, cmph_uint32 v, cmph_uint32 * biggest_g_value, cmph_uint32 * biggest_edge_value, cmph_uint8 * used_edges, cmph_uint8 * visited); +static void bmz_traverse_non_critical_nodes(bmz_config_data_t *bmz, cmph_uint8 * used_edges, cmph_uint8 * visited); + +bmz_config_data_t *bmz_config_new() +{ + bmz_config_data_t *bmz = NULL; + bmz = (bmz_config_data_t *)malloc(sizeof(bmz_config_data_t)); + assert(bmz); + memset(bmz, 0, sizeof(bmz_config_data_t)); + bmz->hashfuncs[0] = CMPH_HASH_JENKINS; + bmz->hashfuncs[1] = CMPH_HASH_JENKINS; + bmz->g = NULL; + bmz->graph = NULL; + bmz->hashes = NULL; + return bmz; +} + +void bmz_config_destroy(cmph_config_t *mph) +{ + bmz_config_data_t *data = (bmz_config_data_t *)mph->data; + DEBUGP("Destroying algorithm dependent data\n"); + free(data); +} + +void bmz_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs) +{ + bmz_config_data_t *bmz = (bmz_config_data_t *)mph->data; + CMPH_HASH *hashptr = hashfuncs; + cmph_uint32 i = 0; + while(*hashptr != CMPH_HASH_COUNT) + { + if (i >= 2) break; //bmz only uses two hash functions + bmz->hashfuncs[i] = *hashptr; + ++i, ++hashptr; + } +} + +cmph_t *bmz_new(cmph_config_t *mph, double c) +{ + cmph_t *mphf = NULL; + bmz_data_t *bmzf = NULL; + cmph_uint32 i; + cmph_uint32 iterations; + cmph_uint32 iterations_map = 20; + cmph_uint8 *used_edges = NULL; + cmph_uint8 restart_mapping = 0; + cmph_uint8 * visited = NULL; + + bmz_config_data_t *bmz = (bmz_config_data_t *)mph->data; + if (c == 0) c = 1.15; // validating restrictions over parameter c. + DEBUGP("c: %f\n", c); + bmz->m = mph->key_source->nkeys; + bmz->n = (cmph_uint32)ceil(c * mph->key_source->nkeys); + DEBUGP("m (edges): %u n (vertices): %u c: %f\n", bmz->m, bmz->n, c); + bmz->graph = graph_new(bmz->n, bmz->m); + DEBUGP("Created graph\n"); + + bmz->hashes = (hash_state_t **)malloc(sizeof(hash_state_t *)*3); + for(i = 0; i < 3; ++i) bmz->hashes[i] = NULL; + + do + { + // Mapping step + cmph_uint32 biggest_g_value = 0; + cmph_uint32 biggest_edge_value = 1; + iterations = 100; + if (mph->verbosity) + { + fprintf(stderr, "Entering mapping step for mph creation of %u keys with graph sized %u\n", bmz->m, bmz->n); + } + while(1) + { + int ok; + DEBUGP("hash function 1\n"); + bmz->hashes[0] = hash_state_new(bmz->hashfuncs[0], bmz->n); + DEBUGP("hash function 2\n"); + bmz->hashes[1] = hash_state_new(bmz->hashfuncs[1], bmz->n); + DEBUGP("Generating edges\n"); + ok = bmz_gen_edges(mph); + if (!ok) + { + --iterations; + hash_state_destroy(bmz->hashes[0]); + bmz->hashes[0] = NULL; + hash_state_destroy(bmz->hashes[1]); + bmz->hashes[1] = NULL; + DEBUGP("%u iterations remaining\n", iterations); + if (mph->verbosity) + { + fprintf(stderr, "simple graph creation failure - %u iterations remaining\n", iterations); + } + if (iterations == 0) break; + } + else break; + } + if (iterations == 0) + { + graph_destroy(bmz->graph); + return NULL; + } + // Ordering step + if (mph->verbosity) + { + fprintf(stderr, "Starting ordering step\n"); + } + graph_obtain_critical_nodes(bmz->graph); + + // Searching step + if (mph->verbosity) + { + fprintf(stderr, "Starting Searching step.\n"); + fprintf(stderr, "\tTraversing critical vertices.\n"); + } + DEBUGP("Searching step\n"); + visited = (cmph_uint8 *)malloc((size_t)bmz->n/8 + 1); + memset(visited, 0, (size_t)bmz->n/8 + 1); + used_edges = (cmph_uint8 *)malloc((size_t)bmz->m/8 + 1); + memset(used_edges, 0, (size_t)bmz->m/8 + 1); + free(bmz->g); + bmz->g = (cmph_uint32 *)calloc((size_t)bmz->n, sizeof(cmph_uint32)); + assert(bmz->g); + for (i = 0; i < bmz->n; ++i) // critical nodes + { + if (graph_node_is_critical(bmz->graph, i) && (!GETBIT(visited,i))) + { + if(c > 1.14) restart_mapping = bmz_traverse_critical_nodes(bmz, i, &biggest_g_value, &biggest_edge_value, used_edges, visited); + else restart_mapping = bmz_traverse_critical_nodes_heuristic(bmz, i, &biggest_g_value, &biggest_edge_value, used_edges, visited); + if(restart_mapping) break; + } + } + if(!restart_mapping) + { + if (mph->verbosity) + { + fprintf(stderr, "\tTraversing non critical vertices.\n"); + } + bmz_traverse_non_critical_nodes(bmz, used_edges, visited); // non_critical_nodes + } + else + { + iterations_map--; + if (mph->verbosity) fprintf(stderr, "Restarting mapping step. %u iterations remaining.\n", iterations_map); + } + free(used_edges); + free(visited); + }while(restart_mapping && iterations_map > 0); + graph_destroy(bmz->graph); + bmz->graph = NULL; + if (iterations_map == 0) + { + return NULL; + } + mphf = (cmph_t *)malloc(sizeof(cmph_t)); + mphf->algo = mph->algo; + bmzf = (bmz_data_t *)malloc(sizeof(bmz_data_t)); + bmzf->g = bmz->g; + bmz->g = NULL; //transfer memory ownership + bmzf->hashes = bmz->hashes; + bmz->hashes = NULL; //transfer memory ownership + bmzf->n = bmz->n; + bmzf->m = bmz->m; + mphf->data = bmzf; + mphf->size = bmz->m; + + DEBUGP("Successfully generated minimal perfect hash\n"); + if (mph->verbosity) + { + fprintf(stderr, "Successfully generated minimal perfect hash function\n"); + } + return mphf; +} + +static cmph_uint8 bmz_traverse_critical_nodes(bmz_config_data_t *bmz, cmph_uint32 v, cmph_uint32 * biggest_g_value, cmph_uint32 * biggest_edge_value, cmph_uint8 * used_edges, cmph_uint8 * visited) +{ + cmph_uint32 next_g; + cmph_uint32 u; /* Auxiliary vertex */ + cmph_uint32 lav; /* lookahead vertex */ + cmph_uint8 collision; + vqueue_t * q = vqueue_new((cmph_uint32)(graph_ncritical_nodes(bmz->graph)) + 1); + graph_iterator_t it, it1; + + DEBUGP("Labelling critical vertices\n"); + bmz->g[v] = (cmph_uint32)ceil ((double)(*biggest_edge_value)/2) - 1; + SETBIT(visited, v); + next_g = (cmph_uint32)floor((double)(*biggest_edge_value/2)); /* next_g is incremented in the do..while statement*/ + vqueue_insert(q, v); + while(!vqueue_is_empty(q)) + { + v = vqueue_remove(q); + it = graph_neighbors_it(bmz->graph, v); + while ((u = graph_next_neighbor(bmz->graph, &it)) != GRAPH_NO_NEIGHBOR) + { + if (graph_node_is_critical(bmz->graph, u) && (!GETBIT(visited,u))) + { + collision = 1; + while(collision) // lookahead to resolve collisions + { + next_g = *biggest_g_value + 1; + it1 = graph_neighbors_it(bmz->graph, u); + collision = 0; + while((lav = graph_next_neighbor(bmz->graph, &it1)) != GRAPH_NO_NEIGHBOR) + { + if (graph_node_is_critical(bmz->graph, lav) && GETBIT(visited,lav)) + { + if(next_g + bmz->g[lav] >= bmz->m) + { + vqueue_destroy(q); + return 1; // restart mapping step. + } + if (GETBIT(used_edges, (next_g + bmz->g[lav]))) + { + collision = 1; + break; + } + } + } + if (next_g > *biggest_g_value) *biggest_g_value = next_g; + } + // Marking used edges... + it1 = graph_neighbors_it(bmz->graph, u); + while((lav = graph_next_neighbor(bmz->graph, &it1)) != GRAPH_NO_NEIGHBOR) + { + if (graph_node_is_critical(bmz->graph, lav) && GETBIT(visited, lav)) + { + SETBIT(used_edges,(next_g + bmz->g[lav])); + if(next_g + bmz->g[lav] > *biggest_edge_value) *biggest_edge_value = next_g + bmz->g[lav]; + } + } + bmz->g[u] = next_g; // Labelling vertex u. + SETBIT(visited,u); + vqueue_insert(q, u); + } + } + + } + vqueue_destroy(q); + return 0; +} + +static cmph_uint8 bmz_traverse_critical_nodes_heuristic(bmz_config_data_t *bmz, cmph_uint32 v, cmph_uint32 * biggest_g_value, cmph_uint32 * biggest_edge_value, cmph_uint8 * used_edges, cmph_uint8 * visited) +{ + cmph_uint32 next_g; + cmph_uint32 u; /* Auxiliary vertex */ + cmph_uint32 lav; /* lookahead vertex */ + cmph_uint8 collision; + cmph_uint32 * unused_g_values = NULL; + cmph_uint32 unused_g_values_capacity = 0; + cmph_uint32 nunused_g_values = 0; + vqueue_t * q = vqueue_new((cmph_uint32)(0.5*graph_ncritical_nodes(bmz->graph))+1); + graph_iterator_t it, it1; + + DEBUGP("Labelling critical vertices\n"); + bmz->g[v] = (cmph_uint32)ceil ((double)(*biggest_edge_value)/2) - 1; + SETBIT(visited, v); + next_g = (cmph_uint32)floor((double)(*biggest_edge_value/2)); /* next_g is incremented in the do..while statement*/ + vqueue_insert(q, v); + while(!vqueue_is_empty(q)) + { + v = vqueue_remove(q); + it = graph_neighbors_it(bmz->graph, v); + while ((u = graph_next_neighbor(bmz->graph, &it)) != GRAPH_NO_NEIGHBOR) + { + if (graph_node_is_critical(bmz->graph, u) && (!GETBIT(visited,u))) + { + cmph_uint32 next_g_index = 0; + collision = 1; + while(collision) // lookahead to resolve collisions + { + if (next_g_index < nunused_g_values) + { + next_g = unused_g_values[next_g_index++]; + } + else + { + next_g = *biggest_g_value + 1; + next_g_index = UINT_MAX; + } + it1 = graph_neighbors_it(bmz->graph, u); + collision = 0; + while((lav = graph_next_neighbor(bmz->graph, &it1)) != GRAPH_NO_NEIGHBOR) + { + if (graph_node_is_critical(bmz->graph, lav) && GETBIT(visited,lav)) + { + if(next_g + bmz->g[lav] >= bmz->m) + { + vqueue_destroy(q); + free(unused_g_values); + return 1; // restart mapping step. + } + if (GETBIT(used_edges, (next_g + bmz->g[lav]))) + { + collision = 1; + break; + } + } + } + if(collision && (next_g > *biggest_g_value)) // saving the current g value stored in next_g. + { + if(nunused_g_values == unused_g_values_capacity) + { + unused_g_values = (cmph_uint32 *)realloc(unused_g_values, (unused_g_values_capacity + BUFSIZ)*sizeof(cmph_uint32)); + unused_g_values_capacity += BUFSIZ; + } + unused_g_values[nunused_g_values++] = next_g; + + } + if (next_g > *biggest_g_value) *biggest_g_value = next_g; + } + next_g_index--; + if (next_g_index < nunused_g_values) unused_g_values[next_g_index] = unused_g_values[--nunused_g_values]; + + // Marking used edges... + it1 = graph_neighbors_it(bmz->graph, u); + while((lav = graph_next_neighbor(bmz->graph, &it1)) != GRAPH_NO_NEIGHBOR) + { + if (graph_node_is_critical(bmz->graph, lav) && GETBIT(visited, lav)) + { + SETBIT(used_edges,(next_g + bmz->g[lav])); + if(next_g + bmz->g[lav] > *biggest_edge_value) *biggest_edge_value = next_g + bmz->g[lav]; + } + } + bmz->g[u] = next_g; // Labelling vertex u. + SETBIT(visited, u); + vqueue_insert(q, u); + } + } + + } + vqueue_destroy(q); + free(unused_g_values); + return 0; +} + +static cmph_uint32 next_unused_edge(bmz_config_data_t *bmz, cmph_uint8 * used_edges, cmph_uint32 unused_edge_index) +{ + while(1) + { + assert(unused_edge_index < bmz->m); + if(GETBIT(used_edges, unused_edge_index)) unused_edge_index ++; + else break; + } + return unused_edge_index; +} + +static void bmz_traverse(bmz_config_data_t *bmz, cmph_uint8 * used_edges, cmph_uint32 v, cmph_uint32 * unused_edge_index, cmph_uint8 * visited) +{ + graph_iterator_t it = graph_neighbors_it(bmz->graph, v); + cmph_uint32 neighbor = 0; + while((neighbor = graph_next_neighbor(bmz->graph, &it)) != GRAPH_NO_NEIGHBOR) + { + if(GETBIT(visited,neighbor)) continue; + //DEBUGP("Visiting neighbor %u\n", neighbor); + *unused_edge_index = next_unused_edge(bmz, used_edges, *unused_edge_index); + bmz->g[neighbor] = *unused_edge_index - bmz->g[v]; + //if (bmz->g[neighbor] >= bmz->m) bmz->g[neighbor] += bmz->m; + SETBIT(visited, neighbor); + (*unused_edge_index)++; + bmz_traverse(bmz, used_edges, neighbor, unused_edge_index, visited); + + } +} + +static void bmz_traverse_non_critical_nodes(bmz_config_data_t *bmz, cmph_uint8 * used_edges, cmph_uint8 * visited) +{ + + cmph_uint32 i, v1, v2, unused_edge_index = 0; + DEBUGP("Labelling non critical vertices\n"); + for(i = 0; i < bmz->m; i++) + { + v1 = graph_vertex_id(bmz->graph, i, 0); + v2 = graph_vertex_id(bmz->graph, i, 1); + if((GETBIT(visited,v1) && GETBIT(visited,v2)) || (!GETBIT(visited,v1) && !GETBIT(visited,v2))) continue; + if(GETBIT(visited,v1)) bmz_traverse(bmz, used_edges, v1, &unused_edge_index, visited); + else bmz_traverse(bmz, used_edges, v2, &unused_edge_index, visited); + + } + + for(i = 0; i < bmz->n; i++) + { + if(!GETBIT(visited,i)) + { + bmz->g[i] = 0; + SETBIT(visited, i); + bmz_traverse(bmz, used_edges, i, &unused_edge_index, visited); + } + } + +} + +static int bmz_gen_edges(cmph_config_t *mph) +{ + cmph_uint32 e; + bmz_config_data_t *bmz = (bmz_config_data_t *)mph->data; + cmph_uint8 multiple_edges = 0; + DEBUGP("Generating edges for %u vertices\n", bmz->n); + graph_clear_edges(bmz->graph); + mph->key_source->rewind(mph->key_source->data); + for (e = 0; e < mph->key_source->nkeys; ++e) + { + cmph_uint32 h1, h2; + cmph_uint32 keylen; + char *key = NULL; + mph->key_source->read(mph->key_source->data, &key, &keylen); + +// if (key == NULL)fprintf(stderr, "key = %s -- read BMZ\n", key); + h1 = hash(bmz->hashes[0], key, keylen) % bmz->n; + h2 = hash(bmz->hashes[1], key, keylen) % bmz->n; + if (h1 == h2) if (++h2 >= bmz->n) h2 = 0; + if (h1 == h2) + { + if (mph->verbosity) fprintf(stderr, "Self loop for key %u\n", e); + mph->key_source->dispose(mph->key_source->data, key, keylen); + return 0; + } + //DEBUGP("Adding edge: %u -> %u for key %s\n", h1, h2, key); + mph->key_source->dispose(mph->key_source->data, key, keylen); +// fprintf(stderr, "key = %s -- dispose BMZ\n", key); + multiple_edges = graph_contains_edge(bmz->graph, h1, h2); + if (mph->verbosity && multiple_edges) fprintf(stderr, "A non simple graph was generated\n"); + if (multiple_edges) return 0; // checking multiple edge restriction. + graph_add_edge(bmz->graph, h1, h2); + } + return !multiple_edges; +} + +int bmz_dump(cmph_t *mphf, FILE *fd) +{ + char *buf = NULL; + cmph_uint32 buflen; + cmph_uint32 two = 2; //number of hash functions + bmz_data_t *data = (bmz_data_t *)mphf->data; + register size_t nbytes; + __cmph_dump(mphf, fd); + + nbytes = fwrite(&two, sizeof(cmph_uint32), (size_t)1, fd); + + hash_state_dump(data->hashes[0], &buf, &buflen); + DEBUGP("Dumping hash state with %u bytes to disk\n", buflen); + nbytes = fwrite(&buflen, sizeof(cmph_uint32), (size_t)1, fd); + nbytes = fwrite(buf, (size_t)buflen, (size_t)1, fd); + free(buf); + + hash_state_dump(data->hashes[1], &buf, &buflen); + DEBUGP("Dumping hash state with %u bytes to disk\n", buflen); + nbytes = fwrite(&buflen, sizeof(cmph_uint32), (size_t)1, fd); + nbytes = fwrite(buf, (size_t)buflen, (size_t)1, fd); + free(buf); + + nbytes = fwrite(&(data->n), sizeof(cmph_uint32), (size_t)1, fd); + nbytes = fwrite(&(data->m), sizeof(cmph_uint32), (size_t)1, fd); + + nbytes = fwrite(data->g, sizeof(cmph_uint32)*(data->n), (size_t)1, fd); + #ifdef DEBUG + cmph_uint32 i; + fprintf(stderr, "G: "); + for (i = 0; i < data->n; ++i) fprintf(stderr, "%u ", data->g[i]); + fprintf(stderr, "\n"); + #endif + return 1; +} + +void bmz_load(FILE *f, cmph_t *mphf) +{ + cmph_uint32 nhashes; + char *buf = NULL; + cmph_uint32 buflen; + cmph_uint32 i; + bmz_data_t *bmz = (bmz_data_t *)malloc(sizeof(bmz_data_t)); + register size_t nbytes; + DEBUGP("Loading bmz mphf\n"); + mphf->data = bmz; + nbytes = fread(&nhashes, sizeof(cmph_uint32), (size_t)1, f); + bmz->hashes = (hash_state_t **)malloc(sizeof(hash_state_t *)*(nhashes + 1)); + bmz->hashes[nhashes] = NULL; + DEBUGP("Reading %u hashes\n", nhashes); + for (i = 0; i < nhashes; ++i) + { + hash_state_t *state = NULL; + nbytes = fread(&buflen, sizeof(cmph_uint32), (size_t)1, f); + DEBUGP("Hash state has %u bytes\n", buflen); + buf = (char *)malloc((size_t)buflen); + nbytes = fread(buf, (size_t)buflen, (size_t)1, f); + state = hash_state_load(buf, buflen); + bmz->hashes[i] = state; + free(buf); + } + + DEBUGP("Reading m and n\n"); + nbytes = fread(&(bmz->n), sizeof(cmph_uint32), (size_t)1, f); + nbytes = fread(&(bmz->m), sizeof(cmph_uint32), (size_t)1, f); + + bmz->g = (cmph_uint32 *)malloc(sizeof(cmph_uint32)*bmz->n); + nbytes = fread(bmz->g, bmz->n*sizeof(cmph_uint32), (size_t)1, f); + #ifdef DEBUG + fprintf(stderr, "G: "); + for (i = 0; i < bmz->n; ++i) fprintf(stderr, "%u ", bmz->g[i]); + fprintf(stderr, "\n"); + #endif + return; +} + + +cmph_uint32 bmz_search(cmph_t *mphf, const char *key, cmph_uint32 keylen) +{ + bmz_data_t *bmz = mphf->data; + cmph_uint32 h1 = hash(bmz->hashes[0], key, keylen) % bmz->n; + cmph_uint32 h2 = hash(bmz->hashes[1], key, keylen) % bmz->n; + DEBUGP("key: %s h1: %u h2: %u\n", key, h1, h2); + if (h1 == h2 && ++h2 > bmz->n) h2 = 0; + DEBUGP("key: %s g[h1]: %u g[h2]: %u edges: %u\n", key, bmz->g[h1], bmz->g[h2], bmz->m); + return bmz->g[h1] + bmz->g[h2]; +} +void bmz_destroy(cmph_t *mphf) +{ + bmz_data_t *data = (bmz_data_t *)mphf->data; + free(data->g); + hash_state_destroy(data->hashes[0]); + hash_state_destroy(data->hashes[1]); + free(data->hashes); + free(data); + free(mphf); +} + +/** \fn void bmz_pack(cmph_t *mphf, void *packed_mphf); + * \brief Support the ability to pack a perfect hash function into a preallocated contiguous memory space pointed by packed_mphf. + * \param mphf pointer to the resulting mphf + * \param packed_mphf pointer to the contiguous memory area used to store the resulting mphf. The size of packed_mphf must be at least cmph_packed_size() + */ +void bmz_pack(cmph_t *mphf, void *packed_mphf) +{ + + bmz_data_t *data = (bmz_data_t *)mphf->data; + cmph_uint8 * ptr = packed_mphf; + + // packing h1 type + CMPH_HASH h1_type = hash_get_type(data->hashes[0]); + *((cmph_uint32 *) ptr) = h1_type; + ptr += sizeof(cmph_uint32); + + // packing h1 + hash_state_pack(data->hashes[0], ptr); + ptr += hash_state_packed_size(h1_type); + + // packing h2 type + CMPH_HASH h2_type = hash_get_type(data->hashes[1]); + *((cmph_uint32 *) ptr) = h2_type; + ptr += sizeof(cmph_uint32); + + // packing h2 + hash_state_pack(data->hashes[1], ptr); + ptr += hash_state_packed_size(h2_type); + + // packing n + *((cmph_uint32 *) ptr) = data->n; + ptr += sizeof(data->n); + + // packing g + memcpy(ptr, data->g, sizeof(cmph_uint32)*data->n); +} + +/** \fn cmph_uint32 bmz_packed_size(cmph_t *mphf); + * \brief Return the amount of space needed to pack mphf. + * \param mphf pointer to a mphf + * \return the size of the packed function or zero for failures + */ +cmph_uint32 bmz_packed_size(cmph_t *mphf) +{ + bmz_data_t *data = (bmz_data_t *)mphf->data; + CMPH_HASH h1_type = hash_get_type(data->hashes[0]); + CMPH_HASH h2_type = hash_get_type(data->hashes[1]); + + return (cmph_uint32)(sizeof(CMPH_ALGO) + hash_state_packed_size(h1_type) + hash_state_packed_size(h2_type) + + 3*sizeof(cmph_uint32) + sizeof(cmph_uint32)*data->n); +} + +/** cmph_uint32 bmz_search(void *packed_mphf, const char *key, cmph_uint32 keylen); + * \brief Use the packed mphf to do a search. + * \param packed_mphf pointer to the packed mphf + * \param key key to be hashed + * \param keylen key legth in bytes + * \return The mphf value + */ +cmph_uint32 bmz_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen) +{ + register cmph_uint8 *h1_ptr = packed_mphf; + register CMPH_HASH h1_type = *((cmph_uint32 *)h1_ptr); + h1_ptr += 4; + + register cmph_uint8 *h2_ptr = h1_ptr + hash_state_packed_size(h1_type); + register CMPH_HASH h2_type = *((cmph_uint32 *)h2_ptr); + h2_ptr += 4; + + register cmph_uint32 *g_ptr = (cmph_uint32 *)(h2_ptr + hash_state_packed_size(h2_type)); + + register cmph_uint32 n = *g_ptr++; + + register cmph_uint32 h1 = hash_packed(h1_ptr, h1_type, key, keylen) % n; + register cmph_uint32 h2 = hash_packed(h2_ptr, h2_type, key, keylen) % n; + if (h1 == h2 && ++h2 > n) h2 = 0; + return (g_ptr[h1] + g_ptr[h2]); +} diff --git a/cmph/bmz.h b/cmph/bmz.h new file mode 100644 index 000000000..ee5f61dda --- /dev/null +++ b/cmph/bmz.h @@ -0,0 +1,42 @@ +#ifndef __CMPH_BMZ_H__ +#define __CMPH_BMZ_H__ + +#include "cmph.h" + +typedef struct __bmz_data_t bmz_data_t; +typedef struct __bmz_config_data_t bmz_config_data_t; + +bmz_config_data_t *bmz_config_new(); +void bmz_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs); +void bmz_config_destroy(cmph_config_t *mph); +cmph_t *bmz_new(cmph_config_t *mph, double c); + +void bmz_load(FILE *f, cmph_t *mphf); +int bmz_dump(cmph_t *mphf, FILE *f); +void bmz_destroy(cmph_t *mphf); +cmph_uint32 bmz_search(cmph_t *mphf, const char *key, cmph_uint32 keylen); + +/** \fn void bmz_pack(cmph_t *mphf, void *packed_mphf); + * \brief Support the ability to pack a perfect hash function into a preallocated contiguous memory space pointed by packed_mphf. + * \param mphf pointer to the resulting mphf + * \param packed_mphf pointer to the contiguous memory area used to store the resulting mphf. The size of packed_mphf must be at least cmph_packed_size() + */ +void bmz_pack(cmph_t *mphf, void *packed_mphf); + +/** \fn cmph_uint32 bmz_packed_size(cmph_t *mphf); + * \brief Return the amount of space needed to pack mphf. + * \param mphf pointer to a mphf + * \return the size of the packed function or zero for failures + */ +cmph_uint32 bmz_packed_size(cmph_t *mphf); + +/** cmph_uint32 bmz_search(void *packed_mphf, const char *key, cmph_uint32 keylen); + * \brief Use the packed mphf to do a search. + * \param packed_mphf pointer to the packed mphf + * \param key key to be hashed + * \param keylen key legth in bytes + * \return The mphf value + */ +cmph_uint32 bmz_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen); + +#endif diff --git a/cmph/bmz8.c b/cmph/bmz8.c new file mode 100644 index 000000000..203f4fc19 --- /dev/null +++ b/cmph/bmz8.c @@ -0,0 +1,632 @@ +#include "graph.h" +#include "bmz8.h" +#include "cmph_structs.h" +#include "bmz8_structs.h" +#include "hash.h" +#include "vqueue.h" +#include "bitbool.h" +#include +#include +#include +#include +#include + +//#define DEBUG +#include "debug.h" + +static int bmz8_gen_edges(cmph_config_t *mph); +static cmph_uint8 bmz8_traverse_critical_nodes(bmz8_config_data_t *bmz8, cmph_uint32 v, cmph_uint8 * biggest_g_value, cmph_uint8 * biggest_edge_value, cmph_uint8 * used_edges, cmph_uint8 * visited); +static cmph_uint8 bmz8_traverse_critical_nodes_heuristic(bmz8_config_data_t *bmz8, cmph_uint32 v, cmph_uint8 * biggest_g_value, cmph_uint8 * biggest_edge_value, cmph_uint8 * used_edges, cmph_uint8 * visited); +static void bmz8_traverse_non_critical_nodes(bmz8_config_data_t *bmz8, cmph_uint8 * used_edges, cmph_uint8 * visited); + +bmz8_config_data_t *bmz8_config_new() +{ + bmz8_config_data_t *bmz8; + bmz8 = (bmz8_config_data_t *)malloc(sizeof(bmz8_config_data_t)); + assert(bmz8); + memset(bmz8, 0, sizeof(bmz8_config_data_t)); + bmz8->hashfuncs[0] = CMPH_HASH_JENKINS; + bmz8->hashfuncs[1] = CMPH_HASH_JENKINS; + bmz8->g = NULL; + bmz8->graph = NULL; + bmz8->hashes = NULL; + return bmz8; +} + +void bmz8_config_destroy(cmph_config_t *mph) +{ + bmz8_config_data_t *data = (bmz8_config_data_t *)mph->data; + DEBUGP("Destroying algorithm dependent data\n"); + free(data); +} + +void bmz8_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs) +{ + bmz8_config_data_t *bmz8 = (bmz8_config_data_t *)mph->data; + CMPH_HASH *hashptr = hashfuncs; + cmph_uint8 i = 0; + while(*hashptr != CMPH_HASH_COUNT) + { + if (i >= 2) break; //bmz8 only uses two hash functions + bmz8->hashfuncs[i] = *hashptr; + ++i, ++hashptr; + } +} + +cmph_t *bmz8_new(cmph_config_t *mph, double c) +{ + cmph_t *mphf = NULL; + bmz8_data_t *bmz8f = NULL; + cmph_uint8 i; + cmph_uint8 iterations; + cmph_uint8 iterations_map = 20; + cmph_uint8 *used_edges = NULL; + cmph_uint8 restart_mapping = 0; + cmph_uint8 * visited = NULL; + bmz8_config_data_t *bmz8 = (bmz8_config_data_t *)mph->data; + + if (mph->key_source->nkeys >= 256) + { + if (mph->verbosity) fprintf(stderr, "The number of keys in BMZ8 must be lower than 256.\n"); + return NULL; + } + if (c == 0) c = 1.15; // validating restrictions over parameter c. + DEBUGP("c: %f\n", c); + bmz8->m = (cmph_uint8) mph->key_source->nkeys; + bmz8->n = (cmph_uint8) ceil(c * mph->key_source->nkeys); + DEBUGP("m (edges): %u n (vertices): %u c: %f\n", bmz8->m, bmz8->n, c); + bmz8->graph = graph_new(bmz8->n, bmz8->m); + DEBUGP("Created graph\n"); + + bmz8->hashes = (hash_state_t **)malloc(sizeof(hash_state_t *)*3); + for(i = 0; i < 3; ++i) bmz8->hashes[i] = NULL; + + do + { + // Mapping step + cmph_uint8 biggest_g_value = 0; + cmph_uint8 biggest_edge_value = 1; + iterations = 100; + if (mph->verbosity) + { + fprintf(stderr, "Entering mapping step for mph creation of %u keys with graph sized %u\n", bmz8->m, bmz8->n); + } + while(1) + { + int ok; + DEBUGP("hash function 1\n"); + bmz8->hashes[0] = hash_state_new(bmz8->hashfuncs[0], bmz8->n); + DEBUGP("hash function 2\n"); + bmz8->hashes[1] = hash_state_new(bmz8->hashfuncs[1], bmz8->n); + DEBUGP("Generating edges\n"); + ok = bmz8_gen_edges(mph); + if (!ok) + { + --iterations; + hash_state_destroy(bmz8->hashes[0]); + bmz8->hashes[0] = NULL; + hash_state_destroy(bmz8->hashes[1]); + bmz8->hashes[1] = NULL; + DEBUGP("%u iterations remaining\n", iterations); + if (mph->verbosity) + { + fprintf(stderr, "simple graph creation failure - %u iterations remaining\n", iterations); + } + if (iterations == 0) break; + } + else break; + } + if (iterations == 0) + { + graph_destroy(bmz8->graph); + return NULL; + } + + // Ordering step + if (mph->verbosity) + { + fprintf(stderr, "Starting ordering step\n"); + } + + graph_obtain_critical_nodes(bmz8->graph); + + // Searching step + if (mph->verbosity) + { + fprintf(stderr, "Starting Searching step.\n"); + fprintf(stderr, "\tTraversing critical vertices.\n"); + } + DEBUGP("Searching step\n"); + visited = (cmph_uint8 *)malloc((size_t)bmz8->n/8 + 1); + memset(visited, 0, (size_t)bmz8->n/8 + 1); + used_edges = (cmph_uint8 *)malloc((size_t)bmz8->m/8 + 1); + memset(used_edges, 0, (size_t)bmz8->m/8 + 1); + free(bmz8->g); + bmz8->g = (cmph_uint8 *)calloc((size_t)bmz8->n, sizeof(cmph_uint8)); + assert(bmz8->g); + for (i = 0; i < bmz8->n; ++i) // critical nodes + { + if (graph_node_is_critical(bmz8->graph, i) && (!GETBIT(visited,i))) + { + if(c > 1.14) restart_mapping = bmz8_traverse_critical_nodes(bmz8, i, &biggest_g_value, &biggest_edge_value, used_edges, visited); + else restart_mapping = bmz8_traverse_critical_nodes_heuristic(bmz8, i, &biggest_g_value, &biggest_edge_value, used_edges, visited); + if(restart_mapping) break; + } + } + if(!restart_mapping) + { + if (mph->verbosity) + { + fprintf(stderr, "\tTraversing non critical vertices.\n"); + } + bmz8_traverse_non_critical_nodes(bmz8, used_edges, visited); // non_critical_nodes + } + else + { + iterations_map--; + if (mph->verbosity) fprintf(stderr, "Restarting mapping step. %u iterations remaining.\n", iterations_map); + } + + free(used_edges); + free(visited); + + }while(restart_mapping && iterations_map > 0); + graph_destroy(bmz8->graph); + bmz8->graph = NULL; + if (iterations_map == 0) + { + return NULL; + } + mphf = (cmph_t *)malloc(sizeof(cmph_t)); + mphf->algo = mph->algo; + bmz8f = (bmz8_data_t *)malloc(sizeof(bmz8_data_t)); + bmz8f->g = bmz8->g; + bmz8->g = NULL; //transfer memory ownership + bmz8f->hashes = bmz8->hashes; + bmz8->hashes = NULL; //transfer memory ownership + bmz8f->n = bmz8->n; + bmz8f->m = bmz8->m; + mphf->data = bmz8f; + mphf->size = bmz8->m; + DEBUGP("Successfully generated minimal perfect hash\n"); + if (mph->verbosity) + { + fprintf(stderr, "Successfully generated minimal perfect hash function\n"); + } + return mphf; +} + +static cmph_uint8 bmz8_traverse_critical_nodes(bmz8_config_data_t *bmz8, cmph_uint32 v, cmph_uint8 * biggest_g_value, cmph_uint8 * biggest_edge_value, cmph_uint8 * used_edges, cmph_uint8 * visited) +{ + cmph_uint8 next_g; + cmph_uint32 u; /* Auxiliary vertex */ + cmph_uint32 lav; /* lookahead vertex */ + cmph_uint8 collision; + vqueue_t * q = vqueue_new((cmph_uint32)(graph_ncritical_nodes(bmz8->graph))); + graph_iterator_t it, it1; + + DEBUGP("Labelling critical vertices\n"); + bmz8->g[v] = (cmph_uint8)(ceil ((double)(*biggest_edge_value)/2) - 1); + SETBIT(visited, v); + next_g = (cmph_uint8)floor((double)(*biggest_edge_value/2)); /* next_g is incremented in the do..while statement*/ + vqueue_insert(q, v); + while(!vqueue_is_empty(q)) + { + v = vqueue_remove(q); + it = graph_neighbors_it(bmz8->graph, v); + while ((u = graph_next_neighbor(bmz8->graph, &it)) != GRAPH_NO_NEIGHBOR) + { + if (graph_node_is_critical(bmz8->graph, u) && (!GETBIT(visited,u))) + { + collision = 1; + while(collision) // lookahead to resolve collisions + { + next_g = (cmph_uint8)(*biggest_g_value + 1); + it1 = graph_neighbors_it(bmz8->graph, u); + collision = 0; + while((lav = graph_next_neighbor(bmz8->graph, &it1)) != GRAPH_NO_NEIGHBOR) + { + if (graph_node_is_critical(bmz8->graph, lav) && GETBIT(visited,lav)) + { + if(next_g + bmz8->g[lav] >= bmz8->m) + { + vqueue_destroy(q); + return 1; // restart mapping step. + } + if (GETBIT(used_edges, (next_g + bmz8->g[lav]))) + { + collision = 1; + break; + } + } + } + if (next_g > *biggest_g_value) *biggest_g_value = next_g; + } + // Marking used edges... + it1 = graph_neighbors_it(bmz8->graph, u); + while((lav = graph_next_neighbor(bmz8->graph, &it1)) != GRAPH_NO_NEIGHBOR) + { + if (graph_node_is_critical(bmz8->graph, lav) && GETBIT(visited, lav)) + { + SETBIT(used_edges,(next_g + bmz8->g[lav])); + + if(next_g + bmz8->g[lav] > *biggest_edge_value) + *biggest_edge_value = (cmph_uint8)(next_g + bmz8->g[lav]); + } + } + bmz8->g[u] = next_g; // Labelling vertex u. + SETBIT(visited,u); + vqueue_insert(q, u); + } + } + + } + vqueue_destroy(q); + return 0; +} + +static cmph_uint8 bmz8_traverse_critical_nodes_heuristic(bmz8_config_data_t *bmz8, cmph_uint32 v, cmph_uint8 * biggest_g_value, cmph_uint8 * biggest_edge_value, cmph_uint8 * used_edges, cmph_uint8 * visited) +{ + cmph_uint8 next_g; + cmph_uint32 u; + cmph_uint32 lav; + cmph_uint8 collision; + cmph_uint8 * unused_g_values = NULL; + cmph_uint8 unused_g_values_capacity = 0; + cmph_uint8 nunused_g_values = 0; + vqueue_t * q = vqueue_new((cmph_uint32)(graph_ncritical_nodes(bmz8->graph))); + graph_iterator_t it, it1; + + DEBUGP("Labelling critical vertices\n"); + bmz8->g[v] = (cmph_uint8)(ceil ((double)(*biggest_edge_value)/2) - 1); + SETBIT(visited, v); + next_g = (cmph_uint8)floor((double)(*biggest_edge_value/2)); + vqueue_insert(q, v); + while(!vqueue_is_empty(q)) + { + v = vqueue_remove(q); + it = graph_neighbors_it(bmz8->graph, v); + while ((u = graph_next_neighbor(bmz8->graph, &it)) != GRAPH_NO_NEIGHBOR) + { + if (graph_node_is_critical(bmz8->graph, u) && (!GETBIT(visited,u))) + { + cmph_uint8 next_g_index = 0; + collision = 1; + while(collision) // lookahead to resolve collisions + { + if (next_g_index < nunused_g_values) + { + next_g = unused_g_values[next_g_index++]; + } + else + { + next_g = (cmph_uint8)(*biggest_g_value + 1); + next_g_index = 255;//UINT_MAX; + } + it1 = graph_neighbors_it(bmz8->graph, u); + collision = 0; + while((lav = graph_next_neighbor(bmz8->graph, &it1)) != GRAPH_NO_NEIGHBOR) + { + if (graph_node_is_critical(bmz8->graph, lav) && GETBIT(visited,lav)) + { + if(next_g + bmz8->g[lav] >= bmz8->m) + { + vqueue_destroy(q); + free(unused_g_values); + return 1; // restart mapping step. + } + if (GETBIT(used_edges, (next_g + bmz8->g[lav]))) + { + collision = 1; + break; + } + } + } + if(collision && (next_g > *biggest_g_value)) // saving the current g value stored in next_g. + { + if(nunused_g_values == unused_g_values_capacity) + { + unused_g_values = (cmph_uint8*)realloc(unused_g_values, ((size_t)(unused_g_values_capacity + BUFSIZ))*sizeof(cmph_uint8)); + unused_g_values_capacity += (cmph_uint8)BUFSIZ; + } + unused_g_values[nunused_g_values++] = next_g; + + } + if (next_g > *biggest_g_value) *biggest_g_value = next_g; + } + + next_g_index--; + if (next_g_index < nunused_g_values) unused_g_values[next_g_index] = unused_g_values[--nunused_g_values]; + + // Marking used edges... + it1 = graph_neighbors_it(bmz8->graph, u); + while((lav = graph_next_neighbor(bmz8->graph, &it1)) != GRAPH_NO_NEIGHBOR) + { + if (graph_node_is_critical(bmz8->graph, lav) && GETBIT(visited, lav)) + { + SETBIT(used_edges,(next_g + bmz8->g[lav])); + if(next_g + bmz8->g[lav] > *biggest_edge_value) + *biggest_edge_value = (cmph_uint8)(next_g + bmz8->g[lav]); + } + } + + bmz8->g[u] = next_g; // Labelling vertex u. + SETBIT(visited, u); + vqueue_insert(q, u); + + } + } + + } + vqueue_destroy(q); + free(unused_g_values); + return 0; +} + +static cmph_uint8 next_unused_edge(bmz8_config_data_t *bmz8, cmph_uint8 * used_edges, cmph_uint32 unused_edge_index) +{ + while(1) + { + assert(unused_edge_index < bmz8->m); + if(GETBIT(used_edges, unused_edge_index)) unused_edge_index ++; + else break; + } + return (cmph_uint8)unused_edge_index; +} + +static void bmz8_traverse(bmz8_config_data_t *bmz8, cmph_uint8 * used_edges, cmph_uint32 v, cmph_uint8 * unused_edge_index, cmph_uint8 * visited) +{ + graph_iterator_t it = graph_neighbors_it(bmz8->graph, v); + cmph_uint32 neighbor = 0; + while((neighbor = graph_next_neighbor(bmz8->graph, &it)) != GRAPH_NO_NEIGHBOR) + { + if(GETBIT(visited,neighbor)) continue; + //DEBUGP("Visiting neighbor %u\n", neighbor); + *unused_edge_index = next_unused_edge(bmz8, used_edges, *unused_edge_index); + bmz8->g[neighbor] = (cmph_uint8)(*unused_edge_index - bmz8->g[v]); + //if (bmz8->g[neighbor] >= bmz8->m) bmz8->g[neighbor] += bmz8->m; + SETBIT(visited, neighbor); + (*unused_edge_index)++; + bmz8_traverse(bmz8, used_edges, neighbor, unused_edge_index, visited); + + } +} + +static void bmz8_traverse_non_critical_nodes(bmz8_config_data_t *bmz8, cmph_uint8 * used_edges, cmph_uint8 * visited) +{ + + cmph_uint8 i, v1, v2, unused_edge_index = 0; + DEBUGP("Labelling non critical vertices\n"); + for(i = 0; i < bmz8->m; i++) + { + v1 = (cmph_uint8)graph_vertex_id(bmz8->graph, i, 0); + v2 = (cmph_uint8)graph_vertex_id(bmz8->graph, i, 1); + if((GETBIT(visited,v1) && GETBIT(visited,v2)) || (!GETBIT(visited,v1) && !GETBIT(visited,v2))) continue; + if(GETBIT(visited,v1)) bmz8_traverse(bmz8, used_edges, v1, &unused_edge_index, visited); + else bmz8_traverse(bmz8, used_edges, v2, &unused_edge_index, visited); + + } + + for(i = 0; i < bmz8->n; i++) + { + if(!GETBIT(visited,i)) + { + bmz8->g[i] = 0; + SETBIT(visited, i); + bmz8_traverse(bmz8, used_edges, i, &unused_edge_index, visited); + } + } + +} + +static int bmz8_gen_edges(cmph_config_t *mph) +{ + cmph_uint8 e; + bmz8_config_data_t *bmz8 = (bmz8_config_data_t *)mph->data; + cmph_uint8 multiple_edges = 0; + DEBUGP("Generating edges for %u vertices\n", bmz8->n); + graph_clear_edges(bmz8->graph); + mph->key_source->rewind(mph->key_source->data); + for (e = 0; e < mph->key_source->nkeys; ++e) + { + cmph_uint8 h1, h2; + cmph_uint32 keylen; + char *key = NULL; + mph->key_source->read(mph->key_source->data, &key, &keylen); + +// if (key == NULL)fprintf(stderr, "key = %s -- read BMZ\n", key); + h1 = (cmph_uint8)(hash(bmz8->hashes[0], key, keylen) % bmz8->n); + h2 = (cmph_uint8)(hash(bmz8->hashes[1], key, keylen) % bmz8->n); + if (h1 == h2) if (++h2 >= bmz8->n) h2 = 0; + if (h1 == h2) + { + if (mph->verbosity) fprintf(stderr, "Self loop for key %u\n", e); + mph->key_source->dispose(mph->key_source->data, key, keylen); + return 0; + } + //DEBUGP("Adding edge: %u -> %u for key %s\n", h1, h2, key); + mph->key_source->dispose(mph->key_source->data, key, keylen); +// fprintf(stderr, "key = %s -- dispose BMZ\n", key); + multiple_edges = graph_contains_edge(bmz8->graph, h1, h2); + if (mph->verbosity && multiple_edges) fprintf(stderr, "A non simple graph was generated\n"); + if (multiple_edges) return 0; // checking multiple edge restriction. + graph_add_edge(bmz8->graph, h1, h2); + } + return !multiple_edges; +} + +int bmz8_dump(cmph_t *mphf, FILE *fd) +{ + char *buf = NULL; + cmph_uint32 buflen; + cmph_uint8 two = 2; //number of hash functions + bmz8_data_t *data = (bmz8_data_t *)mphf->data; + register size_t nbytes; + __cmph_dump(mphf, fd); + + nbytes = fwrite(&two, sizeof(cmph_uint8), (size_t)1, fd); + + hash_state_dump(data->hashes[0], &buf, &buflen); + DEBUGP("Dumping hash state with %u bytes to disk\n", buflen); + nbytes = fwrite(&buflen, sizeof(cmph_uint32), (size_t)1, fd); + nbytes = fwrite(buf, (size_t)buflen, (size_t)1, fd); + free(buf); + + hash_state_dump(data->hashes[1], &buf, &buflen); + DEBUGP("Dumping hash state with %u bytes to disk\n", buflen); + nbytes = fwrite(&buflen, sizeof(cmph_uint32), (size_t)1, fd); + nbytes = fwrite(buf, (size_t)buflen, (size_t)1, fd); + free(buf); + + nbytes = fwrite(&(data->n), sizeof(cmph_uint8), (size_t)1, fd); + nbytes = fwrite(&(data->m), sizeof(cmph_uint8), (size_t)1, fd); + + nbytes = fwrite(data->g, sizeof(cmph_uint8)*(data->n), (size_t)1, fd); +/* #ifdef DEBUG + fprintf(stderr, "G: "); + for (i = 0; i < data->n; ++i) fprintf(stderr, "%u ", data->g[i]); + fprintf(stderr, "\n"); + #endif*/ + return 1; +} + +void bmz8_load(FILE *f, cmph_t *mphf) +{ + cmph_uint8 nhashes; + char *buf = NULL; + cmph_uint32 buflen; + cmph_uint8 i; + register size_t nbytes; + bmz8_data_t *bmz8 = (bmz8_data_t *)malloc(sizeof(bmz8_data_t)); + + DEBUGP("Loading bmz8 mphf\n"); + mphf->data = bmz8; + nbytes = fread(&nhashes, sizeof(cmph_uint8), (size_t)1, f); + bmz8->hashes = (hash_state_t **)malloc(sizeof(hash_state_t *)*(size_t)(nhashes + 1)); + bmz8->hashes[nhashes] = NULL; + DEBUGP("Reading %u hashes\n", nhashes); + for (i = 0; i < nhashes; ++i) + { + hash_state_t *state = NULL; + nbytes = fread(&buflen, sizeof(cmph_uint32), (size_t)1, f); + DEBUGP("Hash state has %u bytes\n", buflen); + buf = (char *)malloc((size_t)buflen); + nbytes = fread(buf, (size_t)buflen, (size_t)1, f); + state = hash_state_load(buf, buflen); + bmz8->hashes[i] = state; + free(buf); + } + + DEBUGP("Reading m and n\n"); + nbytes = fread(&(bmz8->n), sizeof(cmph_uint8), (size_t)1, f); + nbytes = fread(&(bmz8->m), sizeof(cmph_uint8), (size_t)1, f); + + bmz8->g = (cmph_uint8 *)malloc(sizeof(cmph_uint8)*bmz8->n); + nbytes = fread(bmz8->g, bmz8->n*sizeof(cmph_uint8), (size_t)1, f); + #ifdef DEBUG + fprintf(stderr, "G: "); + for (i = 0; i < bmz8->n; ++i) fprintf(stderr, "%u ", bmz8->g[i]); + fprintf(stderr, "\n"); + #endif + return; +} + + +cmph_uint8 bmz8_search(cmph_t *mphf, const char *key, cmph_uint32 keylen) +{ + bmz8_data_t *bmz8 = mphf->data; + cmph_uint8 h1 = (cmph_uint8)(hash(bmz8->hashes[0], key, keylen) % bmz8->n); + cmph_uint8 h2 = (cmph_uint8)(hash(bmz8->hashes[1], key, keylen) % bmz8->n); + DEBUGP("key: %s h1: %u h2: %u\n", key, h1, h2); + if (h1 == h2 && ++h2 > bmz8->n) h2 = 0; + DEBUGP("key: %s g[h1]: %u g[h2]: %u edges: %u\n", key, bmz8->g[h1], bmz8->g[h2], bmz8->m); + return (cmph_uint8)(bmz8->g[h1] + bmz8->g[h2]); +} +void bmz8_destroy(cmph_t *mphf) +{ + bmz8_data_t *data = (bmz8_data_t *)mphf->data; + free(data->g); + hash_state_destroy(data->hashes[0]); + hash_state_destroy(data->hashes[1]); + free(data->hashes); + free(data); + free(mphf); +} + +/** \fn void bmz8_pack(cmph_t *mphf, void *packed_mphf); + * \brief Support the ability to pack a perfect hash function into a preallocated contiguous memory space pointed by packed_mphf. + * \param mphf pointer to the resulting mphf + * \param packed_mphf pointer to the contiguous memory area used to store the resulting mphf. The size of packed_mphf must be at least cmph_packed_size() + */ +void bmz8_pack(cmph_t *mphf, void *packed_mphf) +{ + bmz8_data_t *data = (bmz8_data_t *)mphf->data; + cmph_uint8 * ptr = packed_mphf; + + // packing h1 type + CMPH_HASH h1_type = hash_get_type(data->hashes[0]); + *((cmph_uint32 *) ptr) = h1_type; + ptr += sizeof(cmph_uint32); + + // packing h1 + hash_state_pack(data->hashes[0], ptr); + ptr += hash_state_packed_size(h1_type); + + // packing h2 type + CMPH_HASH h2_type = hash_get_type(data->hashes[1]); + *((cmph_uint32 *) ptr) = h2_type; + ptr += sizeof(cmph_uint32); + + // packing h2 + hash_state_pack(data->hashes[1], ptr); + ptr += hash_state_packed_size(h2_type); + + // packing n + *ptr++ = data->n; + + // packing g + memcpy(ptr, data->g, sizeof(cmph_uint8)*data->n); +} + +/** \fn cmph_uint32 bmz8_packed_size(cmph_t *mphf); + * \brief Return the amount of space needed to pack mphf. + * \param mphf pointer to a mphf + * \return the size of the packed function or zero for failures + */ +cmph_uint32 bmz8_packed_size(cmph_t *mphf) +{ + bmz8_data_t *data = (bmz8_data_t *)mphf->data; + CMPH_HASH h1_type = hash_get_type(data->hashes[0]); + CMPH_HASH h2_type = hash_get_type(data->hashes[1]); + + return (cmph_uint32)(sizeof(CMPH_ALGO) + hash_state_packed_size(h1_type) + hash_state_packed_size(h2_type) + + 2*sizeof(cmph_uint32) + sizeof(cmph_uint8) + sizeof(cmph_uint8)*data->n); +} + +/** cmph_uint8 bmz8_search(void *packed_mphf, const char *key, cmph_uint32 keylen); + * \brief Use the packed mphf to do a search. + * \param packed_mphf pointer to the packed mphf + * \param key key to be hashed + * \param keylen key legth in bytes + * \return The mphf value + */ +cmph_uint8 bmz8_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen) +{ + register cmph_uint8 *h1_ptr = packed_mphf; + register CMPH_HASH h1_type = *((cmph_uint32 *)h1_ptr); + h1_ptr += 4; + + register cmph_uint8 *h2_ptr = h1_ptr + hash_state_packed_size(h1_type); + register CMPH_HASH h2_type = *((cmph_uint32 *)h2_ptr); + h2_ptr += 4; + + register cmph_uint8 *g_ptr = h2_ptr + hash_state_packed_size(h2_type); + + register cmph_uint8 n = *g_ptr++; + + register cmph_uint8 h1 = (cmph_uint8)(hash_packed(h1_ptr, h1_type, key, keylen) % n); + register cmph_uint8 h2 = (cmph_uint8)(hash_packed(h2_ptr, h2_type, key, keylen) % n); + DEBUGP("key: %s h1: %u h2: %u\n", key, h1, h2); + if (h1 == h2 && ++h2 > n) h2 = 0; + return (cmph_uint8)(g_ptr[h1] + g_ptr[h2]); +} diff --git a/cmph/bmz8.h b/cmph/bmz8.h new file mode 100644 index 000000000..5456759ef --- /dev/null +++ b/cmph/bmz8.h @@ -0,0 +1,42 @@ +#ifndef __CMPH_BMZ8_H__ +#define __CMPH_BMZ8_H__ + +#include "cmph.h" + +typedef struct __bmz8_data_t bmz8_data_t; +typedef struct __bmz8_config_data_t bmz8_config_data_t; + +bmz8_config_data_t *bmz8_config_new(); +void bmz8_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs); +void bmz8_config_destroy(cmph_config_t *mph); +cmph_t *bmz8_new(cmph_config_t *mph, double c); + +void bmz8_load(FILE *f, cmph_t *mphf); +int bmz8_dump(cmph_t *mphf, FILE *f); +void bmz8_destroy(cmph_t *mphf); +cmph_uint8 bmz8_search(cmph_t *mphf, const char *key, cmph_uint32 keylen); + +/** \fn void bmz8_pack(cmph_t *mphf, void *packed_mphf); + * \brief Support the ability to pack a perfect hash function into a preallocated contiguous memory space pointed by packed_mphf. + * \param mphf pointer to the resulting mphf + * \param packed_mphf pointer to the contiguous memory area used to store the resulting mphf. The size of packed_mphf must be at least cmph_packed_size() + */ +void bmz8_pack(cmph_t *mphf, void *packed_mphf); + +/** \fn cmph_uint32 bmz8_packed_size(cmph_t *mphf); + * \brief Return the amount of space needed to pack mphf. + * \param mphf pointer to a mphf + * \return the size of the packed function or zero for failures + */ +cmph_uint32 bmz8_packed_size(cmph_t *mphf); + +/** cmph_uint8 bmz8_search(void *packed_mphf, const char *key, cmph_uint32 keylen); + * \brief Use the packed mphf to do a search. + * \param packed_mphf pointer to the packed mphf + * \param key key to be hashed + * \param keylen key legth in bytes + * \return The mphf value + */ +cmph_uint8 bmz8_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen); + +#endif diff --git a/cmph/bmz8_structs.h b/cmph/bmz8_structs.h new file mode 100644 index 000000000..408b52998 --- /dev/null +++ b/cmph/bmz8_structs.h @@ -0,0 +1,25 @@ +#ifndef __CMPH_BMZ8_STRUCTS_H__ +#define __CMPH_BMZ8_STRUCTS_H__ + +#include "hash_state.h" + +struct __bmz8_data_t +{ + cmph_uint8 m; //edges (words) count + cmph_uint8 n; //vertex count + cmph_uint8 *g; + hash_state_t **hashes; +}; + + +struct __bmz8_config_data_t +{ + CMPH_HASH hashfuncs[2]; + cmph_uint8 m; //edges (words) count + cmph_uint8 n; //vertex count + graph_t *graph; + cmph_uint8 *g; + hash_state_t **hashes; +}; + +#endif diff --git a/cmph/bmz_structs.h b/cmph/bmz_structs.h new file mode 100644 index 000000000..67065a005 --- /dev/null +++ b/cmph/bmz_structs.h @@ -0,0 +1,25 @@ +#ifndef __CMPH_BMZ_STRUCTS_H__ +#define __CMPH_BMZ_STRUCTS_H__ + +#include "hash_state.h" + +struct __bmz_data_t +{ + cmph_uint32 m; //edges (words) count + cmph_uint32 n; //vertex count + cmph_uint32 *g; + hash_state_t **hashes; +}; + + +struct __bmz_config_data_t +{ + CMPH_HASH hashfuncs[2]; + cmph_uint32 m; //edges (words) count + cmph_uint32 n; //vertex count + graph_t *graph; + cmph_uint32 *g; + hash_state_t **hashes; +}; + +#endif diff --git a/cmph/brz.c b/cmph/brz.c new file mode 100755 index 000000000..eb89ac06c --- /dev/null +++ b/cmph/brz.c @@ -0,0 +1,985 @@ +#include "graph.h" +#include "fch.h" +#include "fch_structs.h" +#include "bmz8.h" +#include "bmz8_structs.h" +#include "brz.h" +#include "cmph_structs.h" +#include "brz_structs.h" +#include "buffer_manager.h" +#include "cmph.h" +#include "hash.h" +#include "bitbool.h" +#include +#include +#include +#include +#include +#define MAX_BUCKET_SIZE 255 +//#define DEBUG +#include "debug.h" + +static int brz_gen_mphf(cmph_config_t *mph); +static cmph_uint32 brz_min_index(cmph_uint32 * vector, cmph_uint32 n); +static void brz_destroy_keys_vd(cmph_uint8 ** keys_vd, cmph_uint32 nkeys); +static char * brz_copy_partial_fch_mphf(brz_config_data_t *brz, fch_data_t * fchf, cmph_uint32 index, cmph_uint32 *buflen); +static char * brz_copy_partial_bmz8_mphf(brz_config_data_t *brz, bmz8_data_t * bmzf, cmph_uint32 index, cmph_uint32 *buflen); +brz_config_data_t *brz_config_new() +{ + brz_config_data_t *brz = NULL; + brz = (brz_config_data_t *)malloc(sizeof(brz_config_data_t)); + brz->algo = CMPH_FCH; + brz->b = 128; + brz->hashfuncs[0] = CMPH_HASH_JENKINS; + brz->hashfuncs[1] = CMPH_HASH_JENKINS; + brz->hashfuncs[2] = CMPH_HASH_JENKINS; + brz->size = NULL; + brz->offset = NULL; + brz->g = NULL; + brz->h1 = NULL; + brz->h2 = NULL; + brz->h0 = NULL; + brz->memory_availability = 1024*1024; + brz->tmp_dir = (cmph_uint8 *)calloc((size_t)10, sizeof(cmph_uint8)); + brz->mphf_fd = NULL; + strcpy((char *)(brz->tmp_dir), "/var/tmp/"); + assert(brz); + return brz; +} + +void brz_config_destroy(cmph_config_t *mph) +{ + brz_config_data_t *data = (brz_config_data_t *)mph->data; + free(data->tmp_dir); + DEBUGP("Destroying algorithm dependent data\n"); + free(data); +} + +void brz_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs) +{ + brz_config_data_t *brz = (brz_config_data_t *)mph->data; + CMPH_HASH *hashptr = hashfuncs; + cmph_uint32 i = 0; + while(*hashptr != CMPH_HASH_COUNT) + { + if (i >= 3) break; //brz only uses three hash functions + brz->hashfuncs[i] = *hashptr; + ++i, ++hashptr; + } +} + +void brz_config_set_memory_availability(cmph_config_t *mph, cmph_uint32 memory_availability) +{ + brz_config_data_t *brz = (brz_config_data_t *)mph->data; + if(memory_availability > 0) brz->memory_availability = memory_availability*1024*1024; +} + +void brz_config_set_tmp_dir(cmph_config_t *mph, cmph_uint8 *tmp_dir) +{ + brz_config_data_t *brz = (brz_config_data_t *)mph->data; + if(tmp_dir) + { + size_t len = strlen((char *)tmp_dir); + free(brz->tmp_dir); + if(tmp_dir[len-1] != '/') + { + brz->tmp_dir = (cmph_uint8 *)calloc((size_t)len+2, sizeof(cmph_uint8)); + sprintf((char *)(brz->tmp_dir), "%s/", (char *)tmp_dir); + } + else + { + brz->tmp_dir = (cmph_uint8 *)calloc((size_t)len+1, sizeof(cmph_uint8)); + sprintf((char *)(brz->tmp_dir), "%s", (char *)tmp_dir); + } + + } +} + +void brz_config_set_mphf_fd(cmph_config_t *mph, FILE *mphf_fd) +{ + brz_config_data_t *brz = (brz_config_data_t *)mph->data; + brz->mphf_fd = mphf_fd; + assert(brz->mphf_fd); +} + +void brz_config_set_b(cmph_config_t *mph, cmph_uint32 b) +{ + brz_config_data_t *brz = (brz_config_data_t *)mph->data; + if(b <= 64 || b >= 175) + { + b = 128; + } + brz->b = (cmph_uint8)b; +} + +void brz_config_set_algo(cmph_config_t *mph, CMPH_ALGO algo) +{ + if (algo == CMPH_BMZ8 || algo == CMPH_FCH) // supported algorithms + { + brz_config_data_t *brz = (brz_config_data_t *)mph->data; + brz->algo = algo; + } +} + +cmph_t *brz_new(cmph_config_t *mph, double c) +{ + cmph_t *mphf = NULL; + brz_data_t *brzf = NULL; + cmph_uint32 i; + cmph_uint32 iterations = 20; + + DEBUGP("c: %f\n", c); + brz_config_data_t *brz = (brz_config_data_t *)mph->data; + switch(brz->algo) // validating restrictions over parameter c. + { + case CMPH_BMZ8: + if (c == 0 || c >= 2.0) c = 1; + break; + case CMPH_FCH: + if (c <= 2.0) c = 2.6; + break; + default: + assert(0); + } + brz->c = c; + brz->m = mph->key_source->nkeys; + DEBUGP("m: %u\n", brz->m); + brz->k = (cmph_uint32)ceil(brz->m/((double)brz->b)); + DEBUGP("k: %u\n", brz->k); + brz->size = (cmph_uint8 *) calloc((size_t)brz->k, sizeof(cmph_uint8)); + + // Clustering the keys by graph id. + if (mph->verbosity) + { + fprintf(stderr, "Partioning the set of keys.\n"); + } + + while(1) + { + int ok; + DEBUGP("hash function 3\n"); + brz->h0 = hash_state_new(brz->hashfuncs[2], brz->k); + DEBUGP("Generating graphs\n"); + ok = brz_gen_mphf(mph); + if (!ok) + { + --iterations; + hash_state_destroy(brz->h0); + brz->h0 = NULL; + DEBUGP("%u iterations remaining to create the graphs in a external file\n", iterations); + if (mph->verbosity) + { + fprintf(stderr, "Failure: A graph with more than 255 keys was created - %u iterations remaining\n", iterations); + } + if (iterations == 0) break; + } + else break; + } + if (iterations == 0) + { + DEBUGP("Graphs with more than 255 keys were created in all 20 iterations\n"); + free(brz->size); + return NULL; + } + DEBUGP("Graphs generated\n"); + + brz->offset = (cmph_uint32 *)calloc((size_t)brz->k, sizeof(cmph_uint32)); + for (i = 1; i < brz->k; ++i) + { + brz->offset[i] = brz->size[i-1] + brz->offset[i-1]; + } + // Generating a mphf + mphf = (cmph_t *)malloc(sizeof(cmph_t)); + mphf->algo = mph->algo; + brzf = (brz_data_t *)malloc(sizeof(brz_data_t)); + brzf->g = brz->g; + brz->g = NULL; //transfer memory ownership + brzf->h1 = brz->h1; + brz->h1 = NULL; //transfer memory ownership + brzf->h2 = brz->h2; + brz->h2 = NULL; //transfer memory ownership + brzf->h0 = brz->h0; + brz->h0 = NULL; //transfer memory ownership + brzf->size = brz->size; + brz->size = NULL; //transfer memory ownership + brzf->offset = brz->offset; + brz->offset = NULL; //transfer memory ownership + brzf->k = brz->k; + brzf->c = brz->c; + brzf->m = brz->m; + brzf->algo = brz->algo; + mphf->data = brzf; + mphf->size = brz->m; + DEBUGP("Successfully generated minimal perfect hash\n"); + if (mph->verbosity) + { + fprintf(stderr, "Successfully generated minimal perfect hash function\n"); + } + return mphf; +} + +static int brz_gen_mphf(cmph_config_t *mph) +{ + cmph_uint32 i, e, error; + brz_config_data_t *brz = (brz_config_data_t *)mph->data; + cmph_uint32 memory_usage = 0; + cmph_uint32 nkeys_in_buffer = 0; + cmph_uint8 *buffer = (cmph_uint8 *)malloc((size_t)brz->memory_availability); + cmph_uint32 *buckets_size = (cmph_uint32 *)calloc((size_t)brz->k, sizeof(cmph_uint32)); + cmph_uint32 *keys_index = NULL; + cmph_uint8 **buffer_merge = NULL; + cmph_uint32 *buffer_h0 = NULL; + cmph_uint32 nflushes = 0; + cmph_uint32 h0; + register size_t nbytes; + FILE * tmp_fd = NULL; + buffer_manager_t * buff_manager = NULL; + char *filename = NULL; + char *key = NULL; + cmph_uint32 keylen; + cmph_uint32 cur_bucket = 0; + cmph_uint8 nkeys_vd = 0; + cmph_uint8 ** keys_vd = NULL; + + mph->key_source->rewind(mph->key_source->data); + DEBUGP("Generating graphs from %u keys\n", brz->m); + // Partitioning + for (e = 0; e < brz->m; ++e) + { + mph->key_source->read(mph->key_source->data, &key, &keylen); + + /* Buffers management */ + if (memory_usage + keylen + sizeof(keylen) > brz->memory_availability) // flush buffers + { + if(mph->verbosity) + { + fprintf(stderr, "Flushing %u\n", nkeys_in_buffer); + } + cmph_uint32 value = buckets_size[0]; + cmph_uint32 sum = 0; + cmph_uint32 keylen1 = 0; + buckets_size[0] = 0; + for(i = 1; i < brz->k; i++) + { + if(buckets_size[i] == 0) continue; + sum += value; + value = buckets_size[i]; + buckets_size[i] = sum; + + } + memory_usage = 0; + keys_index = (cmph_uint32 *)calloc((size_t)nkeys_in_buffer, sizeof(cmph_uint32)); + for(i = 0; i < nkeys_in_buffer; i++) + { + memcpy(&keylen1, buffer + memory_usage, sizeof(keylen1)); + h0 = hash(brz->h0, (char *)(buffer + memory_usage + sizeof(keylen1)), keylen1) % brz->k; + keys_index[buckets_size[h0]] = memory_usage; + buckets_size[h0]++; + memory_usage += keylen1 + (cmph_uint32)sizeof(keylen1); + } + filename = (char *)calloc(strlen((char *)(brz->tmp_dir)) + 11, sizeof(char)); + sprintf(filename, "%s%u.cmph",brz->tmp_dir, nflushes); + tmp_fd = fopen(filename, "wb"); + free(filename); + filename = NULL; + for(i = 0; i < nkeys_in_buffer; i++) + { + memcpy(&keylen1, buffer + keys_index[i], sizeof(keylen1)); + nbytes = fwrite(buffer + keys_index[i], (size_t)1, keylen1 + sizeof(keylen1), tmp_fd); + } + nkeys_in_buffer = 0; + memory_usage = 0; + memset((void *)buckets_size, 0, brz->k*sizeof(cmph_uint32)); + nflushes++; + free(keys_index); + fclose(tmp_fd); + } + memcpy(buffer + memory_usage, &keylen, sizeof(keylen)); + memcpy(buffer + memory_usage + sizeof(keylen), key, (size_t)keylen); + memory_usage += keylen + (cmph_uint32)sizeof(keylen); + h0 = hash(brz->h0, key, keylen) % brz->k; + + if ((brz->size[h0] == MAX_BUCKET_SIZE) || (brz->algo == CMPH_BMZ8 && ((brz->c >= 1.0) && (cmph_uint8)(brz->c * brz->size[h0]) < brz->size[h0]))) + { + free(buffer); + free(buckets_size); + return 0; + } + brz->size[h0] = (cmph_uint8)(brz->size[h0] + 1U); + buckets_size[h0] ++; + nkeys_in_buffer++; + mph->key_source->dispose(mph->key_source->data, key, keylen); + } + if (memory_usage != 0) // flush buffers + { + if(mph->verbosity) + { + fprintf(stderr, "Flushing %u\n", nkeys_in_buffer); + } + cmph_uint32 value = buckets_size[0]; + cmph_uint32 sum = 0; + cmph_uint32 keylen1 = 0; + buckets_size[0] = 0; + for(i = 1; i < brz->k; i++) + { + if(buckets_size[i] == 0) continue; + sum += value; + value = buckets_size[i]; + buckets_size[i] = sum; + } + memory_usage = 0; + keys_index = (cmph_uint32 *)calloc((size_t)nkeys_in_buffer, sizeof(cmph_uint32)); + for(i = 0; i < nkeys_in_buffer; i++) + { + memcpy(&keylen1, buffer + memory_usage, sizeof(keylen1)); + h0 = hash(brz->h0, (char *)(buffer + memory_usage + sizeof(keylen1)), keylen1) % brz->k; + keys_index[buckets_size[h0]] = memory_usage; + buckets_size[h0]++; + memory_usage += keylen1 + (cmph_uint32)sizeof(keylen1); + } + filename = (char *)calloc(strlen((char *)(brz->tmp_dir)) + 11, sizeof(char)); + sprintf(filename, "%s%u.cmph",brz->tmp_dir, nflushes); + tmp_fd = fopen(filename, "wb"); + free(filename); + filename = NULL; + for(i = 0; i < nkeys_in_buffer; i++) + { + memcpy(&keylen1, buffer + keys_index[i], sizeof(keylen1)); + nbytes = fwrite(buffer + keys_index[i], (size_t)1, keylen1 + sizeof(keylen1), tmp_fd); + } + nkeys_in_buffer = 0; + memory_usage = 0; + memset((void *)buckets_size, 0, brz->k*sizeof(cmph_uint32)); + nflushes++; + free(keys_index); + fclose(tmp_fd); + } + + free(buffer); + free(buckets_size); + if(nflushes > 1024) return 0; // Too many files generated. + // mphf generation + if(mph->verbosity) + { + fprintf(stderr, "\nMPHF generation \n"); + } + /* Starting to dump to disk the resultant MPHF: __cmph_dump function */ + nbytes = fwrite(cmph_names[CMPH_BRZ], (size_t)(strlen(cmph_names[CMPH_BRZ]) + 1), (size_t)1, brz->mphf_fd); + nbytes = fwrite(&(brz->m), sizeof(brz->m), (size_t)1, brz->mphf_fd); + nbytes = fwrite(&(brz->c), sizeof(double), (size_t)1, brz->mphf_fd); + nbytes = fwrite(&(brz->algo), sizeof(brz->algo), (size_t)1, brz->mphf_fd); + nbytes = fwrite(&(brz->k), sizeof(cmph_uint32), (size_t)1, brz->mphf_fd); // number of MPHFs + nbytes = fwrite(brz->size, sizeof(cmph_uint8)*(brz->k), (size_t)1, brz->mphf_fd); + + //tmp_fds = (FILE **)calloc(nflushes, sizeof(FILE *)); + buff_manager = buffer_manager_new(brz->memory_availability, nflushes); + buffer_merge = (cmph_uint8 **)calloc((size_t)nflushes, sizeof(cmph_uint8 *)); + buffer_h0 = (cmph_uint32 *)calloc((size_t)nflushes, sizeof(cmph_uint32)); + + memory_usage = 0; + for(i = 0; i < nflushes; i++) + { + filename = (char *)calloc(strlen((char *)(brz->tmp_dir)) + 11, sizeof(char)); + sprintf(filename, "%s%u.cmph",brz->tmp_dir, i); + buffer_manager_open(buff_manager, i, filename); + free(filename); + filename = NULL; + key = (char *)buffer_manager_read_key(buff_manager, i, &keylen); + h0 = hash(brz->h0, key+sizeof(keylen), keylen) % brz->k; + buffer_h0[i] = h0; + buffer_merge[i] = (cmph_uint8 *)key; + key = NULL; //transfer memory ownership + } + e = 0; + keys_vd = (cmph_uint8 **)calloc((size_t)MAX_BUCKET_SIZE, sizeof(cmph_uint8 *)); + nkeys_vd = 0; + error = 0; + while(e < brz->m) + { + i = brz_min_index(buffer_h0, nflushes); + cur_bucket = buffer_h0[i]; + key = (char *)buffer_manager_read_key(buff_manager, i, &keylen); + if(key) + { + while(key) + { + //keylen = strlen(key); + h0 = hash(brz->h0, key+sizeof(keylen), keylen) % brz->k; + if (h0 != buffer_h0[i]) break; + keys_vd[nkeys_vd++] = (cmph_uint8 *)key; + key = NULL; //transfer memory ownership + e++; + key = (char *)buffer_manager_read_key(buff_manager, i, &keylen); + } + if (key) + { + assert(nkeys_vd < brz->size[cur_bucket]); + keys_vd[nkeys_vd++] = buffer_merge[i]; + buffer_merge[i] = NULL; //transfer memory ownership + e++; + buffer_h0[i] = h0; + buffer_merge[i] = (cmph_uint8 *)key; + } + } + if(!key) + { + assert(nkeys_vd < brz->size[cur_bucket]); + keys_vd[nkeys_vd++] = buffer_merge[i]; + buffer_merge[i] = NULL; //transfer memory ownership + e++; + buffer_h0[i] = UINT_MAX; + } + + if(nkeys_vd == brz->size[cur_bucket]) // Generating mphf for each bucket. + { + cmph_io_adapter_t *source = NULL; + cmph_config_t *config = NULL; + cmph_t *mphf_tmp = NULL; + char *bufmphf = NULL; + cmph_uint32 buflenmphf = 0; + // Source of keys + source = cmph_io_byte_vector_adapter(keys_vd, (cmph_uint32)nkeys_vd); + config = cmph_config_new(source); + cmph_config_set_algo(config, brz->algo); + //cmph_config_set_algo(config, CMPH_BMZ8); + cmph_config_set_graphsize(config, brz->c); + mphf_tmp = cmph_new(config); + if (mphf_tmp == NULL) + { + if(mph->verbosity) fprintf(stderr, "ERROR: Can't generate MPHF for bucket %u out of %u\n", cur_bucket + 1, brz->k); + error = 1; + cmph_config_destroy(config); + brz_destroy_keys_vd(keys_vd, nkeys_vd); + cmph_io_byte_vector_adapter_destroy(source); + break; + } + if(mph->verbosity) + { + if (cur_bucket % 1000 == 0) + { + fprintf(stderr, "MPHF for bucket %u out of %u was generated.\n", cur_bucket + 1, brz->k); + } + } + switch(brz->algo) + { + case CMPH_FCH: + { + fch_data_t * fchf = NULL; + fchf = (fch_data_t *)mphf_tmp->data; + bufmphf = brz_copy_partial_fch_mphf(brz, fchf, cur_bucket, &buflenmphf); + } + break; + case CMPH_BMZ8: + { + bmz8_data_t * bmzf = NULL; + bmzf = (bmz8_data_t *)mphf_tmp->data; + bufmphf = brz_copy_partial_bmz8_mphf(brz, bmzf, cur_bucket, &buflenmphf); + } + break; + default: assert(0); + } + nbytes = fwrite(bufmphf, (size_t)buflenmphf, (size_t)1, brz->mphf_fd); + free(bufmphf); + bufmphf = NULL; + cmph_config_destroy(config); + brz_destroy_keys_vd(keys_vd, nkeys_vd); + cmph_destroy(mphf_tmp); + cmph_io_byte_vector_adapter_destroy(source); + nkeys_vd = 0; + } + } + buffer_manager_destroy(buff_manager); + free(keys_vd); + free(buffer_merge); + free(buffer_h0); + if (error) return 0; + return 1; +} + +static cmph_uint32 brz_min_index(cmph_uint32 * vector, cmph_uint32 n) +{ + cmph_uint32 i, min_index = 0; + for(i = 1; i < n; i++) + { + if(vector[i] < vector[min_index]) min_index = i; + } + return min_index; +} + +static void brz_destroy_keys_vd(cmph_uint8 ** keys_vd, cmph_uint32 nkeys) +{ + cmph_uint8 i; + for(i = 0; i < nkeys; i++) { free(keys_vd[i]); keys_vd[i] = NULL;} +} + +static char * brz_copy_partial_fch_mphf(brz_config_data_t *brz, fch_data_t * fchf, cmph_uint32 index, cmph_uint32 *buflen) +{ + cmph_uint32 i = 0; + cmph_uint32 buflenh1 = 0; + cmph_uint32 buflenh2 = 0; + char * bufh1 = NULL; + char * bufh2 = NULL; + char * buf = NULL; + cmph_uint32 n = fchf->b;//brz->size[index]; + hash_state_dump(fchf->h1, &bufh1, &buflenh1); + hash_state_dump(fchf->h2, &bufh2, &buflenh2); + *buflen = buflenh1 + buflenh2 + n + 2U * (cmph_uint32)sizeof(cmph_uint32); + buf = (char *)malloc((size_t)(*buflen)); + memcpy(buf, &buflenh1, sizeof(cmph_uint32)); + memcpy(buf+sizeof(cmph_uint32), bufh1, (size_t)buflenh1); + memcpy(buf+sizeof(cmph_uint32)+buflenh1, &buflenh2, sizeof(cmph_uint32)); + memcpy(buf+2*sizeof(cmph_uint32)+buflenh1, bufh2, (size_t)buflenh2); + for (i = 0; i < n; i++) memcpy(buf+2*sizeof(cmph_uint32)+buflenh1+buflenh2+i,(fchf->g + i), (size_t)1); + free(bufh1); + free(bufh2); + return buf; +} +static char * brz_copy_partial_bmz8_mphf(brz_config_data_t *brz, bmz8_data_t * bmzf, cmph_uint32 index, cmph_uint32 *buflen) +{ + cmph_uint32 buflenh1 = 0; + cmph_uint32 buflenh2 = 0; + char * bufh1 = NULL; + char * bufh2 = NULL; + char * buf = NULL; + cmph_uint32 n = (cmph_uint32)ceil(brz->c * brz->size[index]); + hash_state_dump(bmzf->hashes[0], &bufh1, &buflenh1); + hash_state_dump(bmzf->hashes[1], &bufh2, &buflenh2); + *buflen = buflenh1 + buflenh2 + n + 2U * (cmph_uint32)sizeof(cmph_uint32); + buf = (char *)malloc((size_t)(*buflen)); + memcpy(buf, &buflenh1, sizeof(cmph_uint32)); + memcpy(buf+sizeof(cmph_uint32), bufh1, (size_t)buflenh1); + memcpy(buf+sizeof(cmph_uint32)+buflenh1, &buflenh2, sizeof(cmph_uint32)); + memcpy(buf+2*sizeof(cmph_uint32)+buflenh1, bufh2, (size_t)buflenh2); + memcpy(buf+2*sizeof(cmph_uint32)+buflenh1+buflenh2,bmzf->g, (size_t)n); + free(bufh1); + free(bufh2); + return buf; +} + + +int brz_dump(cmph_t *mphf, FILE *fd) +{ + brz_data_t *data = (brz_data_t *)mphf->data; + char *buf = NULL; + cmph_uint32 buflen; + register size_t nbytes; + DEBUGP("Dumping brzf\n"); + // The initial part of the MPHF have already been dumped to disk during construction + // Dumping h0 + hash_state_dump(data->h0, &buf, &buflen); + DEBUGP("Dumping hash state with %u bytes to disk\n", buflen); + nbytes = fwrite(&buflen, sizeof(cmph_uint32), (size_t)1, fd); + nbytes = fwrite(buf, (size_t)buflen, (size_t)1, fd); + free(buf); + // Dumping m and the vector offset. + nbytes = fwrite(&(data->m), sizeof(cmph_uint32), (size_t)1, fd); + nbytes = fwrite(data->offset, sizeof(cmph_uint32)*(data->k), (size_t)1, fd); + return 1; +} + +void brz_load(FILE *f, cmph_t *mphf) +{ + char *buf = NULL; + cmph_uint32 buflen; + register size_t nbytes; + cmph_uint32 i, n; + brz_data_t *brz = (brz_data_t *)malloc(sizeof(brz_data_t)); + + DEBUGP("Loading brz mphf\n"); + mphf->data = brz; + nbytes = fread(&(brz->c), sizeof(double), (size_t)1, f); + nbytes = fread(&(brz->algo), sizeof(brz->algo), (size_t)1, f); // Reading algo. + nbytes = fread(&(brz->k), sizeof(cmph_uint32), (size_t)1, f); + brz->size = (cmph_uint8 *) malloc(sizeof(cmph_uint8)*brz->k); + nbytes = fread(brz->size, sizeof(cmph_uint8)*(brz->k), (size_t)1, f); + brz->h1 = (hash_state_t **)malloc(sizeof(hash_state_t *)*brz->k); + brz->h2 = (hash_state_t **)malloc(sizeof(hash_state_t *)*brz->k); + brz->g = (cmph_uint8 **) calloc((size_t)brz->k, sizeof(cmph_uint8 *)); + DEBUGP("Reading c = %f k = %u algo = %u \n", brz->c, brz->k, brz->algo); + //loading h_i1, h_i2 and g_i. + for(i = 0; i < brz->k; i++) + { + // h1 + nbytes = fread(&buflen, sizeof(cmph_uint32), (size_t)1, f); + DEBUGP("Hash state 1 has %u bytes\n", buflen); + buf = (char *)malloc((size_t)buflen); + nbytes = fread(buf, (size_t)buflen, (size_t)1, f); + brz->h1[i] = hash_state_load(buf, buflen); + free(buf); + //h2 + nbytes = fread(&buflen, sizeof(cmph_uint32), (size_t)1, f); + DEBUGP("Hash state 2 has %u bytes\n", buflen); + buf = (char *)malloc((size_t)buflen); + nbytes = fread(buf, (size_t)buflen, (size_t)1, f); + brz->h2[i] = hash_state_load(buf, buflen); + free(buf); + switch(brz->algo) + { + case CMPH_FCH: + n = fch_calc_b(brz->c, brz->size[i]); + break; + case CMPH_BMZ8: + n = (cmph_uint32)ceil(brz->c * brz->size[i]); + break; + default: assert(0); + } + DEBUGP("g_i has %u bytes\n", n); + brz->g[i] = (cmph_uint8 *)calloc((size_t)n, sizeof(cmph_uint8)); + nbytes = fread(brz->g[i], sizeof(cmph_uint8)*n, (size_t)1, f); + } + //loading h0 + nbytes = fread(&buflen, sizeof(cmph_uint32), (size_t)1, f); + DEBUGP("Hash state has %u bytes\n", buflen); + buf = (char *)malloc((size_t)buflen); + nbytes = fread(buf, (size_t)buflen, (size_t)1, f); + brz->h0 = hash_state_load(buf, buflen); + free(buf); + + //loading c, m, and the vector offset. + nbytes = fread(&(brz->m), sizeof(cmph_uint32), (size_t)1, f); + brz->offset = (cmph_uint32 *)malloc(sizeof(cmph_uint32)*brz->k); + nbytes = fread(brz->offset, sizeof(cmph_uint32)*(brz->k), (size_t)1, f); + return; +} + +static cmph_uint32 brz_bmz8_search(brz_data_t *brz, const char *key, cmph_uint32 keylen, cmph_uint32 * fingerprint) +{ + register cmph_uint32 h0; + + hash_vector(brz->h0, key, keylen, fingerprint); + h0 = fingerprint[2] % brz->k; + + register cmph_uint32 m = brz->size[h0]; + register cmph_uint32 n = (cmph_uint32)ceil(brz->c * m); + register cmph_uint32 h1 = hash(brz->h1[h0], key, keylen) % n; + register cmph_uint32 h2 = hash(brz->h2[h0], key, keylen) % n; + register cmph_uint8 mphf_bucket; + + if (h1 == h2 && ++h2 >= n) h2 = 0; + mphf_bucket = (cmph_uint8)(brz->g[h0][h1] + brz->g[h0][h2]); + DEBUGP("key: %s h1: %u h2: %u h0: %u\n", key, h1, h2, h0); + DEBUGP("key: %s g[h1]: %u g[h2]: %u offset[h0]: %u edges: %u\n", key, brz->g[h0][h1], brz->g[h0][h2], brz->offset[h0], brz->m); + DEBUGP("Address: %u\n", mphf_bucket + brz->offset[h0]); + return (mphf_bucket + brz->offset[h0]); +} + +static cmph_uint32 brz_fch_search(brz_data_t *brz, const char *key, cmph_uint32 keylen, cmph_uint32 * fingerprint) +{ + register cmph_uint32 h0; + + hash_vector(brz->h0, key, keylen, fingerprint); + h0 = fingerprint[2] % brz->k; + + register cmph_uint32 m = brz->size[h0]; + register cmph_uint32 b = fch_calc_b(brz->c, m); + register double p1 = fch_calc_p1(m); + register double p2 = fch_calc_p2(b); + register cmph_uint32 h1 = hash(brz->h1[h0], key, keylen) % m; + register cmph_uint32 h2 = hash(brz->h2[h0], key, keylen) % m; + register cmph_uint8 mphf_bucket = 0; + h1 = mixh10h11h12(b, p1, p2, h1); + mphf_bucket = (cmph_uint8)((h2 + brz->g[h0][h1]) % m); + return (mphf_bucket + brz->offset[h0]); +} + +cmph_uint32 brz_search(cmph_t *mphf, const char *key, cmph_uint32 keylen) +{ + brz_data_t *brz = mphf->data; + cmph_uint32 fingerprint[3]; + switch(brz->algo) + { + case CMPH_FCH: + return brz_fch_search(brz, key, keylen, fingerprint); + case CMPH_BMZ8: + return brz_bmz8_search(brz, key, keylen, fingerprint); + default: assert(0); + } + return 0; +} +void brz_destroy(cmph_t *mphf) +{ + cmph_uint32 i; + brz_data_t *data = (brz_data_t *)mphf->data; + if(data->g) + { + for(i = 0; i < data->k; i++) + { + free(data->g[i]); + hash_state_destroy(data->h1[i]); + hash_state_destroy(data->h2[i]); + } + free(data->g); + free(data->h1); + free(data->h2); + } + hash_state_destroy(data->h0); + free(data->size); + free(data->offset); + free(data); + free(mphf); +} + +/** \fn void brz_pack(cmph_t *mphf, void *packed_mphf); + * \brief Support the ability to pack a perfect hash function into a preallocated contiguous memory space pointed by packed_mphf. + * \param mphf pointer to the resulting mphf + * \param packed_mphf pointer to the contiguous memory area used to store the resulting mphf. The size of packed_mphf must be at least cmph_packed_size() + */ +void brz_pack(cmph_t *mphf, void *packed_mphf) +{ + brz_data_t *data = (brz_data_t *)mphf->data; + cmph_uint8 * ptr = packed_mphf; + cmph_uint32 i,n; + + // packing internal algo type + memcpy(ptr, &(data->algo), sizeof(data->algo)); + ptr += sizeof(data->algo); + + // packing h0 type + CMPH_HASH h0_type = hash_get_type(data->h0); + memcpy(ptr, &h0_type, sizeof(h0_type)); + ptr += sizeof(h0_type); + + // packing h0 + hash_state_pack(data->h0, ptr); + ptr += hash_state_packed_size(h0_type); + + // packing k + memcpy(ptr, &(data->k), sizeof(data->k)); + ptr += sizeof(data->k); + + // packing c + *((cmph_uint64 *)ptr) = (cmph_uint64)data->c; + ptr += sizeof(data->c); + + // packing h1 type + CMPH_HASH h1_type = hash_get_type(data->h1[0]); + memcpy(ptr, &h1_type, sizeof(h1_type)); + ptr += sizeof(h1_type); + + // packing h2 type + CMPH_HASH h2_type = hash_get_type(data->h2[0]); + memcpy(ptr, &h2_type, sizeof(h2_type)); + ptr += sizeof(h2_type); + + // packing size + memcpy(ptr, data->size, sizeof(cmph_uint8)*data->k); + ptr += data->k; + + // packing offset + memcpy(ptr, data->offset, sizeof(cmph_uint32)*data->k); + ptr += sizeof(cmph_uint32)*data->k; + + #if defined (__ia64) || defined (__x86_64__) + cmph_uint64 * g_is_ptr = (cmph_uint64 *)ptr; + #else + cmph_uint32 * g_is_ptr = (cmph_uint32 *)ptr; + #endif + + cmph_uint8 * g_i = (cmph_uint8 *) (g_is_ptr + data->k); + + for(i = 0; i < data->k; i++) + { + #if defined (__ia64) || defined (__x86_64__) + *g_is_ptr++ = (cmph_uint64)g_i; + #else + *g_is_ptr++ = (cmph_uint32)g_i; + #endif + // packing h1[i] + hash_state_pack(data->h1[i], g_i); + g_i += hash_state_packed_size(h1_type); + + // packing h2[i] + hash_state_pack(data->h2[i], g_i); + g_i += hash_state_packed_size(h2_type); + + // packing g_i + switch(data->algo) + { + case CMPH_FCH: + n = fch_calc_b(data->c, data->size[i]); + break; + case CMPH_BMZ8: + n = (cmph_uint32)ceil(data->c * data->size[i]); + break; + default: assert(0); + } + memcpy(g_i, data->g[i], sizeof(cmph_uint8)*n); + g_i += n; + + } + +} + +/** \fn cmph_uint32 brz_packed_size(cmph_t *mphf); + * \brief Return the amount of space needed to pack mphf. + * \param mphf pointer to a mphf + * \return the size of the packed function or zero for failures + */ +cmph_uint32 brz_packed_size(cmph_t *mphf) +{ + cmph_uint32 i; + cmph_uint32 size = 0; + brz_data_t *data = (brz_data_t *)mphf->data; + CMPH_HASH h0_type = hash_get_type(data->h0); + CMPH_HASH h1_type = hash_get_type(data->h1[0]); + CMPH_HASH h2_type = hash_get_type(data->h2[0]); + size = (cmph_uint32)(2*sizeof(CMPH_ALGO) + 3*sizeof(CMPH_HASH) + hash_state_packed_size(h0_type) + sizeof(cmph_uint32) + + sizeof(double) + sizeof(cmph_uint8)*data->k + sizeof(cmph_uint32)*data->k); + // pointers to g_is + #if defined (__ia64) || defined (__x86_64__) + size += (cmph_uint32) sizeof(cmph_uint64)*data->k; + #else + size += (cmph_uint32) sizeof(cmph_uint32)*data->k; + #endif + + size += hash_state_packed_size(h1_type) * data->k; + size += hash_state_packed_size(h2_type) * data->k; + + cmph_uint32 n = 0; + for(i = 0; i < data->k; i++) + { + switch(data->algo) + { + case CMPH_FCH: + n = fch_calc_b(data->c, data->size[i]); + break; + case CMPH_BMZ8: + n = (cmph_uint32)ceil(data->c * data->size[i]); + break; + default: assert(0); + } + size += n; + } + return size; +} + + + +static cmph_uint32 brz_bmz8_search_packed(cmph_uint32 *packed_mphf, const char *key, cmph_uint32 keylen, cmph_uint32 * fingerprint) +{ + register CMPH_HASH h0_type = *packed_mphf++; + register cmph_uint32 *h0_ptr = packed_mphf; + packed_mphf = (cmph_uint32 *)(((cmph_uint8 *)packed_mphf) + hash_state_packed_size(h0_type)); + + register cmph_uint32 k = *packed_mphf++; + + register double c = (double)(*((cmph_uint64*)packed_mphf)); + packed_mphf += 2; + + register CMPH_HASH h1_type = *packed_mphf++; + + register CMPH_HASH h2_type = *packed_mphf++; + + register cmph_uint8 * size = (cmph_uint8 *) packed_mphf; + packed_mphf = (cmph_uint32 *)(size + k); + + register cmph_uint32 * offset = packed_mphf; + packed_mphf += k; + + register cmph_uint32 h0; + + hash_vector_packed(h0_ptr, h0_type, key, keylen, fingerprint); + h0 = fingerprint[2] % k; + + register cmph_uint32 m = size[h0]; + register cmph_uint32 n = (cmph_uint32)ceil(c * m); + + #if defined (__ia64) || defined (__x86_64__) + register cmph_uint64 * g_is_ptr = (cmph_uint64 *)packed_mphf; + #else + register cmph_uint32 * g_is_ptr = packed_mphf; + #endif + + register cmph_uint8 * h1_ptr = (cmph_uint8 *) g_is_ptr[h0]; + + register cmph_uint8 * h2_ptr = h1_ptr + hash_state_packed_size(h1_type); + + register cmph_uint8 * g = h2_ptr + hash_state_packed_size(h2_type); + + register cmph_uint32 h1 = hash_packed(h1_ptr, h1_type, key, keylen) % n; + register cmph_uint32 h2 = hash_packed(h2_ptr, h2_type, key, keylen) % n; + + register cmph_uint8 mphf_bucket; + + if (h1 == h2 && ++h2 >= n) h2 = 0; + mphf_bucket = (cmph_uint8)(g[h1] + g[h2]); + DEBUGP("key: %s h1: %u h2: %u h0: %u\n", key, h1, h2, h0); + DEBUGP("Address: %u\n", mphf_bucket + offset[h0]); + return (mphf_bucket + offset[h0]); +} + +static cmph_uint32 brz_fch_search_packed(cmph_uint32 *packed_mphf, const char *key, cmph_uint32 keylen, cmph_uint32 * fingerprint) +{ + register CMPH_HASH h0_type = *packed_mphf++; + + register cmph_uint32 *h0_ptr = packed_mphf; + packed_mphf = (cmph_uint32 *)(((cmph_uint8 *)packed_mphf) + hash_state_packed_size(h0_type)); + + register cmph_uint32 k = *packed_mphf++; + + register double c = (double)(*((cmph_uint64*)packed_mphf)); + packed_mphf += 2; + + register CMPH_HASH h1_type = *packed_mphf++; + + register CMPH_HASH h2_type = *packed_mphf++; + + register cmph_uint8 * size = (cmph_uint8 *) packed_mphf; + packed_mphf = (cmph_uint32 *)(size + k); + + register cmph_uint32 * offset = packed_mphf; + packed_mphf += k; + + register cmph_uint32 h0; + + hash_vector_packed(h0_ptr, h0_type, key, keylen, fingerprint); + h0 = fingerprint[2] % k; + + register cmph_uint32 m = size[h0]; + register cmph_uint32 b = fch_calc_b(c, m); + register double p1 = fch_calc_p1(m); + register double p2 = fch_calc_p2(b); + + #if defined (__ia64) || defined (__x86_64__) + register cmph_uint64 * g_is_ptr = (cmph_uint64 *)packed_mphf; + #else + register cmph_uint32 * g_is_ptr = packed_mphf; + #endif + + register cmph_uint8 * h1_ptr = (cmph_uint8 *) g_is_ptr[h0]; + + register cmph_uint8 * h2_ptr = h1_ptr + hash_state_packed_size(h1_type); + + register cmph_uint8 * g = h2_ptr + hash_state_packed_size(h2_type); + + register cmph_uint32 h1 = hash_packed(h1_ptr, h1_type, key, keylen) % m; + register cmph_uint32 h2 = hash_packed(h2_ptr, h2_type, key, keylen) % m; + + register cmph_uint8 mphf_bucket = 0; + h1 = mixh10h11h12(b, p1, p2, h1); + mphf_bucket = (cmph_uint8)((h2 + g[h1]) % m); + return (mphf_bucket + offset[h0]); +} + +/** cmph_uint32 brz_search(void *packed_mphf, const char *key, cmph_uint32 keylen); + * \brief Use the packed mphf to do a search. + * \param packed_mphf pointer to the packed mphf + * \param key key to be hashed + * \param keylen key legth in bytes + * \return The mphf value + */ +cmph_uint32 brz_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen) +{ + register cmph_uint32 *ptr = (cmph_uint32 *)packed_mphf; + register CMPH_ALGO algo = *ptr++; + cmph_uint32 fingerprint[3]; + switch(algo) + { + case CMPH_FCH: + return brz_fch_search_packed(ptr, key, keylen, fingerprint); + case CMPH_BMZ8: + return brz_bmz8_search_packed(ptr, key, keylen, fingerprint); + default: assert(0); + } +} + diff --git a/cmph/brz.h b/cmph/brz.h new file mode 100644 index 000000000..ac07ed762 --- /dev/null +++ b/cmph/brz.h @@ -0,0 +1,47 @@ +#ifndef __CMPH_BRZ_H__ +#define __CMPH_BRZ_H__ + +#include "cmph.h" + +typedef struct __brz_data_t brz_data_t; +typedef struct __brz_config_data_t brz_config_data_t; + +brz_config_data_t *brz_config_new(); +void brz_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs); +void brz_config_set_tmp_dir(cmph_config_t *mph, cmph_uint8 *tmp_dir); +void brz_config_set_mphf_fd(cmph_config_t *mph, FILE *mphf_fd); +void brz_config_set_b(cmph_config_t *mph, cmph_uint32 b); +void brz_config_set_algo(cmph_config_t *mph, CMPH_ALGO algo); +void brz_config_set_memory_availability(cmph_config_t *mph, cmph_uint32 memory_availability); +void brz_config_destroy(cmph_config_t *mph); +cmph_t *brz_new(cmph_config_t *mph, double c); + +void brz_load(FILE *f, cmph_t *mphf); +int brz_dump(cmph_t *mphf, FILE *f); +void brz_destroy(cmph_t *mphf); +cmph_uint32 brz_search(cmph_t *mphf, const char *key, cmph_uint32 keylen); + +/** \fn void brz_pack(cmph_t *mphf, void *packed_mphf); + * \brief Support the ability to pack a perfect hash function into a preallocated contiguous memory space pointed by packed_mphf. + * \param mphf pointer to the resulting mphf + * \param packed_mphf pointer to the contiguous memory area used to store the resulting mphf. The size of packed_mphf must be at least cmph_packed_size() + */ +void brz_pack(cmph_t *mphf, void *packed_mphf); + +/** \fn cmph_uint32 brz_packed_size(cmph_t *mphf); + * \brief Return the amount of space needed to pack mphf. + * \param mphf pointer to a mphf + * \return the size of the packed function or zero for failures + */ +cmph_uint32 brz_packed_size(cmph_t *mphf); + +/** cmph_uint32 brz_search(void *packed_mphf, const char *key, cmph_uint32 keylen); + * \brief Use the packed mphf to do a search. + * \param packed_mphf pointer to the packed mphf + * \param key key to be hashed + * \param keylen key legth in bytes + * \return The mphf value + */ +cmph_uint32 brz_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen); + +#endif diff --git a/cmph/brz_structs.h b/cmph/brz_structs.h new file mode 100755 index 000000000..b781107f9 --- /dev/null +++ b/cmph/brz_structs.h @@ -0,0 +1,39 @@ +#ifndef __CMPH_BRZ_STRUCTS_H__ +#define __CMPH_BRZ_STRUCTS_H__ + +#include "hash_state.h" + +struct __brz_data_t +{ + CMPH_ALGO algo; // CMPH algo for generating the MPHFs for the buckets (Just CMPH_FCH and CMPH_BMZ8) + cmph_uint32 m; // edges (words) count + double c; // constant c + cmph_uint8 *size; // size[i] stores the number of edges represented by g[i][...]. + cmph_uint32 *offset; // offset[i] stores the sum: size[0] + size[1] + ... size[i-1]. + cmph_uint8 **g; // g function. + cmph_uint32 k; // number of components + hash_state_t **h1; + hash_state_t **h2; + hash_state_t * h0; +}; + +struct __brz_config_data_t +{ + CMPH_HASH hashfuncs[3]; + CMPH_ALGO algo; // CMPH algo for generating the MPHFs for the buckets (Just CMPH_FCH and CMPH_BMZ8) + double c; // constant c + cmph_uint32 m; // edges (words) count + cmph_uint8 *size; // size[i] stores the number of edges represented by g[i][...]. + cmph_uint32 *offset; // offset[i] stores the sum: size[0] + size[1] + ... size[i-1]. + cmph_uint8 **g; // g function. + cmph_uint8 b; // parameter b. + cmph_uint32 k; // number of components + hash_state_t **h1; + hash_state_t **h2; + hash_state_t * h0; + cmph_uint32 memory_availability; + cmph_uint8 * tmp_dir; // temporary directory + FILE * mphf_fd; // mphf file +}; + +#endif diff --git a/cmph/buffer_entry.c b/cmph/buffer_entry.c new file mode 100644 index 000000000..7f82aae13 --- /dev/null +++ b/cmph/buffer_entry.c @@ -0,0 +1,103 @@ +#include "buffer_entry.h" +#include +#include +#include +#include + +struct __buffer_entry_t +{ + FILE *fd; + cmph_uint8 * buff; + cmph_uint32 capacity, // buffer entry capacity + nbytes, // buffer entry used bytes + pos; // current read position in buffer entry + cmph_uint8 eof; // flag to indicate end of file +}; + +buffer_entry_t * buffer_entry_new(cmph_uint32 capacity) +{ + buffer_entry_t *buff_entry = (buffer_entry_t *)malloc(sizeof(buffer_entry_t)); + assert(buff_entry); + buff_entry->fd = NULL; + buff_entry->buff = NULL; + buff_entry->capacity = capacity; + buff_entry->nbytes = capacity; + buff_entry->pos = capacity; + buff_entry->eof = 0; + return buff_entry; +} + +void buffer_entry_open(buffer_entry_t * buffer_entry, char * filename) +{ + buffer_entry->fd = fopen(filename, "rb"); +} + +void buffer_entry_set_capacity(buffer_entry_t * buffer_entry, cmph_uint32 capacity) +{ + buffer_entry->capacity = capacity; +} + + +cmph_uint32 buffer_entry_get_capacity(buffer_entry_t * buffer_entry) +{ + return buffer_entry->capacity; +} + +void buffer_entry_load(buffer_entry_t * buffer_entry) +{ + free(buffer_entry->buff); + buffer_entry->buff = (cmph_uint8 *)calloc((size_t)buffer_entry->capacity, sizeof(cmph_uint8)); + buffer_entry->nbytes = (cmph_uint32)fread(buffer_entry->buff, (size_t)1, (size_t)buffer_entry->capacity, buffer_entry->fd); + if (buffer_entry->nbytes != buffer_entry->capacity) buffer_entry->eof = 1; + buffer_entry->pos = 0; +} + +cmph_uint8 * buffer_entry_read_key(buffer_entry_t * buffer_entry, cmph_uint32 * keylen) +{ + cmph_uint8 * buf = NULL; + cmph_uint32 lacked_bytes = sizeof(*keylen); + cmph_uint32 copied_bytes = 0; + if(buffer_entry->eof && (buffer_entry->pos == buffer_entry->nbytes)) // end + { + free(buf); + return NULL; + } + if((buffer_entry->pos + lacked_bytes) > buffer_entry->nbytes) + { + copied_bytes = buffer_entry->nbytes - buffer_entry->pos; + lacked_bytes = (buffer_entry->pos + lacked_bytes) - buffer_entry->nbytes; + if (copied_bytes != 0) memcpy(keylen, buffer_entry->buff + buffer_entry->pos, (size_t)copied_bytes); + buffer_entry_load(buffer_entry); + } + memcpy(keylen + copied_bytes, buffer_entry->buff + buffer_entry->pos, (size_t)lacked_bytes); + buffer_entry->pos += lacked_bytes; + + lacked_bytes = *keylen; + copied_bytes = 0; + buf = (cmph_uint8 *)malloc(*keylen + sizeof(*keylen)); + memcpy(buf, keylen, sizeof(*keylen)); + if((buffer_entry->pos + lacked_bytes) > buffer_entry->nbytes) { + copied_bytes = buffer_entry->nbytes - buffer_entry->pos; + lacked_bytes = (buffer_entry->pos + lacked_bytes) - buffer_entry->nbytes; + if (copied_bytes != 0) { + memcpy(buf + sizeof(*keylen), buffer_entry->buff + buffer_entry->pos, (size_t)copied_bytes); + } + buffer_entry_load(buffer_entry); + } + memcpy(buf+sizeof(*keylen)+copied_bytes, buffer_entry->buff + buffer_entry->pos, (size_t)lacked_bytes); + buffer_entry->pos += lacked_bytes; + return buf; +} + +void buffer_entry_destroy(buffer_entry_t * buffer_entry) +{ + fclose(buffer_entry->fd); + buffer_entry->fd = NULL; + free(buffer_entry->buff); + buffer_entry->buff = NULL; + buffer_entry->capacity = 0; + buffer_entry->nbytes = 0; + buffer_entry->pos = 0; + buffer_entry->eof = 0; + free(buffer_entry); +} diff --git a/cmph/buffer_entry.h b/cmph/buffer_entry.h new file mode 100644 index 000000000..62102baba --- /dev/null +++ b/cmph/buffer_entry.h @@ -0,0 +1,14 @@ +#ifndef __CMPH_BUFFER_ENTRY_H__ +#define __CMPH_BUFFER_ENTRY_H__ + +#include "cmph_types.h" +#include +typedef struct __buffer_entry_t buffer_entry_t; + +buffer_entry_t * buffer_entry_new(cmph_uint32 capacity); +void buffer_entry_set_capacity(buffer_entry_t * buffer_entry, cmph_uint32 capacity); +cmph_uint32 buffer_entry_get_capacity(buffer_entry_t * buffer_entry); +void buffer_entry_open(buffer_entry_t * buffer_entry, char * filename); +cmph_uint8 * buffer_entry_read_key(buffer_entry_t * buffer_entry, cmph_uint32 * keylen); +void buffer_entry_destroy(buffer_entry_t * buffer_entry); +#endif diff --git a/cmph/buffer_manage.c b/cmph/buffer_manage.c new file mode 100644 index 000000000..fdefc620d --- /dev/null +++ b/cmph/buffer_manage.c @@ -0,0 +1,66 @@ +#include "buffer_manage.h" +#include "buffer_entry.h" +#include +#include +#include +struct __buffer_manage_t +{ + cmph_uint32 memory_avail; // memory available + buffer_entry_t ** buffer_entries; // buffer entries to be managed + cmph_uint32 nentries; // number of entries to be managed + cmph_uint32 *memory_avail_list; // memory available list + int pos_avail_list; // current position in memory available list +}; + +buffer_manage_t * buffer_manage_new(cmph_uint32 memory_avail, cmph_uint32 nentries) +{ + cmph_uint32 memory_avail_entry, i; + buffer_manage_t *buff_manage = (buffer_manage_t *)malloc(sizeof(buffer_manage_t)); + assert(buff_manage); + buff_manage->memory_avail = memory_avail; + buff_manage->buffer_entries = (buffer_entry_t **)calloc((size_t)nentries, sizeof(buffer_entry_t *)); + buff_manage->memory_avail_list = (cmph_uint32 *)calloc((size_t)nentries, sizeof(cmph_uint32)); + buff_manage->pos_avail_list = -1; + buff_manage->nentries = nentries; + memory_avail_entry = buff_manage->memory_avail/buff_manage->nentries + 1; + for(i = 0; i < buff_manage->nentries; i++) + { + buff_manage->buffer_entries[i] = buffer_entry_new(memory_avail_entry); + } + return buff_manage; +} + +void buffer_manage_open(buffer_manage_t * buffer_manage, cmph_uint32 index, char * filename) +{ + buffer_entry_open(buffer_manage->buffer_entries[index], filename); +} + +cmph_uint8 * buffer_manage_read_key(buffer_manage_t * buffer_manage, cmph_uint32 index) +{ + cmph_uint8 * key = NULL; + if (buffer_manage->pos_avail_list >= 0 ) // recovering memory + { + cmph_uint32 new_capacity = buffer_entry_get_capacity(buffer_manage->buffer_entries[index]) + buffer_manage->memory_avail_list[(buffer_manage->pos_avail_list)--]; + buffer_entry_set_capacity(buffer_manage->buffer_entries[index], new_capacity); + //fprintf(stderr, "recovering memory\n"); + } + key = buffer_entry_read_key(buffer_manage->buffer_entries[index]); + if (key == NULL) // storing memory to be recovered + { + buffer_manage->memory_avail_list[++(buffer_manage->pos_avail_list)] = buffer_entry_get_capacity(buffer_manage->buffer_entries[index]); + //fprintf(stderr, "storing memory to be recovered\n"); + } + return key; +} + +void buffer_manage_destroy(buffer_manage_t * buffer_manage) +{ + cmph_uint32 i; + for(i = 0; i < buffer_manage->nentries; i++) + { + buffer_entry_destroy(buffer_manage->buffer_entries[i]); + } + free(buffer_manage->memory_avail_list); + free(buffer_manage->buffer_entries); + free(buffer_manage); +} diff --git a/cmph/buffer_manage.h b/cmph/buffer_manage.h new file mode 100644 index 000000000..8c66cffc1 --- /dev/null +++ b/cmph/buffer_manage.h @@ -0,0 +1,12 @@ +#ifndef __CMPH_BUFFER_MANAGE_H__ +#define __CMPH_BUFFER_MANAGE_H__ + +#include "cmph_types.h" +#include +typedef struct __buffer_manage_t buffer_manage_t; + +buffer_manage_t * buffer_manage_new(cmph_uint32 memory_avail, cmph_uint32 nentries); +void buffer_manage_open(buffer_manage_t * buffer_manage, cmph_uint32 index, char * filename); +cmph_uint8 * buffer_manage_read_key(buffer_manage_t * buffer_manage, cmph_uint32 index); +void buffer_manage_destroy(buffer_manage_t * buffer_manage); +#endif diff --git a/cmph/buffer_manager.c b/cmph/buffer_manager.c new file mode 100644 index 000000000..5a051e2f0 --- /dev/null +++ b/cmph/buffer_manager.c @@ -0,0 +1,64 @@ +#include "buffer_manager.h" +#include "buffer_entry.h" +#include +#include +#include +struct __buffer_manager_t +{ + cmph_uint32 memory_avail; // memory available + buffer_entry_t ** buffer_entries; // buffer entries to be managed + cmph_uint32 nentries; // number of entries to be managed + cmph_uint32 *memory_avail_list; // memory available list + int pos_avail_list; // current position in memory available list +}; + +buffer_manager_t * buffer_manager_new(cmph_uint32 memory_avail, cmph_uint32 nentries) +{ + cmph_uint32 memory_avail_entry, i; + buffer_manager_t *buff_manager = (buffer_manager_t *)malloc(sizeof(buffer_manager_t)); + assert(buff_manager); + buff_manager->memory_avail = memory_avail; + buff_manager->buffer_entries = (buffer_entry_t **)calloc((size_t)nentries, sizeof(buffer_entry_t *)); + buff_manager->memory_avail_list = (cmph_uint32 *)calloc((size_t)nentries, sizeof(cmph_uint32)); + buff_manager->pos_avail_list = -1; + buff_manager->nentries = nentries; + memory_avail_entry = buff_manager->memory_avail/buff_manager->nentries + 1; + for(i = 0; i < buff_manager->nentries; i++) + { + buff_manager->buffer_entries[i] = buffer_entry_new(memory_avail_entry); + } + return buff_manager; +} + +void buffer_manager_open(buffer_manager_t * buffer_manager, cmph_uint32 index, char * filename) +{ + buffer_entry_open(buffer_manager->buffer_entries[index], filename); +} + +cmph_uint8 * buffer_manager_read_key(buffer_manager_t * buffer_manager, cmph_uint32 index, cmph_uint32 * keylen) +{ + cmph_uint8 * key = NULL; + if (buffer_manager->pos_avail_list >= 0 ) // recovering memory + { + cmph_uint32 new_capacity = buffer_entry_get_capacity(buffer_manager->buffer_entries[index]) + buffer_manager->memory_avail_list[(buffer_manager->pos_avail_list)--]; + buffer_entry_set_capacity(buffer_manager->buffer_entries[index], new_capacity); + } + key = buffer_entry_read_key(buffer_manager->buffer_entries[index], keylen); + if (key == NULL) // storing memory to be recovered + { + buffer_manager->memory_avail_list[++(buffer_manager->pos_avail_list)] = buffer_entry_get_capacity(buffer_manager->buffer_entries[index]); + } + return key; +} + +void buffer_manager_destroy(buffer_manager_t * buffer_manager) +{ + cmph_uint32 i; + for(i = 0; i < buffer_manager->nentries; i++) + { + buffer_entry_destroy(buffer_manager->buffer_entries[i]); + } + free(buffer_manager->memory_avail_list); + free(buffer_manager->buffer_entries); + free(buffer_manager); +} diff --git a/cmph/buffer_manager.h b/cmph/buffer_manager.h new file mode 100644 index 000000000..af99c20f6 --- /dev/null +++ b/cmph/buffer_manager.h @@ -0,0 +1,12 @@ +#ifndef __CMPH_BUFFER_MANAGE_H__ +#define __CMPH_BUFFER_MANAGE_H__ + +#include "cmph_types.h" +#include +typedef struct __buffer_manager_t buffer_manager_t; + +buffer_manager_t * buffer_manager_new(cmph_uint32 memory_avail, cmph_uint32 nentries); +void buffer_manager_open(buffer_manager_t * buffer_manager, cmph_uint32 index, char * filename); +cmph_uint8 * buffer_manager_read_key(buffer_manager_t * buffer_manager, cmph_uint32 index, cmph_uint32 * keylen); +void buffer_manager_destroy(buffer_manager_t * buffer_manager); +#endif diff --git a/cmph/chd.c b/cmph/chd.c new file mode 100644 index 000000000..7fb3b8bb8 --- /dev/null +++ b/cmph/chd.c @@ -0,0 +1,271 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "cmph_structs.h" +#include "chd_structs.h" +#include "chd.h" +#include "bitbool.h" +//#define DEBUG +#include "debug.h" + +chd_config_data_t *chd_config_new(cmph_config_t *mph) +{ + cmph_io_adapter_t *key_source = mph->key_source; + chd_config_data_t *chd; + chd = (chd_config_data_t *)malloc(sizeof(chd_config_data_t)); + assert(chd); + memset(chd, 0, sizeof(chd_config_data_t)); + + chd->chd_ph = cmph_config_new(key_source); + cmph_config_set_algo(chd->chd_ph, CMPH_CHD_PH); + + return chd; +} + +void chd_config_destroy(cmph_config_t *mph) +{ + chd_config_data_t *data = (chd_config_data_t *) mph->data; + DEBUGP("Destroying algorithm dependent data\n"); + if(data->chd_ph) + { + cmph_config_destroy(data->chd_ph); + data->chd_ph = NULL; + } + free(data); +} + + +void chd_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs) +{ + chd_config_data_t *data = (chd_config_data_t *) mph->data; + cmph_config_set_hashfuncs(data->chd_ph, hashfuncs); +} + + +void chd_config_set_b(cmph_config_t *mph, cmph_uint32 keys_per_bucket) +{ + chd_config_data_t *data = (chd_config_data_t *) mph->data; + cmph_config_set_b(data->chd_ph, keys_per_bucket); +} + + +void chd_config_set_keys_per_bin(cmph_config_t *mph, cmph_uint32 keys_per_bin) +{ + chd_config_data_t *data = (chd_config_data_t *) mph->data; + cmph_config_set_keys_per_bin(data->chd_ph, keys_per_bin); +} + + +cmph_t *chd_new(cmph_config_t *mph, double c) +{ + cmph_t *mphf = NULL; + chd_data_t *chdf = NULL; + chd_config_data_t *chd = (chd_config_data_t *)mph->data; + chd_ph_config_data_t * chd_ph = (chd_ph_config_data_t *)chd->chd_ph->data; + compressed_rank_t cr; + + register cmph_t * chd_phf = NULL; + register cmph_uint32 packed_chd_phf_size = 0; + cmph_uint8 * packed_chd_phf = NULL; + + register cmph_uint32 packed_cr_size = 0; + cmph_uint8 * packed_cr = NULL; + + register cmph_uint32 i, idx, nkeys, nvals, nbins; + cmph_uint32 * vals_table = NULL; + register cmph_uint32 * occup_table = NULL; + #ifdef CMPH_TIMING + double construction_time_begin = 0.0; + double construction_time = 0.0; + ELAPSED_TIME_IN_SECONDS(&construction_time_begin); + #endif + + cmph_config_set_verbosity(chd->chd_ph, mph->verbosity); + cmph_config_set_graphsize(chd->chd_ph, c); + + if (mph->verbosity) + { + fprintf(stderr, "Generating a CHD_PH perfect hash function with a load factor equal to %.3f\n", c); + } + + chd_phf = cmph_new(chd->chd_ph); + + if(chd_phf == NULL) + { + return NULL; + } + + packed_chd_phf_size = cmph_packed_size(chd_phf); + DEBUGP("packed_chd_phf_size = %u\n", packed_chd_phf_size); + + /* Make sure that we have enough space to pack the mphf. */ + packed_chd_phf = calloc((size_t)packed_chd_phf_size,(size_t)1); + + /* Pack the mphf. */ + cmph_pack(chd_phf, packed_chd_phf); + + cmph_destroy(chd_phf); + + + if (mph->verbosity) + { + fprintf(stderr, "Compressing the range of the resulting CHD_PH perfect hash function\n"); + } + + compressed_rank_init(&cr); + nbins = chd_ph->n; + nkeys = chd_ph->m; + nvals = nbins - nkeys; + + vals_table = (cmph_uint32 *)calloc(nvals, sizeof(cmph_uint32)); + occup_table = (cmph_uint32 *)chd_ph->occup_table; + + for(i = 0, idx = 0; i < nbins; i++) + { + if(!GETBIT32(occup_table, i)) + { + vals_table[idx++] = i; + } + } + + compressed_rank_generate(&cr, vals_table, nvals); + free(vals_table); + + packed_cr_size = compressed_rank_packed_size(&cr); + packed_cr = (cmph_uint8 *) calloc(packed_cr_size, sizeof(cmph_uint8)); + compressed_rank_pack(&cr, packed_cr); + compressed_rank_destroy(&cr); + + mphf = (cmph_t *)malloc(sizeof(cmph_t)); + mphf->algo = mph->algo; + chdf = (chd_data_t *)malloc(sizeof(chd_data_t)); + + chdf->packed_cr = packed_cr; + packed_cr = NULL; //transfer memory ownership + + chdf->packed_chd_phf = packed_chd_phf; + packed_chd_phf = NULL; //transfer memory ownership + + chdf->packed_chd_phf_size = packed_chd_phf_size; + chdf->packed_cr_size = packed_cr_size; + + mphf->data = chdf; + mphf->size = nkeys; + + DEBUGP("Successfully generated minimal perfect hash\n"); + if (mph->verbosity) + { + fprintf(stderr, "Successfully generated minimal perfect hash function\n"); + } + #ifdef CMPH_TIMING + ELAPSED_TIME_IN_SECONDS(&construction_time); + register cmph_uint32 space_usage = chd_packed_size(mphf)*8; + construction_time = construction_time - construction_time_begin; + fprintf(stdout, "%u\t%.2f\t%u\t%.4f\t%.4f\n", nkeys, c, chd_ph->keys_per_bucket, construction_time, space_usage/(double)nkeys); + #endif + + return mphf; +} + +void chd_load(FILE *fd, cmph_t *mphf) +{ + register size_t nbytes; + chd_data_t *chd = (chd_data_t *)malloc(sizeof(chd_data_t)); + + DEBUGP("Loading chd mphf\n"); + mphf->data = chd; + + nbytes = fread(&chd->packed_chd_phf_size, sizeof(cmph_uint32), (size_t)1, fd); + DEBUGP("Loading CHD_PH perfect hash function with %u bytes to disk\n", chd->packed_chd_phf_size); + chd->packed_chd_phf = (cmph_uint8 *) calloc((size_t)chd->packed_chd_phf_size,(size_t)1); + nbytes = fread(chd->packed_chd_phf, chd->packed_chd_phf_size, (size_t)1, fd); + + nbytes = fread(&chd->packed_cr_size, sizeof(cmph_uint32), (size_t)1, fd); + DEBUGP("Loading Compressed rank structure, which has %u bytes\n", chd->packed_cr_size); + chd->packed_cr = (cmph_uint8 *) calloc((size_t)chd->packed_cr_size, (size_t)1); + nbytes = fread(chd->packed_cr, chd->packed_cr_size, (size_t)1, fd); +} + +int chd_dump(cmph_t *mphf, FILE *fd) +{ + register size_t nbytes; + chd_data_t *data = (chd_data_t *)mphf->data; + + __cmph_dump(mphf, fd); + // Dumping CHD_PH perfect hash function + + DEBUGP("Dumping CHD_PH perfect hash function with %u bytes to disk\n", data->packed_chd_phf_size); + nbytes = fwrite(&data->packed_chd_phf_size, sizeof(cmph_uint32), (size_t)1, fd); + nbytes = fwrite(data->packed_chd_phf, data->packed_chd_phf_size, (size_t)1, fd); + + DEBUGP("Dumping compressed rank structure with %u bytes to disk\n", buflen); + nbytes = fwrite(&data->packed_cr_size, sizeof(cmph_uint32), (size_t)1, fd); + nbytes = fwrite(data->packed_cr, data->packed_cr_size, (size_t)1, fd); + + return 1; +} + +void chd_destroy(cmph_t *mphf) +{ + chd_data_t *data = (chd_data_t *)mphf->data; + free(data->packed_chd_phf); + free(data->packed_cr); + free(data); + free(mphf); +} + +static inline cmph_uint32 _chd_search(void * packed_chd_phf, void * packed_cr, const char *key, cmph_uint32 keylen) +{ + register cmph_uint32 bin_idx = cmph_search_packed(packed_chd_phf, key, keylen); + register cmph_uint32 rank = compressed_rank_query_packed(packed_cr, bin_idx); + return bin_idx - rank; +} + +cmph_uint32 chd_search(cmph_t *mphf, const char *key, cmph_uint32 keylen) +{ + register chd_data_t * chd = mphf->data; + return _chd_search(chd->packed_chd_phf, chd->packed_cr, key, keylen); +} + +void chd_pack(cmph_t *mphf, void *packed_mphf) +{ + chd_data_t *data = (chd_data_t *)mphf->data; + cmph_uint32 * ptr = packed_mphf; + cmph_uint8 * ptr8; + + // packing packed_cr_size and packed_cr + *ptr = data->packed_cr_size; + ptr8 = (cmph_uint8 *) (ptr + 1); + + memcpy(ptr8, data->packed_cr, data->packed_cr_size); + ptr8 += data->packed_cr_size; + + ptr = (cmph_uint32 *) ptr8; + *ptr = data->packed_chd_phf_size; + + ptr8 = (cmph_uint8 *) (ptr + 1); + memcpy(ptr8, data->packed_chd_phf, data->packed_chd_phf_size); +} + +cmph_uint32 chd_packed_size(cmph_t *mphf) +{ + register chd_data_t *data = (chd_data_t *)mphf->data; + return (cmph_uint32)(sizeof(CMPH_ALGO) + 2*sizeof(cmph_uint32) + data->packed_cr_size + data->packed_chd_phf_size); + +} + +cmph_uint32 chd_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen) +{ + + register cmph_uint32 * ptr = packed_mphf; + register cmph_uint32 packed_cr_size = *ptr++; + register cmph_uint8 * packed_chd_phf = ((cmph_uint8 *) ptr) + packed_cr_size + sizeof(cmph_uint32); + return _chd_search(packed_chd_phf, ptr, key, keylen); +} + + diff --git a/cmph/chd.h b/cmph/chd.h new file mode 100644 index 000000000..e829df816 --- /dev/null +++ b/cmph/chd.h @@ -0,0 +1,59 @@ +#ifndef _CMPH_CHD_H__ +#define _CMPH_CHD_H__ + +#include "cmph.h" + +typedef struct __chd_data_t chd_data_t; +typedef struct __chd_config_data_t chd_config_data_t; + +/* Config API */ +chd_config_data_t *chd_config_new(cmph_config_t * mph); +void chd_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs); + +/** \fn void chd_config_set_keys_per_bin(cmph_config_t *mph, cmph_uint32 keys_per_bin); + * \brief Allows to set the number of keys per bin. + * \param mph pointer to the configuration structure + * \param keys_per_bin value for the number of keys per bin + */ +void chd_config_set_keys_per_bin(cmph_config_t *mph, cmph_uint32 keys_per_bin); + +/** \fn void chd_config_set_b(cmph_config_t *mph, cmph_uint32 keys_per_bucket); + * \brief Allows to set the number of keys per bucket. + * \param mph pointer to the configuration structure + * \param keys_per_bucket value for the number of keys per bucket + */ +void chd_config_set_b(cmph_config_t *mph, cmph_uint32 keys_per_bucket); +void chd_config_destroy(cmph_config_t *mph); + + +/* Chd algorithm API */ +cmph_t *chd_new(cmph_config_t *mph, double c); +void chd_load(FILE *fd, cmph_t *mphf); +int chd_dump(cmph_t *mphf, FILE *fd); +void chd_destroy(cmph_t *mphf); +cmph_uint32 chd_search(cmph_t *mphf, const char *key, cmph_uint32 keylen); + +/** \fn void chd_pack(cmph_t *mphf, void *packed_mphf); + * \brief Support the ability to pack a perfect hash function into a preallocated contiguous memory space pointed by packed_mphf. + * \param mphf pointer to the resulting mphf + * \param packed_mphf pointer to the contiguous memory area used to store the resulting mphf. The size of packed_mphf must be at least cmph_packed_size() + */ +void chd_pack(cmph_t *mphf, void *packed_mphf); + +/** \fn cmph_uint32 chd_packed_size(cmph_t *mphf); + * \brief Return the amount of space needed to pack mphf. + * \param mphf pointer to a mphf + * \return the size of the packed function or zero for failures + */ +cmph_uint32 chd_packed_size(cmph_t *mphf); + +/** cmph_uint32 chd_search(void *packed_mphf, const char *key, cmph_uint32 keylen); + * \brief Use the packed mphf to do a search. + * \param packed_mphf pointer to the packed mphf + * \param key key to be hashed + * \param keylen key legth in bytes + * \return The mphf value + */ +cmph_uint32 chd_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen); + +#endif diff --git a/cmph/chd_ph.c b/cmph/chd_ph.c new file mode 100644 index 000000000..b34415b93 --- /dev/null +++ b/cmph/chd_ph.c @@ -0,0 +1,988 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "cmph_structs.h" +#include "chd_structs_ph.h" +#include "chd_ph.h" +#include"miller_rabin.h" +#include"bitbool.h" + + +//#define DEBUG +#include "debug.h" + +// NO_ELEMENT is equivalent to null pointer +#ifndef NO_ELEMENT +#define NO_ELEMENT UINT_MAX +#endif + +// struct used to represent items at mapping, ordering and searching phases +struct _chd_ph_item_t +{ + cmph_uint32 f; + cmph_uint32 h; +}; +typedef struct _chd_ph_item_t chd_ph_item_t; + +// struct to represent the items at mapping phase only. +struct _chd_ph_map_item_t +{ + cmph_uint32 f; + cmph_uint32 h; + cmph_uint32 bucket_num; +}; +typedef struct _chd_ph_map_item_t chd_ph_map_item_t; + +// struct to represent a bucket +struct _chd_ph_bucket_t +{ + cmph_uint32 items_list; // offset + union + { + cmph_uint32 size; + cmph_uint32 bucket_id; + }; +}; + +typedef struct _chd_ph_bucket_t chd_ph_bucket_t; + +struct _chd_ph_sorted_list_t +{ + cmph_uint32 buckets_list; + cmph_uint32 size; +}; + +typedef struct _chd_ph_sorted_list_t chd_ph_sorted_list_t; + + +static inline chd_ph_bucket_t * chd_ph_bucket_new(cmph_uint32 nbuckets); +static inline void chd_ph_bucket_clean(chd_ph_bucket_t * buckets, cmph_uint32 nbuckets); +static inline void chd_ph_bucket_destroy(chd_ph_bucket_t * buckets); + +chd_ph_bucket_t * chd_ph_bucket_new(cmph_uint32 nbuckets) +{ + chd_ph_bucket_t * buckets = (chd_ph_bucket_t *) calloc(nbuckets, sizeof(chd_ph_bucket_t)); + return buckets; +} + +void chd_ph_bucket_clean(chd_ph_bucket_t * buckets, cmph_uint32 nbuckets) +{ + register cmph_uint32 i = 0; + assert(buckets); + for(i = 0; i < nbuckets; i++) + buckets[i].size = 0; +} +cmph_uint8 chd_ph_bucket_insert(chd_ph_bucket_t * buckets,chd_ph_map_item_t * map_items, chd_ph_item_t * items, + cmph_uint32 nbuckets,cmph_uint32 item_idx) +{ + register cmph_uint32 i = 0; + register chd_ph_item_t * tmp_item; + register chd_ph_map_item_t * tmp_map_item = map_items + item_idx; + register chd_ph_bucket_t * bucket = buckets + tmp_map_item->bucket_num; + tmp_item = items + bucket->items_list; + + for(i = 0; i < bucket->size; i++) + { + if(tmp_item->f == tmp_map_item->f && tmp_item->h == tmp_map_item->h) + { + DEBUGP("Item not added\n"); + return 0; + }; + tmp_item++; + }; + tmp_item->f = tmp_map_item->f; + tmp_item->h = tmp_map_item->h; + bucket->size++; + return 1; +}; +void chd_ph_bucket_destroy(chd_ph_bucket_t * buckets) +{ + free(buckets); +} + +static inline cmph_uint8 chd_ph_mapping(cmph_config_t *mph, chd_ph_bucket_t * buckets, chd_ph_item_t * items, + cmph_uint32 *max_bucket_size); + +static chd_ph_sorted_list_t * chd_ph_ordering(chd_ph_bucket_t ** _buckets,chd_ph_item_t ** items, + cmph_uint32 nbuckets,cmph_uint32 nitems, cmph_uint32 max_bucket_size); + +static cmph_uint8 chd_ph_searching(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t *buckets, chd_ph_item_t *items , + cmph_uint32 max_bucket_size, chd_ph_sorted_list_t *sorted_lists, cmph_uint32 max_probes, cmph_uint32 * disp_table); + +static inline double chd_ph_space_lower_bound(cmph_uint32 _n, cmph_uint32 _r) +{ + double r = _r, n = _n; + return (1 + (r/n - 1.0 + 1.0/(2.0*n))*log(1 - n/r))/log(2); +}; + +/* computes the entropy of non empty buckets.*/ +static inline double chd_ph_get_entropy(cmph_uint32 * disp_table, cmph_uint32 n, cmph_uint32 max_probes) +{ + register cmph_uint32 * probe_counts = (cmph_uint32 *) calloc(max_probes, sizeof(cmph_uint32)); + register cmph_uint32 i; + register double entropy = 0; + + for(i = 0; i < n; i++) + { + probe_counts[disp_table[i]]++; + }; + + for(i = 0; i < max_probes; i++) + { + if(probe_counts[i] > 0) + entropy -= probe_counts[i]*log((double)probe_counts[i]/(double)n)/log(2); + }; + free(probe_counts); + return entropy; +}; + +chd_ph_config_data_t *chd_ph_config_new() +{ + chd_ph_config_data_t *chd_ph; + chd_ph = (chd_ph_config_data_t *)malloc(sizeof(chd_ph_config_data_t)); + assert(chd_ph); + memset(chd_ph, 0, sizeof(chd_ph_config_data_t)); + + chd_ph->hashfunc = CMPH_HASH_JENKINS; + chd_ph->cs = NULL; + chd_ph->nbuckets = 0; + chd_ph->n = 0; + chd_ph->hl = NULL; + + chd_ph->m = 0; + chd_ph->use_h = 1; + chd_ph->keys_per_bin = 1; + chd_ph->keys_per_bucket = 4; + chd_ph->occup_table = 0; + + return chd_ph; +} + +void chd_ph_config_destroy(cmph_config_t *mph) +{ + chd_ph_config_data_t *data = (chd_ph_config_data_t *) mph->data; + DEBUGP("Destroying algorithm dependent data\n"); + if(data->occup_table) + { + free(data->occup_table); + data->occup_table = NULL; + } + free(data); +} + + +void chd_ph_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs) +{ + chd_ph_config_data_t *chd_ph = (chd_ph_config_data_t *)mph->data; + CMPH_HASH *hashptr = hashfuncs; + cmph_uint32 i = 0; + while(*hashptr != CMPH_HASH_COUNT) + { + if (i >= 1) break; //chd_ph only uses one linear hash function + chd_ph->hashfunc = *hashptr; + ++i, ++hashptr; + } +} + + +void chd_ph_config_set_b(cmph_config_t *mph, cmph_uint32 keys_per_bucket) +{ + assert(mph); + chd_ph_config_data_t *chd_ph = (chd_ph_config_data_t *)mph->data; + if(keys_per_bucket < 1 || keys_per_bucket >= 15) + { + keys_per_bucket = 4; + } + chd_ph->keys_per_bucket = keys_per_bucket; +} + + +void chd_ph_config_set_keys_per_bin(cmph_config_t *mph, cmph_uint32 keys_per_bin) +{ + assert(mph); + chd_ph_config_data_t *chd_ph = (chd_ph_config_data_t *)mph->data; + if(keys_per_bin <= 1 || keys_per_bin >= 128) + { + keys_per_bin = 1; + } + chd_ph->keys_per_bin = keys_per_bin; +} + +cmph_uint8 chd_ph_mapping(cmph_config_t *mph, chd_ph_bucket_t * buckets, chd_ph_item_t * items, cmph_uint32 *max_bucket_size) +{ + register cmph_uint32 i = 0, g = 0; + cmph_uint32 hl[3]; + chd_ph_config_data_t *chd_ph = (chd_ph_config_data_t *)mph->data; + char * key = NULL; + cmph_uint32 keylen = 0; + chd_ph_map_item_t * map_item; + chd_ph_map_item_t * map_items = malloc(chd_ph->m*sizeof(chd_ph_map_item_t)); + register cmph_uint32 mapping_iterations = 1000; + *max_bucket_size = 0; + while(1) + { + mapping_iterations--; + if (chd_ph->hl) hash_state_destroy(chd_ph->hl); + chd_ph->hl = hash_state_new(chd_ph->hashfunc, chd_ph->m); + + chd_ph_bucket_clean(buckets, chd_ph->nbuckets); + + mph->key_source->rewind(mph->key_source->data); + + for(i = 0; i < chd_ph->m; i++) + { + mph->key_source->read(mph->key_source->data, &key, &keylen); + hash_vector(chd_ph->hl, key, keylen, hl); + + map_item = (map_items + i); + + g = hl[0] % chd_ph->nbuckets; + map_item->f = hl[1] % chd_ph->n; + map_item->h = hl[2] % (chd_ph->n - 1) + 1; + map_item->bucket_num=g; + mph->key_source->dispose(mph->key_source->data, key, keylen); +// if(buckets[g].size == (chd_ph->keys_per_bucket << 2)) +// { +// DEBUGP("BUCKET = %u -- SIZE = %u -- MAXIMUM SIZE = %u\n", g, buckets[g].size, (chd_ph->keys_per_bucket << 2)); +// goto error; +// } + buckets[g].size++; + if(buckets[g].size > *max_bucket_size) + { + *max_bucket_size = buckets[g].size; + } + } + buckets[0].items_list = 0; + for(i = 1; i < chd_ph->nbuckets; i++) + { + buckets[i].items_list = buckets[i-1].items_list + buckets[i - 1].size; + buckets[i - 1].size = 0; + }; + buckets[i - 1].size = 0; + for(i = 0; i < chd_ph->m; i++) + { + map_item = (map_items + i); + if(!chd_ph_bucket_insert(buckets, map_items, items, chd_ph->nbuckets, i)) + break; + } + if(i == chd_ph->m) + { + free(map_items); + return 1; // SUCCESS + } + + if(mapping_iterations == 0) + { + goto error; + } + } +error: + free(map_items); + hash_state_destroy(chd_ph->hl); + chd_ph->hl = NULL; + return 0; // FAILURE +} + +chd_ph_sorted_list_t * chd_ph_ordering(chd_ph_bucket_t ** _buckets, chd_ph_item_t ** _items, + cmph_uint32 nbuckets, cmph_uint32 nitems, cmph_uint32 max_bucket_size) +{ + chd_ph_sorted_list_t * sorted_lists = (chd_ph_sorted_list_t *) calloc(max_bucket_size + 1, sizeof(chd_ph_sorted_list_t)); + + chd_ph_bucket_t * input_buckets = (*_buckets); + chd_ph_bucket_t * output_buckets; + chd_ph_item_t * input_items = (*_items); + chd_ph_item_t * output_items; + register cmph_uint32 i, j, bucket_size, position, position2; +// cmph_uint32 non_empty_buckets; + DEBUGP("MAX BUCKET SIZE = %u\n", max_bucket_size); + // Determine size of each list of buckets + for(i = 0; i < nbuckets; i++) + { + bucket_size = input_buckets[i].size; + if(bucket_size == 0) + continue; + sorted_lists[bucket_size].size++; + }; + sorted_lists[1].buckets_list = 0; + // Determine final position of list of buckets into the contiguous array that will store all the buckets + for(i = 2; i <= max_bucket_size; i++) + { + sorted_lists[i].buckets_list = sorted_lists[i-1].buckets_list + sorted_lists[i-1].size; + sorted_lists[i-1].size = 0; + }; + sorted_lists[i-1].size = 0; + // Store the buckets in a new array which is sorted by bucket sizes + output_buckets = calloc(nbuckets, sizeof(chd_ph_bucket_t)); // everything is initialized with zero +// non_empty_buckets = nbuckets; + + for(i = 0; i < nbuckets; i++) + { + bucket_size = input_buckets[i].size; + if(bucket_size == 0) + { +// non_empty_buckets--; + continue; + }; + position = sorted_lists[bucket_size].buckets_list + sorted_lists[bucket_size].size; + output_buckets[position].bucket_id = i; + output_buckets[position].items_list = input_buckets[i].items_list; + sorted_lists[bucket_size].size++; + }; +/* for(i = non_empty_buckets; i < nbuckets; i++) + output_buckets[i].size=0;*/ + // Return the buckets sorted in new order and free the old buckets sorted in old order + free(input_buckets); + (*_buckets) = output_buckets; + + + // Store the items according to the new order of buckets. + output_items = (chd_ph_item_t*)calloc(nitems, sizeof(chd_ph_item_t)); + position = 0; + i = 0; + for(bucket_size = 1; bucket_size <= max_bucket_size; bucket_size++) + { + for(i = sorted_lists[bucket_size].buckets_list; i < sorted_lists[bucket_size].size + sorted_lists[bucket_size].buckets_list; i++) + { + position2 = output_buckets[i].items_list; + output_buckets[i].items_list = position; + for(j = 0; j < bucket_size; j++) + { + output_items[position].f = input_items[position2].f; + output_items[position].h = input_items[position2].h; + position++; + position2++; + }; + }; + }; + //Return the items sorted in new order and free the old items sorted in old order + free(input_items); + (*_items) = output_items; + return sorted_lists; +}; + +static inline cmph_uint8 place_bucket_probe(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t *buckets, + chd_ph_item_t *items, cmph_uint32 probe0_num, cmph_uint32 probe1_num, + cmph_uint32 bucket_num, cmph_uint32 size) +{ + register cmph_uint32 i; + register chd_ph_item_t * item; + register cmph_uint32 position; + + item = items + buckets[bucket_num].items_list; + // try place bucket with probe_num + if(chd_ph->keys_per_bin > 1) + { + for(i = 0; i < size; i++) // placement + { + position = (cmph_uint32)((item->f + ((cmph_uint64)item->h)*probe0_num + probe1_num) % chd_ph->n); + if(chd_ph->occup_table[position] >= chd_ph->keys_per_bin) + { + break; + } + (chd_ph->occup_table[position])++; + item++; + }; + } else + { + for(i = 0; i < size; i++) // placement + { + position = (cmph_uint32)((item->f + ((cmph_uint64)item->h)*probe0_num + probe1_num) % chd_ph->n); + if(GETBIT32(((cmph_uint32 *)chd_ph->occup_table), position)) + { + break; + } + SETBIT32(((cmph_uint32*)chd_ph->occup_table), position); + item++; + }; + }; + if(i != size) // Undo the placement + { + item = items + buckets[bucket_num].items_list; + if(chd_ph->keys_per_bin > 1) + { + while(1) + { + if(i == 0) + { + break; + } + position = (cmph_uint32)((item->f + ((cmph_uint64 )item->h) * probe0_num + probe1_num) % chd_ph->n); + (chd_ph->occup_table[position])--; + item++; + i--; + }; + } else + { + while(1) + { + if(i == 0) + { + break; + } + position = (cmph_uint32)((item->f + ((cmph_uint64 )item->h) * probe0_num + probe1_num) % chd_ph->n); + UNSETBIT32(((cmph_uint32*)chd_ph->occup_table), position); + +// ([position/32]^=(1<<(position%32)); + item++; + i--; + }; + }; + return 0; + } + return 1; +}; + +static inline cmph_uint8 place_bucket(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t *buckets, chd_ph_item_t * items, cmph_uint32 max_probes, + cmph_uint32 * disp_table, cmph_uint32 bucket_num, cmph_uint32 size) + +{ + register cmph_uint32 probe0_num, probe1_num, probe_num; + probe0_num = 0; + probe1_num = 0; + probe_num = 0; + + while(1) + { + if(place_bucket_probe(chd_ph, buckets, items, probe0_num, probe1_num, bucket_num,size)) + { + disp_table[buckets[bucket_num].bucket_id] = probe0_num + probe1_num * chd_ph->n; + return 1; + } + probe0_num++; + if(probe0_num >= chd_ph->n) + { + probe0_num -= chd_ph->n; + probe1_num++; + }; + probe_num++; + if(probe_num >= max_probes || probe1_num >= chd_ph->n) + { + return 0; + }; + }; + return 0; +}; + +static inline cmph_uint8 place_buckets1(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t * buckets, chd_ph_item_t *items, + cmph_uint32 max_bucket_size, chd_ph_sorted_list_t *sorted_lists, cmph_uint32 max_probes, + cmph_uint32 * disp_table) +{ + register cmph_uint32 i = 0; + register cmph_uint32 curr_bucket = 0; + + for(i = max_bucket_size; i > 0; i--) + { + curr_bucket = sorted_lists[i].buckets_list; + while(curr_bucket < sorted_lists[i].size + sorted_lists[i].buckets_list) + { + if(!place_bucket(chd_ph, buckets, items, max_probes, disp_table, curr_bucket, i)) + { + return 0; + } + curr_bucket++; + }; + }; + return 1; +}; + +static inline cmph_uint8 place_buckets2(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t *buckets, chd_ph_item_t * items, + cmph_uint32 max_bucket_size, chd_ph_sorted_list_t *sorted_lists, cmph_uint32 max_probes, + cmph_uint32 * disp_table) +{ + register cmph_uint32 i,j, non_placed_bucket; + register cmph_uint32 curr_bucket; + register cmph_uint32 probe_num, probe0_num, probe1_num; + cmph_uint32 sorted_list_size; +#ifdef DEBUG + cmph_uint32 items_list; + cmph_uint32 bucket_id; +#endif + DEBUGP("USING HEURISTIC TO PLACE BUCKETS\n"); + for(i = max_bucket_size; i > 0; i--) + { + probe_num = 0; + probe0_num = 0; + probe1_num = 0; + sorted_list_size = sorted_lists[i].size; + while(sorted_lists[i].size != 0) + { + curr_bucket = sorted_lists[i].buckets_list; + for(j = 0, non_placed_bucket = 0; j < sorted_lists[i].size; j++) + { + // if bucket is successfully placed remove it from list + if(place_bucket_probe(chd_ph, buckets, items, probe0_num, probe1_num, curr_bucket, i)) + { + disp_table[buckets[curr_bucket].bucket_id] = probe0_num + probe1_num * chd_ph->n; +// DEBUGP("BUCKET %u PLACED --- DISPLACEMENT = %u\n", curr_bucket, disp_table[curr_bucket]); + } + else + { +// DEBUGP("BUCKET %u NOT PLACED\n", curr_bucket); +#ifdef DEBUG + items_list = buckets[non_placed_bucket + sorted_lists[i].buckets_list].items_list; + bucket_id = buckets[non_placed_bucket + sorted_lists[i].buckets_list].bucket_id; +#endif + buckets[non_placed_bucket + sorted_lists[i].buckets_list].items_list = buckets[curr_bucket].items_list; + buckets[non_placed_bucket + sorted_lists[i].buckets_list].bucket_id = buckets[curr_bucket].bucket_id; +#ifdef DEBUG + buckets[curr_bucket].items_list=items_list; + buckets[curr_bucket].bucket_id=bucket_id; +#endif + non_placed_bucket++; + } + curr_bucket++; + }; + sorted_lists[i].size = non_placed_bucket; + probe0_num++; + if(probe0_num >= chd_ph->n) + { + probe0_num -= chd_ph->n; + probe1_num++; + }; + probe_num++; + if(probe_num >= max_probes || probe1_num >= chd_ph->n) + { + sorted_lists[i].size = sorted_list_size; + return 0; + }; + }; + sorted_lists[i].size = sorted_list_size; + }; + return 1; +}; + +cmph_uint8 chd_ph_searching(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t *buckets, chd_ph_item_t *items , + cmph_uint32 max_bucket_size, chd_ph_sorted_list_t *sorted_lists, cmph_uint32 max_probes, + cmph_uint32 * disp_table) +{ + if(chd_ph->use_h) + { + return place_buckets2(chd_ph, buckets, items, max_bucket_size, sorted_lists, max_probes, disp_table); + } + else + { + return place_buckets1(chd_ph, buckets, items, max_bucket_size, sorted_lists, max_probes, disp_table); + } + +} + +static inline cmph_uint8 chd_ph_check_bin_hashing(chd_ph_config_data_t *chd_ph, chd_ph_bucket_t *buckets, chd_ph_item_t *items, + cmph_uint32 * disp_table, chd_ph_sorted_list_t * sorted_lists,cmph_uint32 max_bucket_size) +{ + register cmph_uint32 bucket_size, i, j; + register cmph_uint32 position, probe0_num, probe1_num; + register cmph_uint32 m = 0; + register chd_ph_item_t * item; + if(chd_ph->keys_per_bin > 1) + memset(chd_ph->occup_table, 0, chd_ph->n); + else + memset(chd_ph->occup_table, 0, ((chd_ph->n + 31)/32) * sizeof(cmph_uint32)); + + for(bucket_size = 1; bucket_size <= max_bucket_size; bucket_size++) + for(i = sorted_lists[bucket_size].buckets_list; i < sorted_lists[bucket_size].size + + sorted_lists[bucket_size].buckets_list; i++) + { + j = bucket_size; + item = items + buckets[i].items_list; + probe0_num = disp_table[buckets[i].bucket_id] % chd_ph->n; + probe1_num = disp_table[buckets[i].bucket_id] / chd_ph->n; + for(; j > 0; j--) + { + m++; + position = (cmph_uint32)((item->f + ((cmph_uint64 )item->h) * probe0_num + probe1_num) % chd_ph->n); + if(chd_ph->keys_per_bin > 1) + { + if(chd_ph->occup_table[position] >= chd_ph->keys_per_bin) + { + return 0; + } + (chd_ph->occup_table[position])++; + } + else + { + if(GETBIT32(((cmph_uint32*)chd_ph->occup_table), position)) + { + return 0; + } + SETBIT32(((cmph_uint32*)chd_ph->occup_table), position); + }; + item++; + }; + }; + DEBUGP("We were able to place m = %u keys\n", m); + return 1; +}; + + +cmph_t *chd_ph_new(cmph_config_t *mph, double c) +{ + cmph_t *mphf = NULL; + chd_ph_data_t *chd_phf = NULL; + chd_ph_config_data_t *chd_ph = (chd_ph_config_data_t *)mph->data; + + register double load_factor = c; + register cmph_uint8 searching_success = 0; + register cmph_uint32 max_probes = 1 << 20; // default value for max_probes + register cmph_uint32 iterations = 100; + chd_ph_bucket_t * buckets = NULL; + chd_ph_item_t * items = NULL; + register cmph_uint8 failure = 0; + cmph_uint32 max_bucket_size = 0; + chd_ph_sorted_list_t * sorted_lists = NULL; + cmph_uint32 * disp_table = NULL; + register double space_lower_bound = 0; + #ifdef CMPH_TIMING + double construction_time_begin = 0.0; + double construction_time = 0.0; + ELAPSED_TIME_IN_SECONDS(&construction_time_begin); + #endif + + + chd_ph->m = mph->key_source->nkeys; + DEBUGP("m = %u\n", chd_ph->m); + + chd_ph->nbuckets = (cmph_uint32)(chd_ph->m/chd_ph->keys_per_bucket) + 1; + DEBUGP("nbuckets = %u\n", chd_ph->nbuckets); + + if(load_factor < 0.5 ) + { + load_factor = 0.5; + } + + if(load_factor >= 0.99) + { + load_factor = 0.99; + } + + DEBUGP("load_factor = %.3f\n", load_factor); + + chd_ph->n = (cmph_uint32)(chd_ph->m/(chd_ph->keys_per_bin * load_factor)) + 1; + + //Round the number of bins to the prime immediately above + if(chd_ph->n % 2 == 0) chd_ph->n++; + for(;;) + { + if(check_primality(chd_ph->n) == 1) + break; + chd_ph->n += 2; // just odd numbers can be primes for n > 2 + + }; + + DEBUGP("n = %u \n", chd_ph->n); + if(chd_ph->keys_per_bin == 1) + { + space_lower_bound = chd_ph_space_lower_bound(chd_ph->m, chd_ph->n); + } + + if(mph->verbosity) + { + fprintf(stderr, "space lower bound is %.3f bits per key\n", space_lower_bound); + } + + // We allocate the working tables + buckets = chd_ph_bucket_new(chd_ph->nbuckets); + items = (chd_ph_item_t *) calloc(chd_ph->m, sizeof(chd_ph_item_t)); + + max_probes = (cmph_uint32)(((log(chd_ph->m)/log(2))/20) * max_probes); + + if(chd_ph->keys_per_bin == 1) + chd_ph->occup_table = (cmph_uint8 *) calloc(((chd_ph->n + 31)/32), sizeof(cmph_uint32)); + else + chd_ph->occup_table = (cmph_uint8 *) calloc(chd_ph->n, sizeof(cmph_uint8)); + + disp_table = (cmph_uint32 *) calloc(chd_ph->nbuckets, sizeof(cmph_uint32)); +// +// init_genrand(time(0)); + + while(1) + { + iterations --; + if (mph->verbosity) + { + fprintf(stderr, "Starting mapping step for mph creation of %u keys with %u bins\n", chd_ph->m, chd_ph->n); + } + + if(!chd_ph_mapping(mph, buckets, items, &max_bucket_size)) + { + if (mph->verbosity) + { + fprintf(stderr, "Failure in mapping step\n"); + } + failure = 1; + goto cleanup; + } + + if (mph->verbosity) + { + fprintf(stderr, "Starting ordering step\n"); + } + if(sorted_lists) + { + free(sorted_lists); + } + + sorted_lists = chd_ph_ordering(&buckets, &items, chd_ph->nbuckets, chd_ph->m, max_bucket_size); + + if (mph->verbosity) + { + fprintf(stderr, "Starting searching step\n"); + } + + searching_success = chd_ph_searching(chd_ph, buckets, items, max_bucket_size, sorted_lists, max_probes, disp_table); + if(searching_success) break; + + // reset occup_table + if(chd_ph->keys_per_bin > 1) + memset(chd_ph->occup_table, 0, chd_ph->n); + else + memset(chd_ph->occup_table, 0, ((chd_ph->n + 31)/32) * sizeof(cmph_uint32)); + if(iterations == 0) + { + // Cleanup memory + if (mph->verbosity) + { + fprintf(stderr, "Failure because the max trials was exceeded\n"); + } + failure = 1; + goto cleanup; + }; + } + + #ifdef DEBUG + { + if(!chd_ph_check_bin_hashing(chd_ph, buckets, items, disp_table,sorted_lists,max_bucket_size)) + { + + DEBUGP("Error for bin packing generation"); + failure = 1; + goto cleanup; + } + } + #endif + + if (mph->verbosity) + { + fprintf(stderr, "Starting compressing step\n"); + } + + if(chd_ph->cs) + { + free(chd_ph->cs); + } + chd_ph->cs = (compressed_seq_t *) calloc(1, sizeof(compressed_seq_t)); + compressed_seq_init(chd_ph->cs); + compressed_seq_generate(chd_ph->cs, disp_table, chd_ph->nbuckets); + + #ifdef CMPH_TIMING + ELAPSED_TIME_IN_SECONDS(&construction_time); + register double entropy = chd_ph_get_entropy(disp_table, chd_ph->nbuckets, max_probes); + DEBUGP("Entropy = %.4f\n", entropy/chd_ph->m); + #endif + +cleanup: + chd_ph_bucket_destroy(buckets); + free(items); + free(sorted_lists); + free(disp_table); + if(failure) + { + if(chd_ph->hl) + { + hash_state_destroy(chd_ph->hl); + } + chd_ph->hl = NULL; + return NULL; + } + + mphf = (cmph_t *)malloc(sizeof(cmph_t)); + mphf->algo = mph->algo; + chd_phf = (chd_ph_data_t *)malloc(sizeof(chd_ph_data_t)); + + chd_phf->cs = chd_ph->cs; + chd_ph->cs = NULL; //transfer memory ownership + chd_phf->hl = chd_ph->hl; + chd_ph->hl = NULL; //transfer memory ownership + chd_phf->n = chd_ph->n; + chd_phf->nbuckets = chd_ph->nbuckets; + + mphf->data = chd_phf; + mphf->size = chd_ph->n; + + DEBUGP("Successfully generated minimal perfect hash\n"); + if (mph->verbosity) + { + fprintf(stderr, "Successfully generated minimal perfect hash function\n"); + } + + #ifdef CMPH_TIMING + register cmph_uint32 space_usage = chd_ph_packed_size(mphf)*8; + construction_time = construction_time - construction_time_begin; + fprintf(stdout, "%u\t%.2f\t%u\t%.4f\t%.4f\t%.4f\t%.4f\n", chd_ph->m, load_factor, chd_ph->keys_per_bucket, construction_time, space_usage/(double)chd_ph->m, space_lower_bound, entropy/chd_ph->m); + #endif + + return mphf; +} + + + +void chd_ph_load(FILE *fd, cmph_t *mphf) +{ + char *buf = NULL; + cmph_uint32 buflen; + register size_t nbytes; + chd_ph_data_t *chd_ph = (chd_ph_data_t *)malloc(sizeof(chd_ph_data_t)); + + DEBUGP("Loading chd_ph mphf\n"); + mphf->data = chd_ph; + + nbytes = fread(&buflen, sizeof(cmph_uint32), (size_t)1, fd); + DEBUGP("Hash state has %u bytes\n", buflen); + buf = (char *)malloc((size_t)buflen); + nbytes = fread(buf, (size_t)buflen, (size_t)1, fd); + chd_ph->hl = hash_state_load(buf, buflen); + free(buf); + + nbytes = fread(&buflen, sizeof(cmph_uint32), (size_t)1, fd); + DEBUGP("Compressed sequence structure has %u bytes\n", buflen); + buf = (char *)malloc((size_t)buflen); + nbytes = fread(buf, (size_t)buflen, (size_t)1, fd); + chd_ph->cs = (compressed_seq_t *) calloc(1, sizeof(compressed_seq_t)); + compressed_seq_load(chd_ph->cs, buf, buflen); + free(buf); + + // loading n and nbuckets + DEBUGP("Reading n and nbuckets\n"); + nbytes = fread(&(chd_ph->n), sizeof(cmph_uint32), (size_t)1, fd); + nbytes = fread(&(chd_ph->nbuckets), sizeof(cmph_uint32), (size_t)1, fd); +} + +int chd_ph_dump(cmph_t *mphf, FILE *fd) +{ + char *buf = NULL; + cmph_uint32 buflen; + register size_t nbytes; + chd_ph_data_t *data = (chd_ph_data_t *)mphf->data; + + __cmph_dump(mphf, fd); + + hash_state_dump(data->hl, &buf, &buflen); + DEBUGP("Dumping hash state with %u bytes to disk\n", buflen); + nbytes = fwrite(&buflen, sizeof(cmph_uint32), (size_t)1, fd); + nbytes = fwrite(buf, (size_t)buflen, (size_t)1, fd); + free(buf); + + compressed_seq_dump(data->cs, &buf, &buflen); + DEBUGP("Dumping compressed sequence structure with %u bytes to disk\n", buflen); + nbytes = fwrite(&buflen, sizeof(cmph_uint32), (size_t)1, fd); + nbytes = fwrite(buf, (size_t)buflen, (size_t)1, fd); + free(buf); + + // dumping n and nbuckets + nbytes = fwrite(&(data->n), sizeof(cmph_uint32), (size_t)1, fd); + nbytes = fwrite(&(data->nbuckets), sizeof(cmph_uint32), (size_t)1, fd); + return 1; +} + +void chd_ph_destroy(cmph_t *mphf) +{ + chd_ph_data_t *data = (chd_ph_data_t *)mphf->data; + compressed_seq_destroy(data->cs); + free(data->cs); + hash_state_destroy(data->hl); + free(data); + free(mphf); + +} + +cmph_uint32 chd_ph_search(cmph_t *mphf, const char *key, cmph_uint32 keylen) +{ + register chd_ph_data_t * chd_ph = mphf->data; + cmph_uint32 hl[3]; + register cmph_uint32 disp,position; + register cmph_uint32 probe0_num,probe1_num; + register cmph_uint32 f,g,h; + hash_vector(chd_ph->hl, key, keylen, hl); + g = hl[0] % chd_ph->nbuckets; + f = hl[1] % chd_ph->n; + h = hl[2] % (chd_ph->n-1) + 1; + + disp = compressed_seq_query(chd_ph->cs, g); + probe0_num = disp % chd_ph->n; + probe1_num = disp/chd_ph->n; + position = (cmph_uint32)((f + ((cmph_uint64 )h)*probe0_num + probe1_num) % chd_ph->n); + return position; +} + +void chd_ph_pack(cmph_t *mphf, void *packed_mphf) +{ + chd_ph_data_t *data = (chd_ph_data_t *)mphf->data; + cmph_uint8 * ptr = packed_mphf; + + // packing hl type + CMPH_HASH hl_type = hash_get_type(data->hl); + *((cmph_uint32 *) ptr) = hl_type; + ptr += sizeof(cmph_uint32); + + // packing hl + hash_state_pack(data->hl, ptr); + ptr += hash_state_packed_size(hl_type); + + // packing n + *((cmph_uint32 *) ptr) = data->n; + ptr += sizeof(data->n); + + // packing nbuckets + *((cmph_uint32 *) ptr) = data->nbuckets; + ptr += sizeof(data->nbuckets); + + // packing cs + compressed_seq_pack(data->cs, ptr); + //ptr += compressed_seq_packed_size(data->cs); + +} + +cmph_uint32 chd_ph_packed_size(cmph_t *mphf) +{ + register chd_ph_data_t *data = (chd_ph_data_t *)mphf->data; + register CMPH_HASH hl_type = hash_get_type(data->hl); + register cmph_uint32 hash_state_pack_size = hash_state_packed_size(hl_type); + register cmph_uint32 cs_pack_size = compressed_seq_packed_size(data->cs); + + return (cmph_uint32)(sizeof(CMPH_ALGO) + hash_state_pack_size + cs_pack_size + 3*sizeof(cmph_uint32)); + +} + +cmph_uint32 chd_ph_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen) +{ + register CMPH_HASH hl_type = *(cmph_uint32 *)packed_mphf; + register cmph_uint8 *hl_ptr = (cmph_uint8 *)(packed_mphf) + 4; + + register cmph_uint32 * ptr = (cmph_uint32 *)(hl_ptr + hash_state_packed_size(hl_type)); + register cmph_uint32 n = *ptr++; + register cmph_uint32 nbuckets = *ptr++; + cmph_uint32 hl[3]; + + register cmph_uint32 disp,position; + register cmph_uint32 probe0_num,probe1_num; + register cmph_uint32 f,g,h; + + hash_vector_packed(hl_ptr, hl_type, key, keylen, hl); + + g = hl[0] % nbuckets; + f = hl[1] % n; + h = hl[2] % (n-1) + 1; + + disp = compressed_seq_query_packed(ptr, g); + probe0_num = disp % n; + probe1_num = disp/n; + position = (cmph_uint32)((f + ((cmph_uint64 )h)*probe0_num + probe1_num) % n); + return position; +} + + + diff --git a/cmph/chd_ph.h b/cmph/chd_ph.h new file mode 100644 index 000000000..d2bdb0286 --- /dev/null +++ b/cmph/chd_ph.h @@ -0,0 +1,59 @@ +#ifndef _CMPH_CHD_PH_H__ +#define _CMPH_CHD_PH_H__ + +#include "cmph.h" + +typedef struct __chd_ph_data_t chd_ph_data_t; +typedef struct __chd_ph_config_data_t chd_ph_config_data_t; + +/* Config API */ +chd_ph_config_data_t *chd_ph_config_new(); +void chd_ph_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs); + +/** \fn void chd_ph_config_set_keys_per_bin(cmph_config_t *mph, cmph_uint32 keys_per_bin); + * \brief Allows to set the number of keys per bin. + * \param mph pointer to the configuration structure + * \param keys_per_bin value for the number of keys per bin + */ +void chd_ph_config_set_keys_per_bin(cmph_config_t *mph, cmph_uint32 keys_per_bin); + +/** \fn void chd_ph_config_set_b(cmph_config_t *mph, cmph_uint32 keys_per_bucket); + * \brief Allows to set the number of keys per bucket. + * \param mph pointer to the configuration structure + * \param keys_per_bucket value for the number of keys per bucket + */ +void chd_ph_config_set_b(cmph_config_t *mph, cmph_uint32 keys_per_bucket); +void chd_ph_config_destroy(cmph_config_t *mph); + + +/* Chd algorithm API */ +cmph_t *chd_ph_new(cmph_config_t *mph, double c); +void chd_ph_load(FILE *fd, cmph_t *mphf); +int chd_ph_dump(cmph_t *mphf, FILE *fd); +void chd_ph_destroy(cmph_t *mphf); +cmph_uint32 chd_ph_search(cmph_t *mphf, const char *key, cmph_uint32 keylen); + +/** \fn void chd_ph_pack(cmph_t *mphf, void *packed_mphf); + * \brief Support the ability to pack a perfect hash function into a preallocated contiguous memory space pointed by packed_mphf. + * \param mphf pointer to the resulting mphf + * \param packed_mphf pointer to the contiguous memory area used to store the resulting mphf. The size of packed_mphf must be at least cmph_packed_size() + */ +void chd_ph_pack(cmph_t *mphf, void *packed_mphf); + +/** \fn cmph_uint32 chd_ph_packed_size(cmph_t *mphf); + * \brief Return the amount of space needed to pack mphf. + * \param mphf pointer to a mphf + * \return the size of the packed function or zero for failures + */ +cmph_uint32 chd_ph_packed_size(cmph_t *mphf); + +/** cmph_uint32 chd_ph_search(void *packed_mphf, const char *key, cmph_uint32 keylen); + * \brief Use the packed mphf to do a search. + * \param packed_mphf pointer to the packed mphf + * \param key key to be hashed + * \param keylen key legth in bytes + * \return The mphf value + */ +cmph_uint32 chd_ph_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen); + +#endif diff --git a/cmph/chd_structs.h b/cmph/chd_structs.h new file mode 100644 index 000000000..d62f68268 --- /dev/null +++ b/cmph/chd_structs.h @@ -0,0 +1,21 @@ +#ifndef __CMPH_CHD_STRUCTS_H__ +#define __CMPH_CHD_STRUCTS_H__ + +#include "chd_structs_ph.h" +#include "chd_ph.h" +#include "compressed_rank.h" + +struct __chd_data_t +{ + cmph_uint32 packed_cr_size; + cmph_uint8 * packed_cr; // packed compressed rank structure to control the number of zeros in a bit vector + + cmph_uint32 packed_chd_phf_size; + cmph_uint8 * packed_chd_phf; +}; + +struct __chd_config_data_t +{ + cmph_config_t *chd_ph; // chd_ph algorithm must be used here +}; +#endif diff --git a/cmph/chd_structs_ph.h b/cmph/chd_structs_ph.h new file mode 100644 index 000000000..d86921822 --- /dev/null +++ b/cmph/chd_structs_ph.h @@ -0,0 +1,29 @@ +#ifndef __CMPH_CHD_PH_STRUCTS_H__ +#define __CMPH_CHD_PH_STRUCTS_H__ + +#include "hash_state.h" +#include "compressed_seq.h" + +struct __chd_ph_data_t +{ + compressed_seq_t * cs; // compressed displacement values + cmph_uint32 nbuckets; // number of buckets + cmph_uint32 n; // number of bins + hash_state_t *hl; // linear hash function +}; + +struct __chd_ph_config_data_t +{ + CMPH_HASH hashfunc; // linear hash function to be used + compressed_seq_t * cs; // compressed displacement values + cmph_uint32 nbuckets; // number of buckets + cmph_uint32 n; // number of bins + hash_state_t *hl; // linear hash function + + cmph_uint32 m; // number of keys + cmph_uint8 use_h; // flag to indicate the of use of a heuristic (use_h = 1) + cmph_uint32 keys_per_bin;//maximum number of keys per bin + cmph_uint32 keys_per_bucket; // average number of keys per bucket + cmph_uint8 *occup_table; // table that indicates occupied positions +}; +#endif diff --git a/cmph/chm.c b/cmph/chm.c new file mode 100644 index 000000000..e03cca80e --- /dev/null +++ b/cmph/chm.c @@ -0,0 +1,381 @@ +#include "graph.h" +#include "chm.h" +#include "cmph_structs.h" +#include "chm_structs.h" +#include "hash.h" +#include "bitbool.h" + +#include +#include +#include +#include +#include + +//#define DEBUG +#include "debug.h" + +static int chm_gen_edges(cmph_config_t *mph); +static void chm_traverse(chm_config_data_t *chm, cmph_uint8 *visited, cmph_uint32 v); + +chm_config_data_t *chm_config_new() +{ + chm_config_data_t *chm = NULL; + chm = (chm_config_data_t *)malloc(sizeof(chm_config_data_t)); + assert(chm); + memset(chm, 0, sizeof(chm_config_data_t)); + chm->hashfuncs[0] = CMPH_HASH_JENKINS; + chm->hashfuncs[1] = CMPH_HASH_JENKINS; + chm->g = NULL; + chm->graph = NULL; + chm->hashes = NULL; + return chm; +} +void chm_config_destroy(cmph_config_t *mph) +{ + chm_config_data_t *data = (chm_config_data_t *)mph->data; + DEBUGP("Destroying algorithm dependent data\n"); + free(data); +} + +void chm_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs) +{ + chm_config_data_t *chm = (chm_config_data_t *)mph->data; + CMPH_HASH *hashptr = hashfuncs; + cmph_uint32 i = 0; + while(*hashptr != CMPH_HASH_COUNT) + { + if (i >= 2) break; //chm only uses two hash functions + chm->hashfuncs[i] = *hashptr; + ++i, ++hashptr; + } +} + +cmph_t *chm_new(cmph_config_t *mph, double c) +{ + cmph_t *mphf = NULL; + chm_data_t *chmf = NULL; + + cmph_uint32 i; + cmph_uint32 iterations = 20; + cmph_uint8 *visited = NULL; + chm_config_data_t *chm = (chm_config_data_t *)mph->data; + chm->m = mph->key_source->nkeys; + if (c == 0) c = 2.09; + chm->n = (cmph_uint32)ceil(c * mph->key_source->nkeys); + DEBUGP("m (edges): %u n (vertices): %u c: %f\n", chm->m, chm->n, c); + chm->graph = graph_new(chm->n, chm->m); + DEBUGP("Created graph\n"); + + chm->hashes = (hash_state_t **)malloc(sizeof(hash_state_t *)*3); + for(i = 0; i < 3; ++i) chm->hashes[i] = NULL; + //Mapping step + if (mph->verbosity) + { + fprintf(stderr, "Entering mapping step for mph creation of %u keys with graph sized %u\n", chm->m, chm->n); + } + while(1) + { + int ok; + chm->hashes[0] = hash_state_new(chm->hashfuncs[0], chm->n); + chm->hashes[1] = hash_state_new(chm->hashfuncs[1], chm->n); + ok = chm_gen_edges(mph); + if (!ok) + { + --iterations; + hash_state_destroy(chm->hashes[0]); + chm->hashes[0] = NULL; + hash_state_destroy(chm->hashes[1]); + chm->hashes[1] = NULL; + DEBUGP("%u iterations remaining\n", iterations); + if (mph->verbosity) + { + fprintf(stderr, "Acyclic graph creation failure - %u iterations remaining\n", iterations); + } + if (iterations == 0) break; + } + else break; + } + if (iterations == 0) + { + graph_destroy(chm->graph); + return NULL; + } + + //Assignment step + if (mph->verbosity) + { + fprintf(stderr, "Starting assignment step\n"); + } + DEBUGP("Assignment step\n"); + visited = (cmph_uint8 *)malloc((size_t)(chm->n/8 + 1)); + memset(visited, 0, (size_t)(chm->n/8 + 1)); + free(chm->g); + chm->g = (cmph_uint32 *)malloc(chm->n * sizeof(cmph_uint32)); + assert(chm->g); + for (i = 0; i < chm->n; ++i) + { + if (!GETBIT(visited,i)) + { + chm->g[i] = 0; + chm_traverse(chm, visited, i); + } + } + graph_destroy(chm->graph); + free(visited); + chm->graph = NULL; + + mphf = (cmph_t *)malloc(sizeof(cmph_t)); + mphf->algo = mph->algo; + chmf = (chm_data_t *)malloc(sizeof(chm_data_t)); + chmf->g = chm->g; + chm->g = NULL; //transfer memory ownership + chmf->hashes = chm->hashes; + chm->hashes = NULL; //transfer memory ownership + chmf->n = chm->n; + chmf->m = chm->m; + mphf->data = chmf; + mphf->size = chm->m; + DEBUGP("Successfully generated minimal perfect hash\n"); + if (mph->verbosity) + { + fprintf(stderr, "Successfully generated minimal perfect hash function\n"); + } + return mphf; +} + +static void chm_traverse(chm_config_data_t *chm, cmph_uint8 *visited, cmph_uint32 v) +{ + + graph_iterator_t it = graph_neighbors_it(chm->graph, v); + cmph_uint32 neighbor = 0; + SETBIT(visited,v); + + DEBUGP("Visiting vertex %u\n", v); + while((neighbor = graph_next_neighbor(chm->graph, &it)) != GRAPH_NO_NEIGHBOR) + { + DEBUGP("Visiting neighbor %u\n", neighbor); + if(GETBIT(visited,neighbor)) continue; + DEBUGP("Visiting neighbor %u\n", neighbor); + DEBUGP("Visiting edge %u->%u with id %u\n", v, neighbor, graph_edge_id(chm->graph, v, neighbor)); + chm->g[neighbor] = graph_edge_id(chm->graph, v, neighbor) - chm->g[v]; + DEBUGP("g is %u (%u - %u mod %u)\n", chm->g[neighbor], graph_edge_id(chm->graph, v, neighbor), chm->g[v], chm->m); + chm_traverse(chm, visited, neighbor); + } +} + +static int chm_gen_edges(cmph_config_t *mph) +{ + cmph_uint32 e; + chm_config_data_t *chm = (chm_config_data_t *)mph->data; + int cycles = 0; + + DEBUGP("Generating edges for %u vertices with hash functions %s and %s\n", chm->n, cmph_hash_names[chm->hashfuncs[0]], cmph_hash_names[chm->hashfuncs[1]]); + graph_clear_edges(chm->graph); + mph->key_source->rewind(mph->key_source->data); + for (e = 0; e < mph->key_source->nkeys; ++e) + { + cmph_uint32 h1, h2; + cmph_uint32 keylen; + char *key; + mph->key_source->read(mph->key_source->data, &key, &keylen); + h1 = hash(chm->hashes[0], key, keylen) % chm->n; + h2 = hash(chm->hashes[1], key, keylen) % chm->n; + if (h1 == h2) if (++h2 >= chm->n) h2 = 0; + if (h1 == h2) + { + if (mph->verbosity) fprintf(stderr, "Self loop for key %u\n", e); + mph->key_source->dispose(mph->key_source->data, key, keylen); + return 0; + } + DEBUGP("Adding edge: %u -> %u for key %s\n", h1, h2, key); + mph->key_source->dispose(mph->key_source->data, key, keylen); + graph_add_edge(chm->graph, h1, h2); + } + cycles = graph_is_cyclic(chm->graph); + if (mph->verbosity && cycles) fprintf(stderr, "Cyclic graph generated\n"); + DEBUGP("Looking for cycles: %u\n", cycles); + + return ! cycles; +} + +int chm_dump(cmph_t *mphf, FILE *fd) +{ + char *buf = NULL; + cmph_uint32 buflen; + cmph_uint32 two = 2; //number of hash functions + chm_data_t *data = (chm_data_t *)mphf->data; + register size_t nbytes; + + __cmph_dump(mphf, fd); + + nbytes = fwrite(&two, sizeof(cmph_uint32), (size_t)1, fd); + hash_state_dump(data->hashes[0], &buf, &buflen); + DEBUGP("Dumping hash state with %u bytes to disk\n", buflen); + nbytes = fwrite(&buflen, sizeof(cmph_uint32), (size_t)1, fd); + nbytes = fwrite(buf, (size_t)buflen, (size_t)1, fd); + free(buf); + + hash_state_dump(data->hashes[1], &buf, &buflen); + DEBUGP("Dumping hash state with %u bytes to disk\n", buflen); + nbytes = fwrite(&buflen, sizeof(cmph_uint32), (size_t)1, fd); + nbytes = fwrite(buf, (size_t)buflen, (size_t)1, fd); + free(buf); + + nbytes = fwrite(&(data->n), sizeof(cmph_uint32), (size_t)1, fd); + nbytes = fwrite(&(data->m), sizeof(cmph_uint32), (size_t)1, fd); + + nbytes = fwrite(data->g, sizeof(cmph_uint32)*data->n, (size_t)1, fd); +/* #ifdef DEBUG + fprintf(stderr, "G: "); + for (i = 0; i < data->n; ++i) fprintf(stderr, "%u ", data->g[i]); + fprintf(stderr, "\n"); + #endif*/ + return 1; +} + +void chm_load(FILE *f, cmph_t *mphf) +{ + cmph_uint32 nhashes; + char *buf = NULL; + cmph_uint32 buflen; + cmph_uint32 i; + chm_data_t *chm = (chm_data_t *)malloc(sizeof(chm_data_t)); + register size_t nbytes; + DEBUGP("Loading chm mphf\n"); + mphf->data = chm; + nbytes = fread(&nhashes, sizeof(cmph_uint32), (size_t)1, f); + chm->hashes = (hash_state_t **)malloc(sizeof(hash_state_t *)*(nhashes + 1)); + chm->hashes[nhashes] = NULL; + DEBUGP("Reading %u hashes\n", nhashes); + for (i = 0; i < nhashes; ++i) + { + hash_state_t *state = NULL; + nbytes = fread(&buflen, sizeof(cmph_uint32), (size_t)1, f); + DEBUGP("Hash state has %u bytes\n", buflen); + buf = (char *)malloc((size_t)buflen); + nbytes = fread(buf, (size_t)buflen, (size_t)1, f); + state = hash_state_load(buf, buflen); + chm->hashes[i] = state; + free(buf); + } + + DEBUGP("Reading m and n\n"); + nbytes = fread(&(chm->n), sizeof(cmph_uint32), (size_t)1, f); + nbytes = fread(&(chm->m), sizeof(cmph_uint32), (size_t)1, f); + + chm->g = (cmph_uint32 *)malloc(sizeof(cmph_uint32)*chm->n); + nbytes = fread(chm->g, chm->n*sizeof(cmph_uint32), (size_t)1, f); + #ifdef DEBUG + fprintf(stderr, "G: "); + for (i = 0; i < chm->n; ++i) fprintf(stderr, "%u ", chm->g[i]); + fprintf(stderr, "\n"); + #endif + return; +} + + +cmph_uint32 chm_search(cmph_t *mphf, const char *key, cmph_uint32 keylen) +{ + chm_data_t *chm = mphf->data; + cmph_uint32 h1 = hash(chm->hashes[0], key, keylen) % chm->n; + cmph_uint32 h2 = hash(chm->hashes[1], key, keylen) % chm->n; + DEBUGP("key: %s h1: %u h2: %u\n", key, h1, h2); + if (h1 == h2 && ++h2 >= chm->n) h2 = 0; + DEBUGP("key: %s g[h1]: %u g[h2]: %u edges: %u\n", key, chm->g[h1], chm->g[h2], chm->m); + return (chm->g[h1] + chm->g[h2]) % chm->m; +} +void chm_destroy(cmph_t *mphf) +{ + chm_data_t *data = (chm_data_t *)mphf->data; + free(data->g); + hash_state_destroy(data->hashes[0]); + hash_state_destroy(data->hashes[1]); + free(data->hashes); + free(data); + free(mphf); +} + +/** \fn void chm_pack(cmph_t *mphf, void *packed_mphf); + * \brief Support the ability to pack a perfect hash function into a preallocated contiguous memory space pointed by packed_mphf. + * \param mphf pointer to the resulting mphf + * \param packed_mphf pointer to the contiguous memory area used to store the resulting mphf. The size of packed_mphf must be at least cmph_packed_size() + */ +void chm_pack(cmph_t *mphf, void *packed_mphf) +{ + chm_data_t *data = (chm_data_t *)mphf->data; + cmph_uint8 * ptr = packed_mphf; + + // packing h1 type + CMPH_HASH h1_type = hash_get_type(data->hashes[0]); + *((cmph_uint32 *) ptr) = h1_type; + ptr += sizeof(cmph_uint32); + + // packing h1 + hash_state_pack(data->hashes[0], ptr); + ptr += hash_state_packed_size(h1_type); + + // packing h2 type + CMPH_HASH h2_type = hash_get_type(data->hashes[1]); + *((cmph_uint32 *) ptr) = h2_type; + ptr += sizeof(cmph_uint32); + + // packing h2 + hash_state_pack(data->hashes[1], ptr); + ptr += hash_state_packed_size(h2_type); + + // packing n + *((cmph_uint32 *) ptr) = data->n; + ptr += sizeof(data->n); + + // packing m + *((cmph_uint32 *) ptr) = data->m; + ptr += sizeof(data->m); + + // packing g + memcpy(ptr, data->g, sizeof(cmph_uint32)*data->n); +} + +/** \fn cmph_uint32 chm_packed_size(cmph_t *mphf); + * \brief Return the amount of space needed to pack mphf. + * \param mphf pointer to a mphf + * \return the size of the packed function or zero for failures + */ +cmph_uint32 chm_packed_size(cmph_t *mphf) +{ + chm_data_t *data = (chm_data_t *)mphf->data; + CMPH_HASH h1_type = hash_get_type(data->hashes[0]); + CMPH_HASH h2_type = hash_get_type(data->hashes[1]); + + return (cmph_uint32)(sizeof(CMPH_ALGO) + hash_state_packed_size(h1_type) + hash_state_packed_size(h2_type) + + 4*sizeof(cmph_uint32) + sizeof(cmph_uint32)*data->n); +} + +/** cmph_uint32 chm_search(void *packed_mphf, const char *key, cmph_uint32 keylen); + * \brief Use the packed mphf to do a search. + * \param packed_mphf pointer to the packed mphf + * \param key key to be hashed + * \param keylen key legth in bytes + * \return The mphf value + */ +cmph_uint32 chm_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen) +{ + register cmph_uint8 *h1_ptr = packed_mphf; + register CMPH_HASH h1_type = *((cmph_uint32 *)h1_ptr); + h1_ptr += 4; + + register cmph_uint8 *h2_ptr = h1_ptr + hash_state_packed_size(h1_type); + register CMPH_HASH h2_type = *((cmph_uint32 *)h2_ptr); + h2_ptr += 4; + + register cmph_uint32 *g_ptr = (cmph_uint32 *)(h2_ptr + hash_state_packed_size(h2_type)); + + register cmph_uint32 n = *g_ptr++; + register cmph_uint32 m = *g_ptr++; + + register cmph_uint32 h1 = hash_packed(h1_ptr, h1_type, key, keylen) % n; + register cmph_uint32 h2 = hash_packed(h2_ptr, h2_type, key, keylen) % n; + DEBUGP("key: %s h1: %u h2: %u\n", key, h1, h2); + if (h1 == h2 && ++h2 >= n) h2 = 0; + DEBUGP("key: %s g[h1]: %u g[h2]: %u edges: %u\n", key, g_ptr[h1], g_ptr[h2], m); + return (g_ptr[h1] + g_ptr[h2]) % m; +} diff --git a/cmph/chm.h b/cmph/chm.h new file mode 100644 index 000000000..341be29ea --- /dev/null +++ b/cmph/chm.h @@ -0,0 +1,42 @@ +#ifndef __CMPH_CHM_H__ +#define __CMPH_CHM_H__ + +#include "cmph.h" + +typedef struct __chm_data_t chm_data_t; +typedef struct __chm_config_data_t chm_config_data_t; + +chm_config_data_t *chm_config_new(); +void chm_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs); +void chm_config_destroy(cmph_config_t *mph); +cmph_t *chm_new(cmph_config_t *mph, double c); + +void chm_load(FILE *f, cmph_t *mphf); +int chm_dump(cmph_t *mphf, FILE *f); +void chm_destroy(cmph_t *mphf); +cmph_uint32 chm_search(cmph_t *mphf, const char *key, cmph_uint32 keylen); + +/** \fn void chm_pack(cmph_t *mphf, void *packed_mphf); + * \brief Support the ability to pack a perfect hash function into a preallocated contiguous memory space pointed by packed_mphf. + * \param mphf pointer to the resulting mphf + * \param packed_mphf pointer to the contiguous memory area used to store the resulting mphf. The size of packed_mphf must be at least cmph_packed_size() + */ +void chm_pack(cmph_t *mphf, void *packed_mphf); + +/** \fn cmph_uint32 chm_packed_size(cmph_t *mphf); + * \brief Return the amount of space needed to pack mphf. + * \param mphf pointer to a mphf + * \return the size of the packed function or zero for failures + */ +cmph_uint32 chm_packed_size(cmph_t *mphf); + +/** cmph_uint32 chm_search(void *packed_mphf, const char *key, cmph_uint32 keylen); + * \brief Use the packed mphf to do a search. + * \param packed_mphf pointer to the packed mphf + * \param key key to be hashed + * \param keylen key legth in bytes + * \return The mphf value + */ +cmph_uint32 chm_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen); + +#endif diff --git a/cmph/chm_structs.h b/cmph/chm_structs.h new file mode 100644 index 000000000..fcad1bc3d --- /dev/null +++ b/cmph/chm_structs.h @@ -0,0 +1,24 @@ +#ifndef __CMPH_CHM_STRUCTS_H__ +#define __CMPH_CHM_STRUCTS_H__ + +#include "hash_state.h" + +struct __chm_data_t +{ + cmph_uint32 m; //edges (words) count + cmph_uint32 n; //vertex count + cmph_uint32 *g; + hash_state_t **hashes; +}; + +struct __chm_config_data_t +{ + CMPH_HASH hashfuncs[2]; + cmph_uint32 m; //edges (words) count + cmph_uint32 n; //vertex count + graph_t *graph; + cmph_uint32 *g; + hash_state_t **hashes; +}; + +#endif diff --git a/cmph/cmph.c b/cmph/cmph.c new file mode 100644 index 000000000..cba735f4f --- /dev/null +++ b/cmph/cmph.c @@ -0,0 +1,845 @@ +#include "cmph.h" +#include "cmph_structs.h" +#include "chm.h" +#include "bmz.h" +#include "bmz8.h" +#include "brz.h" +#include "fch.h" +#include "bdz.h" +#include "bdz_ph.h" +#include "chd_ph.h" +#include "chd.h" + +#include +#include +#include +//#define DEBUG +#include "debug.h" + +const char *cmph_names[] = {"bmz", "bmz8", "chm", "brz", "fch", "bdz", "bdz_ph", "chd_ph", "chd", NULL }; + +typedef struct +{ + void *vector; + cmph_uint32 position; // access position when data is a vector +} cmph_vector_t; + + + +/** + * Support a vector of struct as the source of keys. + * + * E.g. The keys could be the fieldB's in a vector of struct rec where + * struct rec is defined as: + * struct rec { + * fieldA; + * fieldB; + * fieldC; + * } + */ +typedef struct +{ + void *vector; /* Pointer to the vector of struct */ + cmph_uint32 position; /* current position */ + cmph_uint32 struct_size; /* The size of the struct */ + cmph_uint32 key_offset; /* The byte offset of the key in the struct */ + cmph_uint32 key_len; /* The length of the key */ +} cmph_struct_vector_t; + + +static cmph_io_adapter_t *cmph_io_vector_new(void * vector, cmph_uint32 nkeys); +static void cmph_io_vector_destroy(cmph_io_adapter_t * key_source); + +static cmph_io_adapter_t *cmph_io_struct_vector_new(void * vector, cmph_uint32 struct_size, cmph_uint32 key_offset, cmph_uint32 key_len, cmph_uint32 nkeys); +static void cmph_io_struct_vector_destroy(cmph_io_adapter_t * key_source); + +static int key_nlfile_read(void *data, char **key, cmph_uint32 *keylen) +{ + FILE *fd = (FILE *)data; + *key = NULL; + *keylen = 0; + while(1) + { + char buf[BUFSIZ]; + char *c = fgets(buf, BUFSIZ, fd); + if (c == NULL) return -1; + if (feof(fd)) return -1; + *key = (char *)realloc(*key, *keylen + strlen(buf) + 1); + memcpy(*key + *keylen, buf, strlen(buf)); + *keylen += (cmph_uint32)strlen(buf); + if (buf[strlen(buf) - 1] != '\n') continue; + break; + } + if ((*keylen) && (*key)[*keylen - 1] == '\n') + { + (*key)[(*keylen) - 1] = 0; + --(*keylen); + } + return (int)(*keylen); +} + +static int key_byte_vector_read(void *data, char **key, cmph_uint32 *keylen) +{ + cmph_vector_t *cmph_vector = (cmph_vector_t *)data; + cmph_uint8 **keys_vd = (cmph_uint8 **)cmph_vector->vector; + size_t size; + memcpy(keylen, keys_vd[cmph_vector->position], sizeof(*keylen)); + size = *keylen; + *key = (char *)malloc(size); + memcpy(*key, keys_vd[cmph_vector->position] + sizeof(*keylen), size); + cmph_vector->position = cmph_vector->position + 1; + return (int)(*keylen); + +} + +static int key_struct_vector_read(void *data, char **key, cmph_uint32 *keylen) +{ + cmph_struct_vector_t *cmph_struct_vector = (cmph_struct_vector_t *)data; + char *keys_vd = (char *)cmph_struct_vector->vector; + size_t size; + *keylen = cmph_struct_vector->key_len; + size = *keylen; + *key = (char *)malloc(size); + memcpy(*key, (keys_vd + (cmph_struct_vector->position * cmph_struct_vector->struct_size) + cmph_struct_vector->key_offset), size); + cmph_struct_vector->position = cmph_struct_vector->position + 1; + return (int)(*keylen); +} + +static int key_vector_read(void *data, char **key, cmph_uint32 *keylen) +{ + cmph_vector_t *cmph_vector = (cmph_vector_t *)data; + char **keys_vd = (char **)cmph_vector->vector; + size_t size; + *keylen = (cmph_uint32)strlen(keys_vd[cmph_vector->position]); + size = *keylen; + *key = (char *)malloc(size + 1); + strcpy(*key, keys_vd[cmph_vector->position]); + cmph_vector->position = cmph_vector->position + 1; + return (int)(*keylen); + +} + + +static void key_nlfile_dispose(void *data, char *key, cmph_uint32 keylen) +{ + free(key); +} + +static void key_vector_dispose(void *data, char *key, cmph_uint32 keylen) +{ + free(key); +} + +static void key_nlfile_rewind(void *data) +{ + FILE *fd = (FILE *)data; + rewind(fd); +} + +static void key_struct_vector_rewind(void *data) +{ + cmph_struct_vector_t *cmph_struct_vector = (cmph_struct_vector_t *)data; + cmph_struct_vector->position = 0; +} + +static void key_vector_rewind(void *data) +{ + cmph_vector_t *cmph_vector = (cmph_vector_t *)data; + cmph_vector->position = 0; +} + +static cmph_uint32 count_nlfile_keys(FILE *fd) +{ + cmph_uint32 count = 0; + register char * ptr; + rewind(fd); + while(1) + { + char buf[BUFSIZ]; + ptr = fgets(buf, BUFSIZ, fd); + if (feof(fd)) break; + if (buf[strlen(buf) - 1] != '\n') continue; + ++count; + } + rewind(fd); + return count; +} + +cmph_io_adapter_t *cmph_io_nlfile_adapter(FILE * keys_fd) +{ + cmph_io_adapter_t * key_source = (cmph_io_adapter_t *)malloc(sizeof(cmph_io_adapter_t)); + assert(key_source); + key_source->data = (void *)keys_fd; + key_source->nkeys = count_nlfile_keys(keys_fd); + key_source->read = key_nlfile_read; + key_source->dispose = key_nlfile_dispose; + key_source->rewind = key_nlfile_rewind; + return key_source; +} + +void cmph_io_nlfile_adapter_destroy(cmph_io_adapter_t * key_source) +{ + free(key_source); +} + +cmph_io_adapter_t *cmph_io_nlnkfile_adapter(FILE * keys_fd, cmph_uint32 nkeys) +{ + cmph_io_adapter_t * key_source = (cmph_io_adapter_t *)malloc(sizeof(cmph_io_adapter_t)); + assert(key_source); + key_source->data = (void *)keys_fd; + key_source->nkeys = nkeys; + key_source->read = key_nlfile_read; + key_source->dispose = key_nlfile_dispose; + key_source->rewind = key_nlfile_rewind; + return key_source; +} + +void cmph_io_nlnkfile_adapter_destroy(cmph_io_adapter_t * key_source) +{ + free(key_source); +} + + +static cmph_io_adapter_t *cmph_io_struct_vector_new(void * vector, cmph_uint32 struct_size, cmph_uint32 key_offset, cmph_uint32 key_len, cmph_uint32 nkeys) +{ + cmph_io_adapter_t * key_source = (cmph_io_adapter_t *)malloc(sizeof(cmph_io_adapter_t)); + cmph_struct_vector_t * cmph_struct_vector = (cmph_struct_vector_t *)malloc(sizeof(cmph_struct_vector_t)); + assert(key_source); + assert(cmph_struct_vector); + cmph_struct_vector->vector = vector; + cmph_struct_vector->position = 0; + cmph_struct_vector->struct_size = struct_size; + cmph_struct_vector->key_offset = key_offset; + cmph_struct_vector->key_len = key_len; + key_source->data = (void *)cmph_struct_vector; + key_source->nkeys = nkeys; + return key_source; +} + +static void cmph_io_struct_vector_destroy(cmph_io_adapter_t * key_source) +{ + cmph_struct_vector_t *cmph_struct_vector = (cmph_struct_vector_t *)key_source->data; + cmph_struct_vector->vector = NULL; + free(cmph_struct_vector); + free(key_source); +} + +static cmph_io_adapter_t *cmph_io_vector_new(void * vector, cmph_uint32 nkeys) +{ + cmph_io_adapter_t * key_source = (cmph_io_adapter_t *)malloc(sizeof(cmph_io_adapter_t)); + cmph_vector_t * cmph_vector = (cmph_vector_t *)malloc(sizeof(cmph_vector_t)); + assert(key_source); + assert(cmph_vector); + cmph_vector->vector = vector; + cmph_vector->position = 0; + key_source->data = (void *)cmph_vector; + key_source->nkeys = nkeys; + return key_source; +} + +static void cmph_io_vector_destroy(cmph_io_adapter_t * key_source) +{ + cmph_vector_t *cmph_vector = (cmph_vector_t *)key_source->data; + cmph_vector->vector = NULL; + free(cmph_vector); + free(key_source); +} + +cmph_io_adapter_t *cmph_io_byte_vector_adapter(cmph_uint8 ** vector, cmph_uint32 nkeys) +{ + cmph_io_adapter_t * key_source = cmph_io_vector_new(vector, nkeys); + key_source->read = key_byte_vector_read; + key_source->dispose = key_vector_dispose; + key_source->rewind = key_vector_rewind; + return key_source; +} +void cmph_io_byte_vector_adapter_destroy(cmph_io_adapter_t * key_source) +{ + cmph_io_vector_destroy(key_source); +} + +cmph_io_adapter_t *cmph_io_struct_vector_adapter(void * vector, cmph_uint32 struct_size, cmph_uint32 key_offset, cmph_uint32 key_len, cmph_uint32 nkeys) +{ + cmph_io_adapter_t * key_source = cmph_io_struct_vector_new(vector, struct_size, key_offset, key_len, nkeys); + key_source->read = key_struct_vector_read; + key_source->dispose = key_vector_dispose; + key_source->rewind = key_struct_vector_rewind; + return key_source; +} + +void cmph_io_struct_vector_adapter_destroy(cmph_io_adapter_t * key_source) +{ + cmph_io_struct_vector_destroy(key_source); +} + +cmph_io_adapter_t *cmph_io_vector_adapter(char ** vector, cmph_uint32 nkeys) +{ + cmph_io_adapter_t * key_source = cmph_io_vector_new(vector, nkeys); + key_source->read = key_vector_read; + key_source->dispose = key_vector_dispose; + key_source->rewind = key_vector_rewind; + return key_source; +} + +void cmph_io_vector_adapter_destroy(cmph_io_adapter_t * key_source) +{ + cmph_io_vector_destroy(key_source); +} + +cmph_config_t *cmph_config_new(cmph_io_adapter_t *key_source) +{ + cmph_config_t *mph = NULL; + mph = __config_new(key_source); + assert(mph); + mph->algo = CMPH_CHM; // default value + mph->data = chm_config_new(); + return mph; +} + +void cmph_config_set_algo(cmph_config_t *mph, CMPH_ALGO algo) +{ + if (algo != mph->algo) + { + switch (mph->algo) + { + case CMPH_CHM: + chm_config_destroy(mph); + break; + case CMPH_BMZ: + bmz_config_destroy(mph); + break; + case CMPH_BMZ8: + bmz8_config_destroy(mph); + break; + case CMPH_BRZ: + brz_config_destroy(mph); + break; + case CMPH_FCH: + fch_config_destroy(mph); + break; + case CMPH_BDZ: + bdz_config_destroy(mph); + break; + case CMPH_BDZ_PH: + bdz_ph_config_destroy(mph); + break; + case CMPH_CHD_PH: + chd_ph_config_destroy(mph); + break; + case CMPH_CHD: + chd_config_destroy(mph); + break; + default: + assert(0); + } + switch(algo) + { + case CMPH_CHM: + mph->data = chm_config_new(); + break; + case CMPH_BMZ: + mph->data = bmz_config_new(); + break; + case CMPH_BMZ8: + mph->data = bmz8_config_new(); + break; + case CMPH_BRZ: + mph->data = brz_config_new(); + break; + case CMPH_FCH: + mph->data = fch_config_new(); + break; + case CMPH_BDZ: + mph->data = bdz_config_new(); + break; + case CMPH_BDZ_PH: + mph->data = bdz_ph_config_new(); + break; + case CMPH_CHD_PH: + mph->data = chd_ph_config_new(); + break; + case CMPH_CHD: + mph->data = chd_config_new(mph); + break; + default: + assert(0); + } + } + mph->algo = algo; +} + +void cmph_config_set_tmp_dir(cmph_config_t *mph, cmph_uint8 *tmp_dir) +{ + if (mph->algo == CMPH_BRZ) + { + brz_config_set_tmp_dir(mph, tmp_dir); + } +} + + +void cmph_config_set_mphf_fd(cmph_config_t *mph, FILE *mphf_fd) +{ + if (mph->algo == CMPH_BRZ) + { + brz_config_set_mphf_fd(mph, mphf_fd); + } +} + +void cmph_config_set_b(cmph_config_t *mph, cmph_uint32 b) +{ + if (mph->algo == CMPH_BRZ) + { + brz_config_set_b(mph, b); + } + else if (mph->algo == CMPH_BDZ) + { + bdz_config_set_b(mph, b); + } + else if (mph->algo == CMPH_CHD_PH) + { + chd_ph_config_set_b(mph, b); + } + else if (mph->algo == CMPH_CHD) + { + chd_config_set_b(mph, b); + } +} + +void cmph_config_set_keys_per_bin(cmph_config_t *mph, cmph_uint32 keys_per_bin) +{ + if (mph->algo == CMPH_CHD_PH) + { + chd_ph_config_set_keys_per_bin(mph, keys_per_bin); + } + else if (mph->algo == CMPH_CHD) + { + chd_config_set_keys_per_bin(mph, keys_per_bin); + } +} + +void cmph_config_set_memory_availability(cmph_config_t *mph, cmph_uint32 memory_availability) +{ + if (mph->algo == CMPH_BRZ) + { + brz_config_set_memory_availability(mph, memory_availability); + } +} + +void cmph_config_destroy(cmph_config_t *mph) +{ + if(mph) + { + DEBUGP("Destroying mph with algo %s\n", cmph_names[mph->algo]); + switch (mph->algo) + { + case CMPH_CHM: + chm_config_destroy(mph); + break; + case CMPH_BMZ: /* included -- Fabiano */ + bmz_config_destroy(mph); + break; + case CMPH_BMZ8: /* included -- Fabiano */ + bmz8_config_destroy(mph); + break; + case CMPH_BRZ: /* included -- Fabiano */ + brz_config_destroy(mph); + break; + case CMPH_FCH: /* included -- Fabiano */ + fch_config_destroy(mph); + break; + case CMPH_BDZ: /* included -- Fabiano */ + bdz_config_destroy(mph); + break; + case CMPH_BDZ_PH: /* included -- Fabiano */ + bdz_ph_config_destroy(mph); + break; + case CMPH_CHD_PH: /* included -- Fabiano */ + chd_ph_config_destroy(mph); + break; + case CMPH_CHD: /* included -- Fabiano */ + chd_config_destroy(mph); + break; + default: + assert(0); + } + __config_destroy(mph); + } +} + +void cmph_config_set_verbosity(cmph_config_t *mph, cmph_uint32 verbosity) +{ + mph->verbosity = verbosity; +} + +void cmph_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs) +{ + switch (mph->algo) + { + case CMPH_CHM: + chm_config_set_hashfuncs(mph, hashfuncs); + break; + case CMPH_BMZ: /* included -- Fabiano */ + bmz_config_set_hashfuncs(mph, hashfuncs); + break; + case CMPH_BMZ8: /* included -- Fabiano */ + bmz8_config_set_hashfuncs(mph, hashfuncs); + break; + case CMPH_BRZ: /* included -- Fabiano */ + brz_config_set_hashfuncs(mph, hashfuncs); + break; + case CMPH_FCH: /* included -- Fabiano */ + fch_config_set_hashfuncs(mph, hashfuncs); + break; + case CMPH_BDZ: /* included -- Fabiano */ + bdz_config_set_hashfuncs(mph, hashfuncs); + break; + case CMPH_BDZ_PH: /* included -- Fabiano */ + bdz_ph_config_set_hashfuncs(mph, hashfuncs); + break; + case CMPH_CHD_PH: /* included -- Fabiano */ + chd_ph_config_set_hashfuncs(mph, hashfuncs); + break; + case CMPH_CHD: /* included -- Fabiano */ + chd_config_set_hashfuncs(mph, hashfuncs); + break; + default: + break; + } + return; +} +void cmph_config_set_graphsize(cmph_config_t *mph, double c) +{ + mph->c = c; + return; +} + +cmph_t *cmph_new(cmph_config_t *mph) +{ + cmph_t *mphf = NULL; + double c = mph->c; + + DEBUGP("Creating mph with algorithm %s\n", cmph_names[mph->algo]); + switch (mph->algo) + { + case CMPH_CHM: + DEBUGP("Creating chm hash\n"); + mphf = chm_new(mph, c); + break; + case CMPH_BMZ: /* included -- Fabiano */ + DEBUGP("Creating bmz hash\n"); + mphf = bmz_new(mph, c); + break; + case CMPH_BMZ8: /* included -- Fabiano */ + DEBUGP("Creating bmz8 hash\n"); + mphf = bmz8_new(mph, c); + break; + case CMPH_BRZ: /* included -- Fabiano */ + DEBUGP("Creating brz hash\n"); + if (c >= 2.0) brz_config_set_algo(mph, CMPH_FCH); + else brz_config_set_algo(mph, CMPH_BMZ8); + mphf = brz_new(mph, c); + break; + case CMPH_FCH: /* included -- Fabiano */ + DEBUGP("Creating fch hash\n"); + mphf = fch_new(mph, c); + break; + case CMPH_BDZ: /* included -- Fabiano */ + DEBUGP("Creating bdz hash\n"); + mphf = bdz_new(mph, c); + break; + case CMPH_BDZ_PH: /* included -- Fabiano */ + DEBUGP("Creating bdz_ph hash\n"); + mphf = bdz_ph_new(mph, c); + break; + case CMPH_CHD_PH: /* included -- Fabiano */ + DEBUGP("Creating chd_ph hash\n"); + mphf = chd_ph_new(mph, c); + break; + case CMPH_CHD: /* included -- Fabiano */ + DEBUGP("Creating chd hash\n"); + mphf = chd_new(mph, c); + break; + default: + assert(0); + } + return mphf; +} + +int cmph_dump(cmph_t *mphf, FILE *f) +{ + switch (mphf->algo) + { + case CMPH_CHM: + return chm_dump(mphf, f); + case CMPH_BMZ: /* included -- Fabiano */ + return bmz_dump(mphf, f); + case CMPH_BMZ8: /* included -- Fabiano */ + return bmz8_dump(mphf, f); + case CMPH_BRZ: /* included -- Fabiano */ + return brz_dump(mphf, f); + case CMPH_FCH: /* included -- Fabiano */ + return fch_dump(mphf, f); + case CMPH_BDZ: /* included -- Fabiano */ + return bdz_dump(mphf, f); + case CMPH_BDZ_PH: /* included -- Fabiano */ + return bdz_ph_dump(mphf, f); + case CMPH_CHD_PH: /* included -- Fabiano */ + return chd_ph_dump(mphf, f); + case CMPH_CHD: /* included -- Fabiano */ + return chd_dump(mphf, f); + default: + assert(0); + } + assert(0); + return 0; +} +cmph_t *cmph_load(FILE *f) +{ + cmph_t *mphf = NULL; + DEBUGP("Loading mphf generic parts\n"); + mphf = __cmph_load(f); + if (mphf == NULL) return NULL; + DEBUGP("Loading mphf algorithm dependent parts\n"); + + switch (mphf->algo) + { + case CMPH_CHM: + chm_load(f, mphf); + break; + case CMPH_BMZ: /* included -- Fabiano */ + DEBUGP("Loading bmz algorithm dependent parts\n"); + bmz_load(f, mphf); + break; + case CMPH_BMZ8: /* included -- Fabiano */ + DEBUGP("Loading bmz8 algorithm dependent parts\n"); + bmz8_load(f, mphf); + break; + case CMPH_BRZ: /* included -- Fabiano */ + DEBUGP("Loading brz algorithm dependent parts\n"); + brz_load(f, mphf); + break; + case CMPH_FCH: /* included -- Fabiano */ + DEBUGP("Loading fch algorithm dependent parts\n"); + fch_load(f, mphf); + break; + case CMPH_BDZ: /* included -- Fabiano */ + DEBUGP("Loading bdz algorithm dependent parts\n"); + bdz_load(f, mphf); + break; + case CMPH_BDZ_PH: /* included -- Fabiano */ + DEBUGP("Loading bdz_ph algorithm dependent parts\n"); + bdz_ph_load(f, mphf); + break; + case CMPH_CHD_PH: /* included -- Fabiano */ + DEBUGP("Loading chd_ph algorithm dependent parts\n"); + chd_ph_load(f, mphf); + break; + case CMPH_CHD: /* included -- Fabiano */ + DEBUGP("Loading chd algorithm dependent parts\n"); + chd_load(f, mphf); + break; + default: + assert(0); + } + DEBUGP("Loaded mphf\n"); + return mphf; +} + + +cmph_uint32 cmph_search(cmph_t *mphf, const char *key, cmph_uint32 keylen) +{ + DEBUGP("mphf algorithm: %u \n", mphf->algo); + switch(mphf->algo) + { + case CMPH_CHM: + return chm_search(mphf, key, keylen); + case CMPH_BMZ: /* included -- Fabiano */ + DEBUGP("bmz algorithm search\n"); + return bmz_search(mphf, key, keylen); + case CMPH_BMZ8: /* included -- Fabiano */ + DEBUGP("bmz8 algorithm search\n"); + return bmz8_search(mphf, key, keylen); + case CMPH_BRZ: /* included -- Fabiano */ + DEBUGP("brz algorithm search\n"); + return brz_search(mphf, key, keylen); + case CMPH_FCH: /* included -- Fabiano */ + DEBUGP("fch algorithm search\n"); + return fch_search(mphf, key, keylen); + case CMPH_BDZ: /* included -- Fabiano */ + DEBUGP("bdz algorithm search\n"); + return bdz_search(mphf, key, keylen); + case CMPH_BDZ_PH: /* included -- Fabiano */ + DEBUGP("bdz_ph algorithm search\n"); + return bdz_ph_search(mphf, key, keylen); + case CMPH_CHD_PH: /* included -- Fabiano */ + DEBUGP("chd_ph algorithm search\n"); + return chd_ph_search(mphf, key, keylen); + case CMPH_CHD: /* included -- Fabiano */ + DEBUGP("chd algorithm search\n"); + return chd_search(mphf, key, keylen); + default: + assert(0); + } + assert(0); + return 0; +} + +cmph_uint32 cmph_size(cmph_t *mphf) +{ + return mphf->size; +} + +void cmph_destroy(cmph_t *mphf) +{ + switch(mphf->algo) + { + case CMPH_CHM: + chm_destroy(mphf); + return; + case CMPH_BMZ: /* included -- Fabiano */ + bmz_destroy(mphf); + return; + case CMPH_BMZ8: /* included -- Fabiano */ + bmz8_destroy(mphf); + return; + case CMPH_BRZ: /* included -- Fabiano */ + brz_destroy(mphf); + return; + case CMPH_FCH: /* included -- Fabiano */ + fch_destroy(mphf); + return; + case CMPH_BDZ: /* included -- Fabiano */ + bdz_destroy(mphf); + return; + case CMPH_BDZ_PH: /* included -- Fabiano */ + bdz_ph_destroy(mphf); + return; + case CMPH_CHD_PH: /* included -- Fabiano */ + chd_ph_destroy(mphf); + return; + case CMPH_CHD: /* included -- Fabiano */ + chd_destroy(mphf); + return; + default: + assert(0); + } + assert(0); + return; +} + + +/** \fn void cmph_pack(cmph_t *mphf, void *packed_mphf); + * \brief Support the ability to pack a perfect hash function into a preallocated contiguous memory space pointed by packed_mphf. + * \param mphf pointer to the resulting mphf + * \param packed_mphf pointer to the contiguous memory area used to store the resulting mphf. The size of packed_mphf must be at least cmph_packed_size() + */ +void cmph_pack(cmph_t *mphf, void *packed_mphf) +{ + // packing algorithm type to be used in cmph.c + cmph_uint32 * ptr = (cmph_uint32 *) packed_mphf; + *ptr++ = mphf->algo; + DEBUGP("mphf->algo = %u\n", mphf->algo); + switch(mphf->algo) + { + case CMPH_CHM: + chm_pack(mphf, ptr); + break; + case CMPH_BMZ: /* included -- Fabiano */ + bmz_pack(mphf, ptr); + break; + case CMPH_BMZ8: /* included -- Fabiano */ + bmz8_pack(mphf, ptr); + break; + case CMPH_BRZ: /* included -- Fabiano */ + brz_pack(mphf, ptr); + break; + case CMPH_FCH: /* included -- Fabiano */ + fch_pack(mphf, ptr); + break; + case CMPH_BDZ: /* included -- Fabiano */ + bdz_pack(mphf, ptr); + break; + case CMPH_BDZ_PH: /* included -- Fabiano */ + bdz_ph_pack(mphf, ptr); + break; + case CMPH_CHD_PH: /* included -- Fabiano */ + chd_ph_pack(mphf, ptr); + break; + case CMPH_CHD: /* included -- Fabiano */ + chd_pack(mphf, ptr); + break; + default: + assert(0); + } + return; +} + +/** \fn cmph_uint32 cmph_packed_size(cmph_t *mphf); + * \brief Return the amount of space needed to pack mphf. + * \param mphf pointer to a mphf + * \return the size of the packed function or zero for failures + */ +cmph_uint32 cmph_packed_size(cmph_t *mphf) +{ + switch(mphf->algo) + { + case CMPH_CHM: + return chm_packed_size(mphf); + case CMPH_BMZ: /* included -- Fabiano */ + return bmz_packed_size(mphf); + case CMPH_BMZ8: /* included -- Fabiano */ + return bmz8_packed_size(mphf); + case CMPH_BRZ: /* included -- Fabiano */ + return brz_packed_size(mphf); + case CMPH_FCH: /* included -- Fabiano */ + return fch_packed_size(mphf); + case CMPH_BDZ: /* included -- Fabiano */ + return bdz_packed_size(mphf); + case CMPH_BDZ_PH: /* included -- Fabiano */ + return bdz_ph_packed_size(mphf); + case CMPH_CHD_PH: /* included -- Fabiano */ + return chd_ph_packed_size(mphf); + case CMPH_CHD: /* included -- Fabiano */ + return chd_packed_size(mphf); + default: + assert(0); + } + return 0; // FAILURE +} + +/** cmph_uint32 cmph_search(void *packed_mphf, const char *key, cmph_uint32 keylen); + * \brief Use the packed mphf to do a search. + * \param packed_mphf pointer to the packed mphf + * \param key key to be hashed + * \param keylen key legth in bytes + * \return The mphf value + */ +cmph_uint32 cmph_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen) +{ + cmph_uint32 *ptr = (cmph_uint32 *)packed_mphf; +// fprintf(stderr, "algo:%u\n", *ptr); + switch(*ptr) + { + case CMPH_CHM: + return chm_search_packed(++ptr, key, keylen); + case CMPH_BMZ: /* included -- Fabiano */ + return bmz_search_packed(++ptr, key, keylen); + case CMPH_BMZ8: /* included -- Fabiano */ + return bmz8_search_packed(++ptr, key, keylen); + case CMPH_BRZ: /* included -- Fabiano */ + return brz_search_packed(++ptr, key, keylen); + case CMPH_FCH: /* included -- Fabiano */ + return fch_search_packed(++ptr, key, keylen); + case CMPH_BDZ: /* included -- Fabiano */ + return bdz_search_packed(++ptr, key, keylen); + case CMPH_BDZ_PH: /* included -- Fabiano */ + return bdz_ph_search_packed(++ptr, key, keylen); + case CMPH_CHD_PH: /* included -- Fabiano */ + return chd_ph_search_packed(++ptr, key, keylen); + case CMPH_CHD: /* included -- Fabiano */ + return chd_search_packed(++ptr, key, keylen); + default: + assert(0); + } + return 0; // FAILURE +} diff --git a/cmph/cmph.h b/cmph/cmph.h new file mode 100644 index 000000000..1bc009e19 --- /dev/null +++ b/cmph/cmph.h @@ -0,0 +1,112 @@ +#ifndef __CMPH_H__ +#define __CMPH_H__ + +#include +#include + +#ifdef __cplusplus +extern "C" +{ +#endif + +#include "cmph_types.h" + +typedef struct __config_t cmph_config_t; +typedef struct __cmph_t cmph_t; + +typedef struct +{ + void *data; + cmph_uint32 nkeys; + int (*read)(void *, char **, cmph_uint32 *); + void (*dispose)(void *, char *, cmph_uint32); + void (*rewind)(void *); +} cmph_io_adapter_t; + +/** Adapter pattern API **/ +/* please call free() in the created adapters */ +cmph_io_adapter_t *cmph_io_nlfile_adapter(FILE * keys_fd); +void cmph_io_nlfile_adapter_destroy(cmph_io_adapter_t * key_source); + +cmph_io_adapter_t *cmph_io_nlnkfile_adapter(FILE * keys_fd, cmph_uint32 nkeys); +void cmph_io_nlnkfile_adapter_destroy(cmph_io_adapter_t * key_source); + +cmph_io_adapter_t *cmph_io_vector_adapter(char ** vector, cmph_uint32 nkeys); +void cmph_io_vector_adapter_destroy(cmph_io_adapter_t * key_source); + +cmph_io_adapter_t *cmph_io_byte_vector_adapter(cmph_uint8 ** vector, cmph_uint32 nkeys); +void cmph_io_byte_vector_adapter_destroy(cmph_io_adapter_t * key_source); + +cmph_io_adapter_t *cmph_io_struct_vector_adapter(void * vector, + cmph_uint32 struct_size, + cmph_uint32 key_offset, + cmph_uint32 key_len, + cmph_uint32 nkeys); + +void cmph_io_struct_vector_adapter_destroy(cmph_io_adapter_t * key_source); + +/** Hash configuration API **/ +cmph_config_t *cmph_config_new(cmph_io_adapter_t *key_source); +void cmph_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs); +void cmph_config_set_verbosity(cmph_config_t *mph, cmph_uint32 verbosity); +void cmph_config_set_graphsize(cmph_config_t *mph, double c); +void cmph_config_set_algo(cmph_config_t *mph, CMPH_ALGO algo); +void cmph_config_set_tmp_dir(cmph_config_t *mph, cmph_uint8 *tmp_dir); +void cmph_config_set_mphf_fd(cmph_config_t *mph, FILE *mphf_fd); +void cmph_config_set_b(cmph_config_t *mph, cmph_uint32 b); +void cmph_config_set_keys_per_bin(cmph_config_t *mph, cmph_uint32 keys_per_bin); +void cmph_config_set_memory_availability(cmph_config_t *mph, cmph_uint32 memory_availability); +void cmph_config_destroy(cmph_config_t *mph); + +/** Hash API **/ +cmph_t *cmph_new(cmph_config_t *mph); + +/** cmph_uint32 cmph_search(cmph_t *mphf, const char *key, cmph_uint32 keylen); + * \brief Computes the mphf value. + * \param mphf pointer to the resulting function + * \param key is the key to be hashed + * \param keylen is the key legth in bytes + * \return The mphf value + */ +cmph_uint32 cmph_search(cmph_t *mphf, const char *key, cmph_uint32 keylen); + +cmph_uint32 cmph_size(cmph_t *mphf); +void cmph_destroy(cmph_t *mphf); + +/** Hash serialization/deserialization */ +int cmph_dump(cmph_t *mphf, FILE *f); +cmph_t *cmph_load(FILE *f); + +/** \fn void cmph_pack(cmph_t *mphf, void *packed_mphf); + * \brief Support the ability to pack a perfect hash function into a preallocated contiguous memory space pointed by packed_mphf. + * \param mphf pointer to the resulting mphf + * \param packed_mphf pointer to the contiguous memory area used to store the + * \param resulting mphf. The size of packed_mphf must be at least cmph_packed_size() + */ +void cmph_pack(cmph_t *mphf, void *packed_mphf); + +/** \fn cmph_uint32 cmph_packed_size(cmph_t *mphf); + * \brief Return the amount of space needed to pack mphf. + * \param mphf pointer to a mphf + * \return the size of the packed function or zero for failures + */ +cmph_uint32 cmph_packed_size(cmph_t *mphf); + +/** cmph_uint32 cmph_search(void *packed_mphf, const char *key, cmph_uint32 keylen); + * \brief Use the packed mphf to do a search. + * \param packed_mphf pointer to the packed mphf + * \param key key to be hashed + * \param keylen key legth in bytes + * \return The mphf value + */ +cmph_uint32 cmph_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen); + +// TIMING functions. To use the macro CMPH_TIMING must be defined +#include "cmph_time.h" + + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/cmph/cmph_structs.c b/cmph/cmph_structs.c new file mode 100644 index 000000000..b5634248f --- /dev/null +++ b/cmph/cmph_structs.c @@ -0,0 +1,69 @@ +#include "cmph_structs.h" + +#include + +//#define DEBUG +#include "debug.h" + +cmph_config_t *__config_new(cmph_io_adapter_t *key_source) +{ + cmph_config_t *mph = (cmph_config_t *)malloc(sizeof(cmph_config_t)); + memset(mph, 0, sizeof(cmph_config_t)); + if (mph == NULL) return NULL; + mph->key_source = key_source; + mph->verbosity = 0; + mph->data = NULL; + mph->c = 0; + return mph; +} + +void __config_destroy(cmph_config_t *mph) +{ + free(mph); +} + +void __cmph_dump(cmph_t *mphf, FILE *fd) +{ + register size_t nbytes; + nbytes = fwrite(cmph_names[mphf->algo], (size_t)(strlen(cmph_names[mphf->algo]) + 1), (size_t)1, fd); + nbytes = fwrite(&(mphf->size), sizeof(mphf->size), (size_t)1, fd); +} +cmph_t *__cmph_load(FILE *f) +{ + cmph_t *mphf = NULL; + cmph_uint32 i; + char algo_name[BUFSIZ]; + char *ptr = algo_name; + CMPH_ALGO algo = CMPH_COUNT; + register size_t nbytes; + + DEBUGP("Loading mphf\n"); + while(1) + { + size_t c = fread(ptr, (size_t)1, (size_t)1, f); + if (c != 1) return NULL; + if (*ptr == 0) break; + ++ptr; + } + for(i = 0; i < CMPH_COUNT; ++i) + { + if (strcmp(algo_name, cmph_names[i]) == 0) + { + algo = i; + } + } + if (algo == CMPH_COUNT) + { + DEBUGP("Algorithm %s not found\n", algo_name); + return NULL; + } + mphf = (cmph_t *)malloc(sizeof(cmph_t)); + mphf->algo = algo; + nbytes = fread(&(mphf->size), sizeof(mphf->size), (size_t)1, f); + mphf->data = NULL; + DEBUGP("Algorithm is %s and mphf is sized %u\n", cmph_names[algo], mphf->size); + + return mphf; +} + + diff --git a/cmph/cmph_structs.h b/cmph/cmph_structs.h new file mode 100644 index 000000000..88fafb6ce --- /dev/null +++ b/cmph/cmph_structs.h @@ -0,0 +1,33 @@ +#ifndef __CMPH_STRUCTS_H__ +#define __CMPH_STRUCTS_H__ + +#include "cmph.h" + +/** Hash generation algorithm data + */ +struct __config_t +{ + CMPH_ALGO algo; + cmph_io_adapter_t *key_source; + cmph_uint32 verbosity; + double c; + void *data; // algorithm dependent data +}; + +/** Hash querying algorithm data + */ +struct __cmph_t +{ + CMPH_ALGO algo; + cmph_uint32 size; + cmph_io_adapter_t *key_source; + void *data; // algorithm dependent data +}; + +cmph_config_t *__config_new(cmph_io_adapter_t *key_source); +void __config_destroy(cmph_config_t*); +void __cmph_dump(cmph_t *mphf, FILE *); +cmph_t *__cmph_load(FILE *f); + + +#endif diff --git a/cmph/cmph_time.h b/cmph/cmph_time.h new file mode 100644 index 000000000..d8018090c --- /dev/null +++ b/cmph/cmph_time.h @@ -0,0 +1,62 @@ +#ifdef ELAPSED_TIME_IN_SECONDS +#undef ELAPSED_TIME_IN_SECONDS +#endif + +#ifdef ELAPSED_TIME_IN_uSECONDS +#undef ELAPSED_TIME_IN_uSECONDS +#endif + +#ifdef WIN32 +// include headers to use gettimeofday +#else + #ifdef __GNUC__ + #include + #include + #endif +#endif + +#ifdef __GNUC__ + #ifndef __CMPH_TIME_H__ + #define __CMPH_TIME_H__ + static inline void elapsed_time_in_seconds(double * elapsed_time) + { + struct timeval e_time; + if (gettimeofday(&e_time, NULL) < 0) { + return; + } + *elapsed_time = (double)e_time.tv_sec + ((double)e_time.tv_usec/1000000.0); + } + static inline void dummy_elapsed_time_in_seconds() + { + } + static inline void elapsed_time_in_useconds(cmph_uint64 * elapsed_time) + { + struct timeval e_time; + if (gettimeofday(&e_time, NULL) < 0) { + return; + } + *elapsed_time = (cmph_uint64)(e_time.tv_sec*1000000 + e_time.tv_usec); + } + static inline void dummy_elapsed_time_in_useconds() + { + } + #endif +#endif + +#ifdef CMPH_TIMING + #ifdef __GNUC__ + #define ELAPSED_TIME_IN_SECONDS elapsed_time_in_seconds + #define ELAPSED_TIME_IN_uSECONDS elapsed_time_in_useconds + #else + #define ELAPSED_TIME_IN_SECONDS dummy_elapsed_time_in_seconds + #define ELAPSED_TIME_IN_uSECONDS dummy_elapsed_time_in_useconds + #endif +#else + #ifdef __GNUC__ + #define ELAPSED_TIME_IN_SECONDS + #define ELAPSED_TIME_IN_uSECONDS + #else + #define ELAPSED_TIME_IN_SECONDS dummy_elapsed_time_in_seconds + #define ELAPSED_TIME_IN_uSECONDS dummy_elapsed_time_in_useconds + #endif +#endif diff --git a/cmph/cmph_types.h b/cmph/cmph_types.h new file mode 100644 index 000000000..40f43329a --- /dev/null +++ b/cmph/cmph_types.h @@ -0,0 +1,42 @@ +#ifndef __CMPH_TYPES_H__ +#define __CMPH_TYPES_H__ + +typedef char cmph_int8; +typedef unsigned char cmph_uint8; + +typedef short cmph_int16; +typedef unsigned short cmph_uint16; + +typedef int cmph_int32; +typedef unsigned int cmph_uint32; + +#if defined(__ia64) || defined(__x86_64__) + /** \typedef long cmph_int64; + * \brief 64-bit integer for a 64-bit achitecture. + */ + typedef long cmph_int64; + + /** \typedef unsigned long cmph_uint64; + * \brief Unsigned 64-bit integer for a 64-bit achitecture. + */ + typedef unsigned long cmph_uint64; +#else + /** \typedef long long cmph_int64; + * \brief 64-bit integer for a 32-bit achitecture. + */ + typedef long long cmph_int64; + + /** \typedef unsigned long long cmph_uint64; + * \brief Unsigned 64-bit integer for a 32-bit achitecture. + */ + typedef unsigned long long cmph_uint64; +#endif + +typedef enum { CMPH_HASH_JENKINS, CMPH_HASH_COUNT } CMPH_HASH; +extern const char *cmph_hash_names[]; +typedef enum { CMPH_BMZ, CMPH_BMZ8, CMPH_CHM, CMPH_BRZ, CMPH_FCH, + CMPH_BDZ, CMPH_BDZ_PH, + CMPH_CHD_PH, CMPH_CHD, CMPH_COUNT } CMPH_ALGO; +extern const char *cmph_names[]; + +#endif diff --git a/cmph/compressed_rank.c b/cmph/compressed_rank.c new file mode 100644 index 000000000..822b2e155 --- /dev/null +++ b/cmph/compressed_rank.c @@ -0,0 +1,321 @@ +#include +#include +#include +#include +#include"compressed_rank.h" +#include"bitbool.h" +// #define DEBUG +#include"debug.h" +static inline cmph_uint32 compressed_rank_i_log2(cmph_uint32 x) +{ + register cmph_uint32 res = 0; + + while(x > 1) + { + x >>= 1; + res++; + } + return res; +}; + +void compressed_rank_init(compressed_rank_t * cr) +{ + cr->max_val = 0; + cr->n = 0; + cr->rem_r = 0; + select_init(&cr->sel); + cr->vals_rems = 0; +} + +void compressed_rank_destroy(compressed_rank_t * cr) +{ + free(cr->vals_rems); + cr->vals_rems = 0; + select_destroy(&cr->sel); +} + +void compressed_rank_generate(compressed_rank_t * cr, cmph_uint32 * vals_table, cmph_uint32 n) +{ + register cmph_uint32 i,j; + register cmph_uint32 rems_mask; + register cmph_uint32 * select_vec = 0; + cr->n = n; + cr->max_val = vals_table[cr->n - 1]; + cr->rem_r = compressed_rank_i_log2(cr->max_val/cr->n); + if(cr->rem_r == 0) + { + cr->rem_r = 1; + } + select_vec = (cmph_uint32 *) calloc(cr->max_val >> cr->rem_r, sizeof(cmph_uint32)); + cr->vals_rems = (cmph_uint32 *) calloc(BITS_TABLE_SIZE(cr->n, cr->rem_r), sizeof(cmph_uint32)); + rems_mask = (1U << cr->rem_r) - 1U; + + for(i = 0; i < cr->n; i++) + { + set_bits_value(cr->vals_rems, i, vals_table[i] & rems_mask, cr->rem_r, rems_mask); + } + + for(i = 1, j = 0; i <= cr->max_val >> cr->rem_r; i++) + { + while(i > (vals_table[j] >> cr->rem_r)) + { + j++; + } + select_vec[i - 1] = j; + }; + + + // FABIANO: before it was (cr->total_length >> cr->rem_r) + 1. But I wiped out the + 1 because + // I changed the select structure to work up to m, instead of up to m - 1. + select_generate(&cr->sel, select_vec, cr->max_val >> cr->rem_r, cr->n); + + free(select_vec); +} + +cmph_uint32 compressed_rank_query(compressed_rank_t * cr, cmph_uint32 idx) +{ + register cmph_uint32 rems_mask; + register cmph_uint32 val_quot, val_rem; + register cmph_uint32 sel_res, rank; + + if(idx > cr->max_val) + { + return cr->n; + } + + val_quot = idx >> cr->rem_r; + rems_mask = (1U << cr->rem_r) - 1U; + val_rem = idx & rems_mask; + if(val_quot == 0) + { + rank = sel_res = 0; + } + else + { + sel_res = select_query(&cr->sel, val_quot - 1) + 1; + rank = sel_res - val_quot; + } + + do + { + if(GETBIT32(cr->sel.bits_vec, sel_res)) + { + break; + } + if(get_bits_value(cr->vals_rems, rank, cr->rem_r, rems_mask) >= val_rem) + { + break; + } + sel_res++; + rank++; + } while(1); + + return rank; +} + +cmph_uint32 compressed_rank_get_space_usage(compressed_rank_t * cr) +{ + register cmph_uint32 space_usage = select_get_space_usage(&cr->sel); + space_usage += BITS_TABLE_SIZE(cr->n, cr->rem_r)*(cmph_uint32)sizeof(cmph_uint32)*8; + space_usage += 3*(cmph_uint32)sizeof(cmph_uint32)*8; + return space_usage; +} + +void compressed_rank_dump(compressed_rank_t * cr, char **buf, cmph_uint32 *buflen) +{ + register cmph_uint32 sel_size = select_packed_size(&(cr->sel)); + register cmph_uint32 vals_rems_size = BITS_TABLE_SIZE(cr->n, cr->rem_r) * (cmph_uint32)sizeof(cmph_uint32); + register cmph_uint32 pos = 0; + char * buf_sel = 0; + cmph_uint32 buflen_sel = 0; + + *buflen = 4*(cmph_uint32)sizeof(cmph_uint32) + sel_size + vals_rems_size; + + DEBUGP("sel_size = %u\n", sel_size); + DEBUGP("vals_rems_size = %u\n", vals_rems_size); + + *buf = (char *)calloc(*buflen, sizeof(char)); + + if (!*buf) + { + *buflen = UINT_MAX; + return; + } + + // dumping max_val, n and rem_r + memcpy(*buf, &(cr->max_val), sizeof(cmph_uint32)); + pos += (cmph_uint32)sizeof(cmph_uint32); + DEBUGP("max_val = %u\n", cr->max_val); + + memcpy(*buf + pos, &(cr->n), sizeof(cmph_uint32)); + pos += (cmph_uint32)sizeof(cmph_uint32); + DEBUGP("n = %u\n", cr->n); + + memcpy(*buf + pos, &(cr->rem_r), sizeof(cmph_uint32)); + pos += (cmph_uint32)sizeof(cmph_uint32); + DEBUGP("rem_r = %u\n", cr->rem_r); + + // dumping sel + select_dump(&cr->sel, &buf_sel, &buflen_sel); + memcpy(*buf + pos, &buflen_sel, sizeof(cmph_uint32)); + pos += (cmph_uint32)sizeof(cmph_uint32); + DEBUGP("buflen_sel = %u\n", buflen_sel); + + memcpy(*buf + pos, buf_sel, buflen_sel); + + #ifdef DEBUG + cmph_uint32 i = 0; + for(i = 0; i < buflen_sel; i++) + { + DEBUGP("pos = %u -- buf_sel[%u] = %u\n", pos, i, *(*buf + pos + i)); + } + #endif + pos += buflen_sel; + + free(buf_sel); + + // dumping vals_rems + memcpy(*buf + pos, cr->vals_rems, vals_rems_size); + #ifdef DEBUG + for(i = 0; i < vals_rems_size; i++) + { + DEBUGP("pos = %u -- vals_rems_size = %u -- vals_rems[%u] = %u\n", pos, vals_rems_size, i, *(*buf + pos + i)); + } + #endif + pos += vals_rems_size; + + DEBUGP("Dumped compressed rank structure with size %u bytes\n", *buflen); +} + +void compressed_rank_load(compressed_rank_t * cr, const char *buf, cmph_uint32 buflen) +{ + register cmph_uint32 pos = 0; + cmph_uint32 buflen_sel = 0; + register cmph_uint32 vals_rems_size = 0; + + // loading max_val, n, and rem_r + memcpy(&(cr->max_val), buf, sizeof(cmph_uint32)); + pos += (cmph_uint32)sizeof(cmph_uint32); + DEBUGP("max_val = %u\n", cr->max_val); + + memcpy(&(cr->n), buf + pos, sizeof(cmph_uint32)); + pos += (cmph_uint32)sizeof(cmph_uint32); + DEBUGP("n = %u\n", cr->n); + + memcpy(&(cr->rem_r), buf + pos, sizeof(cmph_uint32)); + pos += (cmph_uint32)sizeof(cmph_uint32); + DEBUGP("rem_r = %u\n", cr->rem_r); + + // loading sel + memcpy(&buflen_sel, buf + pos, sizeof(cmph_uint32)); + pos += (cmph_uint32)sizeof(cmph_uint32); + DEBUGP("buflen_sel = %u\n", buflen_sel); + + select_load(&cr->sel, buf + pos, buflen_sel); + #ifdef DEBUG + cmph_uint32 i = 0; + for(i = 0; i < buflen_sel; i++) + { + DEBUGP("pos = %u -- buf_sel[%u] = %u\n", pos, i, *(buf + pos + i)); + } + #endif + pos += buflen_sel; + + // loading vals_rems + if(cr->vals_rems) + { + free(cr->vals_rems); + } + vals_rems_size = BITS_TABLE_SIZE(cr->n, cr->rem_r); + cr->vals_rems = (cmph_uint32 *) calloc(vals_rems_size, sizeof(cmph_uint32)); + vals_rems_size *= 4; + memcpy(cr->vals_rems, buf + pos, vals_rems_size); + + #ifdef DEBUG + for(i = 0; i < vals_rems_size; i++) + { + DEBUGP("pos = %u -- vals_rems_size = %u -- vals_rems[%u] = %u\n", pos, vals_rems_size, i, *(buf + pos + i)); + } + #endif + pos += vals_rems_size; + + DEBUGP("Loaded compressed rank structure with size %u bytes\n", buflen); +} + + + +void compressed_rank_pack(compressed_rank_t *cr, void *cr_packed) +{ + if (cr && cr_packed) + { + char *buf = NULL; + cmph_uint32 buflen = 0; + compressed_rank_dump(cr, &buf, &buflen); + memcpy(cr_packed, buf, buflen); + free(buf); + } +} + +cmph_uint32 compressed_rank_packed_size(compressed_rank_t *cr) +{ + register cmph_uint32 sel_size = select_packed_size(&cr->sel); + register cmph_uint32 vals_rems_size = BITS_TABLE_SIZE(cr->n, cr->rem_r) * (cmph_uint32)sizeof(cmph_uint32); + return 4 * (cmph_uint32)sizeof(cmph_uint32) + sel_size + vals_rems_size; +} + +cmph_uint32 compressed_rank_query_packed(void * cr_packed, cmph_uint32 idx) +{ + // unpacking cr_packed + register cmph_uint32 *ptr = (cmph_uint32 *)cr_packed; + register cmph_uint32 max_val = *ptr++; + register cmph_uint32 n = *ptr++; + register cmph_uint32 rem_r = *ptr++; + register cmph_uint32 buflen_sel = *ptr++; + register cmph_uint32 * sel_packed = ptr; + + register cmph_uint32 * bits_vec = sel_packed + 2; // skipping n and m + + register cmph_uint32 * vals_rems = (ptr += (buflen_sel >> 2)); + + // compressed sequence query computation + register cmph_uint32 rems_mask; + register cmph_uint32 val_quot, val_rem; + register cmph_uint32 sel_res, rank; + + if(idx > max_val) + { + return n; + } + + val_quot = idx >> rem_r; + rems_mask = (1U << rem_r) - 1U; + val_rem = idx & rems_mask; + if(val_quot == 0) + { + rank = sel_res = 0; + } + else + { + sel_res = select_query_packed(sel_packed, val_quot - 1) + 1; + rank = sel_res - val_quot; + } + + do + { + if(GETBIT32(bits_vec, sel_res)) + { + break; + } + if(get_bits_value(vals_rems, rank, rem_r, rems_mask) >= val_rem) + { + break; + } + sel_res++; + rank++; + } while(1); + + return rank; +} + + + diff --git a/cmph/compressed_rank.h b/cmph/compressed_rank.h new file mode 100644 index 000000000..bfe930dd3 --- /dev/null +++ b/cmph/compressed_rank.h @@ -0,0 +1,55 @@ +#ifndef __CMPH_COMPRESSED_RANK_H__ +#define __CMPH_COMPRESSED_RANK_H__ + +#include "select.h" + +struct _compressed_rank_t +{ + cmph_uint32 max_val; + cmph_uint32 n; // number of values stored in vals_rems + // The length in bits of each value is decomposed into two compnents: the lg(n) MSBs are stored in rank_select data structure + // the remaining LSBs are stored in a table of n cells, each one of rem_r bits. + cmph_uint32 rem_r; + select_t sel; + cmph_uint32 * vals_rems; +}; + +typedef struct _compressed_rank_t compressed_rank_t; + +void compressed_rank_init(compressed_rank_t * cr); + +void compressed_rank_destroy(compressed_rank_t * cr); + +void compressed_rank_generate(compressed_rank_t * cr, cmph_uint32 * vals_table, cmph_uint32 n); + +cmph_uint32 compressed_rank_query(compressed_rank_t * cr, cmph_uint32 idx); + +cmph_uint32 compressed_rank_get_space_usage(compressed_rank_t * cr); + +void compressed_rank_dump(compressed_rank_t * cr, char **buf, cmph_uint32 *buflen); + +void compressed_rank_load(compressed_rank_t * cr, const char *buf, cmph_uint32 buflen); + + +/** \fn void compressed_rank_pack(compressed_rank_t *cr, void *cr_packed); + * \brief Support the ability to pack a compressed_rank structure into a preallocated contiguous memory space pointed by cr_packed. + * \param cr points to the compressed_rank structure + * \param cr_packed pointer to the contiguous memory area used to store the compressed_rank structure. The size of cr_packed must be at least @see compressed_rank_packed_size + */ +void compressed_rank_pack(compressed_rank_t *cr, void *cr_packed); + +/** \fn cmph_uint32 compressed_rank_packed_size(compressed_rank_t *cr); + * \brief Return the amount of space needed to pack a compressed_rank structure. + * \return the size of the packed compressed_rank structure or zero for failures + */ +cmph_uint32 compressed_rank_packed_size(compressed_rank_t *cr); + + +/** \fn cmph_uint32 compressed_rank_query_packed(void * cr_packed, cmph_uint32 idx); + * \param cr_packed is a pointer to a contiguous memory area + * \param idx is an index to compute the rank + * \return an integer that represents the compressed_rank value. + */ +cmph_uint32 compressed_rank_query_packed(void * cr_packed, cmph_uint32 idx); + +#endif diff --git a/cmph/compressed_seq.c b/cmph/compressed_seq.c new file mode 100644 index 000000000..e558196d4 --- /dev/null +++ b/cmph/compressed_seq.c @@ -0,0 +1,378 @@ +#include "compressed_seq.h" +#include +#include +#include +#include +#include + +#include "bitbool.h" + +// #define DEBUG +#include "debug.h" + +static inline cmph_uint32 compressed_seq_i_log2(cmph_uint32 x) +{ + register cmph_uint32 res = 0; + + while(x > 1) + { + x >>= 1; + res++; + } + return res; +}; + +void compressed_seq_init(compressed_seq_t * cs) +{ + select_init(&cs->sel); + cs->n = 0; + cs->rem_r = 0; + cs->length_rems = 0; + cs->total_length = 0; + cs->store_table = 0; +} + +void compressed_seq_destroy(compressed_seq_t * cs) +{ + free(cs->store_table); + cs->store_table = 0; + free(cs->length_rems); + cs->length_rems = 0; + select_destroy(&cs->sel); +}; + + +void compressed_seq_generate(compressed_seq_t * cs, cmph_uint32 * vals_table, cmph_uint32 n) +{ + register cmph_uint32 i; + // lengths: represents lengths of encoded values + register cmph_uint32 * lengths = (cmph_uint32 *)calloc(n, sizeof(cmph_uint32)); + register cmph_uint32 rems_mask; + register cmph_uint32 stored_value; + + cs->n = n; + cs->total_length = 0; + + for(i = 0; i < cs->n; i++) + { + if(vals_table[i] == 0) + { + lengths[i] = 0; + } + else + { + lengths[i] = compressed_seq_i_log2(vals_table[i] + 1); + cs->total_length += lengths[i]; + }; + }; + + if(cs->store_table) + { + free(cs->store_table); + } + cs->store_table = (cmph_uint32 *) calloc(((cs->total_length + 31) >> 5), sizeof(cmph_uint32)); + cs->total_length = 0; + + for(i = 0; i < cs->n; i++) + { + if(vals_table[i] == 0) + continue; + stored_value = vals_table[i] - ((1U << lengths[i]) - 1U); + set_bits_at_pos(cs->store_table, cs->total_length, stored_value, lengths[i]); + cs->total_length += lengths[i]; + }; + + cs->rem_r = compressed_seq_i_log2(cs->total_length/cs->n); + + if(cs->rem_r == 0) + { + cs->rem_r = 1; + } + + if(cs->length_rems) + { + free(cs->length_rems); + } + + cs->length_rems = (cmph_uint32 *) calloc(BITS_TABLE_SIZE(cs->n, cs->rem_r), sizeof(cmph_uint32)); + + rems_mask = (1U << cs->rem_r) - 1U; + cs->total_length = 0; + + for(i = 0; i < cs->n; i++) + { + cs->total_length += lengths[i]; + set_bits_value(cs->length_rems, i, cs->total_length & rems_mask, cs->rem_r, rems_mask); + lengths[i] = cs->total_length >> cs->rem_r; + }; + + select_init(&cs->sel); + + // FABIANO: before it was (cs->total_length >> cs->rem_r) + 1. But I wiped out the + 1 because + // I changed the select structure to work up to m, instead of up to m - 1. + select_generate(&cs->sel, lengths, cs->n, (cs->total_length >> cs->rem_r)); + + free(lengths); +}; + +cmph_uint32 compressed_seq_get_space_usage(compressed_seq_t * cs) +{ + register cmph_uint32 space_usage = select_get_space_usage(&cs->sel); + space_usage += ((cs->total_length + 31) >> 5) * (cmph_uint32)sizeof(cmph_uint32) * 8; + space_usage += BITS_TABLE_SIZE(cs->n, cs->rem_r) * (cmph_uint32)sizeof(cmph_uint32) * 8; + return 4 * (cmph_uint32)sizeof(cmph_uint32) * 8 + space_usage; +} + +cmph_uint32 compressed_seq_query(compressed_seq_t * cs, cmph_uint32 idx) +{ + register cmph_uint32 enc_idx, enc_length; + register cmph_uint32 rems_mask; + register cmph_uint32 stored_value; + register cmph_uint32 sel_res; + + assert(idx < cs->n); // FABIANO ADDED + + rems_mask = (1U << cs->rem_r) - 1U; + + if(idx == 0) + { + enc_idx = 0; + sel_res = select_query(&cs->sel, idx); + } + else + { + sel_res = select_query(&cs->sel, idx - 1); + + enc_idx = (sel_res - (idx - 1)) << cs->rem_r; + enc_idx += get_bits_value(cs->length_rems, idx-1, cs->rem_r, rems_mask); + + sel_res = select_next_query(&cs->sel, sel_res); + }; + + enc_length = (sel_res - idx) << cs->rem_r; + enc_length += get_bits_value(cs->length_rems, idx, cs->rem_r, rems_mask); + enc_length -= enc_idx; + if(enc_length == 0) + return 0; + + stored_value = get_bits_at_pos(cs->store_table, enc_idx, enc_length); + return stored_value + ((1U << enc_length) - 1U); +}; + +void compressed_seq_dump(compressed_seq_t * cs, char ** buf, cmph_uint32 * buflen) +{ + register cmph_uint32 sel_size = select_packed_size(&(cs->sel)); + register cmph_uint32 length_rems_size = BITS_TABLE_SIZE(cs->n, cs->rem_r) * 4; + register cmph_uint32 store_table_size = ((cs->total_length + 31) >> 5) * 4; + register cmph_uint32 pos = 0; + char * buf_sel = 0; + cmph_uint32 buflen_sel = 0; + + *buflen = 4*(cmph_uint32)sizeof(cmph_uint32) + sel_size + length_rems_size + store_table_size; + + DEBUGP("sel_size = %u\n", sel_size); + DEBUGP("length_rems_size = %u\n", length_rems_size); + DEBUGP("store_table_size = %u\n", store_table_size); + *buf = (char *)calloc(*buflen, sizeof(char)); + + if (!*buf) + { + *buflen = UINT_MAX; + return; + } + + // dumping n, rem_r and total_length + memcpy(*buf, &(cs->n), sizeof(cmph_uint32)); + pos += (cmph_uint32)sizeof(cmph_uint32); + DEBUGP("n = %u\n", cs->n); + + memcpy(*buf + pos, &(cs->rem_r), sizeof(cmph_uint32)); + pos += (cmph_uint32)sizeof(cmph_uint32); + DEBUGP("rem_r = %u\n", cs->rem_r); + + memcpy(*buf + pos, &(cs->total_length), sizeof(cmph_uint32)); + pos += (cmph_uint32)sizeof(cmph_uint32); + DEBUGP("total_length = %u\n", cs->total_length); + + + // dumping sel + select_dump(&cs->sel, &buf_sel, &buflen_sel); + memcpy(*buf + pos, &buflen_sel, sizeof(cmph_uint32)); + pos += (cmph_uint32)sizeof(cmph_uint32); + DEBUGP("buflen_sel = %u\n", buflen_sel); + + memcpy(*buf + pos, buf_sel, buflen_sel); + #ifdef DEBUG + cmph_uint32 i = 0; + for(i = 0; i < buflen_sel; i++) + { + DEBUGP("pos = %u -- buf_sel[%u] = %u\n", pos, i, *(*buf + pos + i)); + } + #endif + pos += buflen_sel; + + free(buf_sel); + + // dumping length_rems + memcpy(*buf + pos, cs->length_rems, length_rems_size); + #ifdef DEBUG + for(i = 0; i < length_rems_size; i++) + { + DEBUGP("pos = %u -- length_rems_size = %u -- length_rems[%u] = %u\n", pos, length_rems_size, i, *(*buf + pos + i)); + } + #endif + pos += length_rems_size; + + // dumping store_table + memcpy(*buf + pos, cs->store_table, store_table_size); + + #ifdef DEBUG + for(i = 0; i < store_table_size; i++) + { + DEBUGP("pos = %u -- store_table_size = %u -- store_table[%u] = %u\n", pos, store_table_size, i, *(*buf + pos + i)); + } + #endif + DEBUGP("Dumped compressed sequence structure with size %u bytes\n", *buflen); +} + +void compressed_seq_load(compressed_seq_t * cs, const char * buf, cmph_uint32 buflen) +{ + register cmph_uint32 pos = 0; + cmph_uint32 buflen_sel = 0; + register cmph_uint32 length_rems_size = 0; + register cmph_uint32 store_table_size = 0; + + // loading n, rem_r and total_length + memcpy(&(cs->n), buf, sizeof(cmph_uint32)); + pos += (cmph_uint32)sizeof(cmph_uint32); + DEBUGP("n = %u\n", cs->n); + + memcpy(&(cs->rem_r), buf + pos, sizeof(cmph_uint32)); + pos += (cmph_uint32)sizeof(cmph_uint32); + DEBUGP("rem_r = %u\n", cs->rem_r); + + memcpy(&(cs->total_length), buf + pos, sizeof(cmph_uint32)); + pos += (cmph_uint32)sizeof(cmph_uint32); + DEBUGP("total_length = %u\n", cs->total_length); + + // loading sel + memcpy(&buflen_sel, buf + pos, sizeof(cmph_uint32)); + pos += (cmph_uint32)sizeof(cmph_uint32); + DEBUGP("buflen_sel = %u\n", buflen_sel); + + select_load(&cs->sel, buf + pos, buflen_sel); + #ifdef DEBUG + cmph_uint32 i = 0; + for(i = 0; i < buflen_sel; i++) + { + DEBUGP("pos = %u -- buf_sel[%u] = %u\n", pos, i, *(buf + pos + i)); + } + #endif + pos += buflen_sel; + + // loading length_rems + if(cs->length_rems) + { + free(cs->length_rems); + } + length_rems_size = BITS_TABLE_SIZE(cs->n, cs->rem_r); + cs->length_rems = (cmph_uint32 *) calloc(length_rems_size, sizeof(cmph_uint32)); + length_rems_size *= 4; + memcpy(cs->length_rems, buf + pos, length_rems_size); + + #ifdef DEBUG + for(i = 0; i < length_rems_size; i++) + { + DEBUGP("pos = %u -- length_rems_size = %u -- length_rems[%u] = %u\n", pos, length_rems_size, i, *(buf + pos + i)); + } + #endif + pos += length_rems_size; + + // loading store_table + store_table_size = ((cs->total_length + 31) >> 5); + if(cs->store_table) + { + free(cs->store_table); + } + cs->store_table = (cmph_uint32 *) calloc(store_table_size, sizeof(cmph_uint32)); + store_table_size *= 4; + memcpy(cs->store_table, buf + pos, store_table_size); + + #ifdef DEBUG + for(i = 0; i < store_table_size; i++) + { + DEBUGP("pos = %u -- store_table_size = %u -- store_table[%u] = %u\n", pos, store_table_size, i, *(buf + pos + i)); + } + #endif + + DEBUGP("Loaded compressed sequence structure with size %u bytes\n", buflen); +} + +void compressed_seq_pack(compressed_seq_t *cs, void *cs_packed) +{ + if (cs && cs_packed) + { + char *buf = NULL; + cmph_uint32 buflen = 0; + compressed_seq_dump(cs, &buf, &buflen); + memcpy(cs_packed, buf, buflen); + free(buf); + } + +} + +cmph_uint32 compressed_seq_packed_size(compressed_seq_t *cs) +{ + register cmph_uint32 sel_size = select_packed_size(&cs->sel); + register cmph_uint32 store_table_size = ((cs->total_length + 31) >> 5) * (cmph_uint32)sizeof(cmph_uint32); + register cmph_uint32 length_rems_size = BITS_TABLE_SIZE(cs->n, cs->rem_r) * (cmph_uint32)sizeof(cmph_uint32); + return 4 * (cmph_uint32)sizeof(cmph_uint32) + sel_size + store_table_size + length_rems_size; +} + + +cmph_uint32 compressed_seq_query_packed(void * cs_packed, cmph_uint32 idx) +{ + // unpacking cs_packed + register cmph_uint32 *ptr = (cmph_uint32 *)cs_packed; + register cmph_uint32 n = *ptr++; + register cmph_uint32 rem_r = *ptr++; + ptr++; // skipping total_length +// register cmph_uint32 total_length = *ptr++; + register cmph_uint32 buflen_sel = *ptr++; + register cmph_uint32 * sel_packed = ptr; + register cmph_uint32 * length_rems = (ptr += (buflen_sel >> 2)); + register cmph_uint32 length_rems_size = BITS_TABLE_SIZE(n, rem_r); + register cmph_uint32 * store_table = (ptr += length_rems_size); + + // compressed sequence query computation + register cmph_uint32 enc_idx, enc_length; + register cmph_uint32 rems_mask; + register cmph_uint32 stored_value; + register cmph_uint32 sel_res; + + rems_mask = (1U << rem_r) - 1U; + + if(idx == 0) + { + enc_idx = 0; + sel_res = select_query_packed(sel_packed, idx); + } + else + { + sel_res = select_query_packed(sel_packed, idx - 1); + + enc_idx = (sel_res - (idx - 1)) << rem_r; + enc_idx += get_bits_value(length_rems, idx-1, rem_r, rems_mask); + + sel_res = select_next_query_packed(sel_packed, sel_res); + }; + + enc_length = (sel_res - idx) << rem_r; + enc_length += get_bits_value(length_rems, idx, rem_r, rems_mask); + enc_length -= enc_idx; + if(enc_length == 0) + return 0; + + stored_value = get_bits_at_pos(store_table, enc_idx, enc_length); + return stored_value + ((1U << enc_length) - 1U); +} diff --git a/cmph/compressed_seq.h b/cmph/compressed_seq.h new file mode 100644 index 000000000..8d87fc700 --- /dev/null +++ b/cmph/compressed_seq.h @@ -0,0 +1,84 @@ +#ifndef __CMPH_COMPRESSED_SEQ_H__ +#define __CMPH_COMPRESSED_SEQ_H__ + +#include"select.h" + +struct _compressed_seq_t +{ + cmph_uint32 n; // number of values stored in store_table + // The length in bits of each value is decomposed into two compnents: the lg(n) MSBs are stored in rank_select data structure + // the remaining LSBs are stored in a table of n cells, each one of rem_r bits. + cmph_uint32 rem_r; + cmph_uint32 total_length; // total length in bits of stored_table + select_t sel; + cmph_uint32 * length_rems; + cmph_uint32 * store_table; +}; + +typedef struct _compressed_seq_t compressed_seq_t; + +/** \fn void compressed_seq_init(compressed_seq_t * cs); + * \brief Initialize a compressed sequence structure. + * \param cs points to the compressed sequence structure to be initialized + */ +void compressed_seq_init(compressed_seq_t * cs); + +/** \fn void compressed_seq_destroy(compressed_seq_t * cs); + * \brief Destroy a compressed sequence given as input. + * \param cs points to the compressed sequence structure to be destroyed + */ +void compressed_seq_destroy(compressed_seq_t * cs); + +/** \fn void compressed_seq_generate(compressed_seq_t * cs, cmph_uint32 * vals_table, cmph_uint32 n); + * \brief Generate a compressed sequence from an input array with n values. + * \param cs points to the compressed sequence structure + * \param vals_table poiter to the array given as input + * \param n number of values in @see vals_table + */ +void compressed_seq_generate(compressed_seq_t * cs, cmph_uint32 * vals_table, cmph_uint32 n); + + +/** \fn cmph_uint32 compressed_seq_query(compressed_seq_t * cs, cmph_uint32 idx); + * \brief Returns the value stored at index @see idx of the compressed sequence structure. + * \param cs points to the compressed sequence structure + * \param idx index to retrieve the value from + * \return the value stored at index @see idx of the compressed sequence structure + */ +cmph_uint32 compressed_seq_query(compressed_seq_t * cs, cmph_uint32 idx); + + +/** \fn cmph_uint32 compressed_seq_get_space_usage(compressed_seq_t * cs); + * \brief Returns amount of space (in bits) to store the compressed sequence. + * \param cs points to the compressed sequence structure + * \return the amount of space (in bits) to store @see cs + */ +cmph_uint32 compressed_seq_get_space_usage(compressed_seq_t * cs); + +void compressed_seq_dump(compressed_seq_t * cs, char ** buf, cmph_uint32 * buflen); + +void compressed_seq_load(compressed_seq_t * cs, const char * buf, cmph_uint32 buflen); + + +/** \fn void compressed_seq_pack(compressed_seq_t *cs, void *cs_packed); + * \brief Support the ability to pack a compressed sequence structure into a preallocated contiguous memory space pointed by cs_packed. + * \param cs points to the compressed sequence structure + * \param cs_packed pointer to the contiguous memory area used to store the compressed sequence structure. The size of cs_packed must be at least @see compressed_seq_packed_size + */ +void compressed_seq_pack(compressed_seq_t *cs, void *cs_packed); + +/** \fn cmph_uint32 compressed_seq_packed_size(compressed_seq_t *cs); + * \brief Return the amount of space needed to pack a compressed sequence structure. + * \return the size of the packed compressed sequence structure or zero for failures + */ +cmph_uint32 compressed_seq_packed_size(compressed_seq_t *cs); + + +/** \fn cmph_uint32 compressed_seq_query_packed(void * cs_packed, cmph_uint32 idx); + * \brief Returns the value stored at index @see idx of the packed compressed sequence structure. + * \param cs_packed is a pointer to a contiguous memory area + * \param idx is the index to retrieve the value from + * \return the value stored at index @see idx of the packed compressed sequence structure + */ +cmph_uint32 compressed_seq_query_packed(void * cs_packed, cmph_uint32 idx); + +#endif diff --git a/cmph/debug.h b/cmph/debug.h new file mode 100644 index 000000000..0f7ddb139 --- /dev/null +++ b/cmph/debug.h @@ -0,0 +1,53 @@ +#ifdef DEBUGP +#undef DEBUGP +#endif + +#ifdef __cplusplus +#include +#ifdef WIN32 +#include +#endif +#else +#include +#ifdef WIN32 +#include +#endif +#endif + +#ifndef __GNUC__ +#ifndef __DEBUG_H__ +#define __DEBUG_H__ +#include +static void debugprintf(const char *format, ...) +{ + va_list ap; + char *f = NULL; + const char *p="%s:%d "; + size_t plen = strlen(p); + va_start(ap, format); + f = (char *)malloc(plen + strlen(format) + 1); + if (!f) return; + memcpy(f, p, plen); + memcpy(f + plen, format, strlen(format) + 1); + vfprintf(stderr, f, ap); + va_end(ap); + free(f); +} +static void dummyprintf(const char *format, ...) +{} +#endif +#endif + +#ifdef DEBUG +#ifndef __GNUC__ +#define DEBUGP debugprintf +#else +#define DEBUGP(args...) do { fprintf(stderr, "%s:%d ", __FILE__, __LINE__); fprintf(stderr, ## args); } while(0) +#endif +#else +#ifndef __GNUC__ +#define DEBUGP dummyprintf +#else +#define DEBUGP(args...) +#endif +#endif diff --git a/cmph/djb2_hash.c b/cmph/djb2_hash.c new file mode 100644 index 000000000..d3b4330ab --- /dev/null +++ b/cmph/djb2_hash.c @@ -0,0 +1,49 @@ +#include "djb2_hash.h" +#include + +djb2_state_t *djb2_state_new() +{ + djb2_state_t *state = (djb2_state_t *)malloc(sizeof(djb2_state_t)); + state->hashfunc = CMPH_HASH_DJB2; + return state; +} + +void djb2_state_destroy(djb2_state_t *state) +{ + free(state); +} + +cmph_uint32 djb2_hash(djb2_state_t *state, const char *k, cmph_uint32 keylen) +{ + register cmph_uint32 hash = 5381; + const unsigned char *ptr = (unsigned char *)k; + cmph_uint32 i = 0; + while (i < keylen) + { + hash = hash*33 ^ *ptr; + ++ptr, ++i; + } + return hash; +} + + +void djb2_state_dump(djb2_state_t *state, char **buf, cmph_uint32 *buflen) +{ + *buf = NULL; + *buflen = 0; + return; +} + +djb2_state_t *djb2_state_copy(djb2_state_t *src_state) +{ + djb2_state_t *dest_state = (djb2_state_t *)malloc(sizeof(djb2_state_t)); + dest_state->hashfunc = src_state->hashfunc; + return dest_state; +} + +djb2_state_t *djb2_state_load(const char *buf, cmph_uint32 buflen) +{ + djb2_state_t *state = (djb2_state_t *)malloc(sizeof(djb2_state_t)); + state->hashfunc = CMPH_HASH_DJB2; + return state; +} diff --git a/cmph/djb2_hash.h b/cmph/djb2_hash.h new file mode 100644 index 000000000..dda97e317 --- /dev/null +++ b/cmph/djb2_hash.h @@ -0,0 +1,18 @@ +#ifndef __DJB2_HASH_H__ +#define __DJB2_HASH_H__ + +#include "hash.h" + +typedef struct __djb2_state_t +{ + CMPH_HASH hashfunc; +} djb2_state_t; + +djb2_state_t *djb2_state_new(); +cmph_uint32 djb2_hash(djb2_state_t *state, const char *k, cmph_uint32 keylen); +void djb2_state_dump(djb2_state_t *state, char **buf, cmph_uint32 *buflen); +djb2_state_t *djb2_state_copy(djb2_state_t *src_state); +djb2_state_t *djb2_state_load(const char *buf, cmph_uint32 buflen); +void djb2_state_destroy(djb2_state_t *state); + +#endif diff --git a/cmph/fch.c b/cmph/fch.c new file mode 100644 index 000000000..67b68fbbe --- /dev/null +++ b/cmph/fch.c @@ -0,0 +1,517 @@ +#include "fch.h" +#include "cmph_structs.h" +#include "fch_structs.h" +#include "hash.h" +#include "bitbool.h" +#include "fch_buckets.h" +#include +#include +#include +#include +#include +#define INDEX 0 /* alignment index within a bucket */ +//#define DEBUG +#include "debug.h" + +static fch_buckets_t * mapping(cmph_config_t *mph); +static cmph_uint32 * ordering(fch_buckets_t * buckets); +static cmph_uint8 check_for_collisions_h2(fch_config_data_t *fch, fch_buckets_t * buckets, cmph_uint32 *sorted_indexes); +static void permut(cmph_uint32 * vector, cmph_uint32 n); +static cmph_uint8 searching(fch_config_data_t *fch, fch_buckets_t *buckets, cmph_uint32 *sorted_indexes); + +fch_config_data_t *fch_config_new() +{ + fch_config_data_t *fch; + fch = (fch_config_data_t *)malloc(sizeof(fch_config_data_t)); + assert(fch); + memset(fch, 0, sizeof(fch_config_data_t)); + fch->hashfuncs[0] = CMPH_HASH_JENKINS; + fch->hashfuncs[1] = CMPH_HASH_JENKINS; + fch->m = fch->b = 0; + fch->c = fch->p1 = fch->p2 = 0.0; + fch->g = NULL; + fch->h1 = NULL; + fch->h2 = NULL; + return fch; +} + +void fch_config_destroy(cmph_config_t *mph) +{ + fch_config_data_t *data = (fch_config_data_t *)mph->data; + //DEBUGP("Destroying algorithm dependent data\n"); + free(data); +} + +void fch_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs) +{ + fch_config_data_t *fch = (fch_config_data_t *)mph->data; + CMPH_HASH *hashptr = hashfuncs; + cmph_uint32 i = 0; + while(*hashptr != CMPH_HASH_COUNT) + { + if (i >= 2) break; //fch only uses two hash functions + fch->hashfuncs[i] = *hashptr; + ++i, ++hashptr; + } +} + +cmph_uint32 mixh10h11h12(cmph_uint32 b, double p1, double p2, cmph_uint32 initial_index) +{ + register cmph_uint32 int_p2 = (cmph_uint32)p2; + if (initial_index < p1) initial_index %= int_p2; /* h11 o h10 */ + else { /* h12 o h10 */ + initial_index %= b; + if(initial_index < p2) initial_index += int_p2; + } + return initial_index; +} + + +cmph_uint32 fch_calc_b(double c, cmph_uint32 m) +{ + return (cmph_uint32)ceil((c*m)/(log((double)m)/log(2.0) + 1)); +} + +double fch_calc_p1(cmph_uint32 m) +{ + return ceil(0.55*m); +} + +double fch_calc_p2(cmph_uint32 b) +{ + return ceil(0.3*b); +} + +static fch_buckets_t * mapping(cmph_config_t *mph) +{ + cmph_uint32 i = 0; + fch_buckets_t *buckets = NULL; + fch_config_data_t *fch = (fch_config_data_t *)mph->data; + if (fch->h1) hash_state_destroy(fch->h1); + fch->h1 = hash_state_new(fch->hashfuncs[0], fch->m); + fch->b = fch_calc_b(fch->c, fch->m); + fch->p1 = fch_calc_p1(fch->m); + fch->p2 = fch_calc_p2(fch->b); + //DEBUGP("b:%u p1:%f p2:%f\n", fch->b, fch->p1, fch->p2); + buckets = fch_buckets_new(fch->b); + + mph->key_source->rewind(mph->key_source->data); + for(i = 0; i < fch->m; i++) + { + cmph_uint32 h1, keylen; + char *key = NULL; + mph->key_source->read(mph->key_source->data, &key, &keylen); + h1 = hash(fch->h1, key, keylen) % fch->m; + h1 = mixh10h11h12 (fch->b, fch->p1, fch->p2, h1); + fch_buckets_insert(buckets, h1, key, keylen); + key = NULL; // transger memory ownership + + } + return buckets; +} + + +// returns the buckets indexes sorted by their sizes. +static cmph_uint32 * ordering(fch_buckets_t * buckets) +{ + return fch_buckets_get_indexes_sorted_by_size(buckets); +} + +/* Check whether function h2 causes collisions among the keys of each bucket */ +static cmph_uint8 check_for_collisions_h2(fch_config_data_t *fch, fch_buckets_t * buckets, cmph_uint32 *sorted_indexes) +{ + //cmph_uint32 max_size = fch_buckets_get_max_size(buckets); + cmph_uint8 * hashtable = (cmph_uint8 *)calloc((size_t)fch->m, sizeof(cmph_uint8)); + cmph_uint32 nbuckets = fch_buckets_get_nbuckets(buckets); + cmph_uint32 i = 0, index = 0, j =0; + for (i = 0; i < nbuckets; i++) + { + cmph_uint32 nkeys = fch_buckets_get_size(buckets, sorted_indexes[i]); + memset(hashtable, 0, (size_t)fch->m); + //DEBUGP("bucket %u -- nkeys: %u\n", i, nkeys); + for (j = 0; j < nkeys; j++) + { + char * key = fch_buckets_get_key(buckets, sorted_indexes[i], j); + cmph_uint32 keylen = fch_buckets_get_keylength(buckets, sorted_indexes[i], j); + index = hash(fch->h2, key, keylen) % fch->m; + if(hashtable[index]) { // collision detected + free(hashtable); + return 1; + } + hashtable[index] = 1; + } + } + free(hashtable); + return 0; +} + +static void permut(cmph_uint32 * vector, cmph_uint32 n) +{ + cmph_uint32 i, j, b; + for (i = 0; i < n; i++) { + j = (cmph_uint32) rand() % n; + b = vector[i]; + vector[i] = vector[j]; + vector[j] = b; + } +} + +static cmph_uint8 searching(fch_config_data_t *fch, fch_buckets_t *buckets, cmph_uint32 *sorted_indexes) +{ + cmph_uint32 * random_table = (cmph_uint32 *) calloc((size_t)fch->m, sizeof(cmph_uint32)); + cmph_uint32 * map_table = (cmph_uint32 *) calloc((size_t)fch->m, sizeof(cmph_uint32)); + cmph_uint32 iteration_to_generate_h2 = 0; + cmph_uint32 searching_iterations = 0; + cmph_uint8 restart = 0; + cmph_uint32 nbuckets = fch_buckets_get_nbuckets(buckets); + cmph_uint32 i, j, z, counter = 0, filled_count = 0; + if (fch->g) free (fch->g); + fch->g = (cmph_uint32 *) calloc((size_t)fch->b, sizeof(cmph_uint32)); + + //DEBUGP("max bucket size: %u\n", fch_buckets_get_max_size(buckets)); + + for(i = 0; i < fch->m; i++) + { + random_table[i] = i; + } + permut(random_table, fch->m); + for(i = 0; i < fch->m; i++) + { + map_table[random_table[i]] = i; + } + do { + if (fch->h2) hash_state_destroy(fch->h2); + fch->h2 = hash_state_new(fch->hashfuncs[1], fch->m); + restart = check_for_collisions_h2(fch, buckets, sorted_indexes); + filled_count = 0; + if (!restart) + { + searching_iterations++; iteration_to_generate_h2 = 0; + //DEBUGP("searching_iterations: %u\n", searching_iterations); + } + else { + iteration_to_generate_h2++; + //DEBUGP("iteration_to_generate_h2: %u\n", iteration_to_generate_h2); + } + for(i = 0; (i < nbuckets) && !restart; i++) { + cmph_uint32 bucketsize = fch_buckets_get_size(buckets, sorted_indexes[i]); + if (bucketsize == 0) + { + restart = 0; // false + break; + } + else restart = 1; // true + for(z = 0; (z < (fch->m - filled_count)) && restart; z++) { + char * key = fch_buckets_get_key(buckets, sorted_indexes[i], INDEX); + cmph_uint32 keylen = fch_buckets_get_keylength(buckets, sorted_indexes[i], INDEX); + cmph_uint32 h2 = hash(fch->h2, key, keylen) % fch->m; + counter = 0; + restart = 0; // false + fch->g[sorted_indexes[i]] = (fch->m + random_table[filled_count + z] - h2) % fch->m; + //DEBUGP("g[%u]: %u\n", sorted_indexes[i], fch->g[sorted_indexes[i]]); + j = INDEX; + do { + cmph_uint32 index = 0; + key = fch_buckets_get_key(buckets, sorted_indexes[i], j); + keylen = fch_buckets_get_keylength(buckets, sorted_indexes[i], j); + h2 = hash(fch->h2, key, keylen) % fch->m; + index = (h2 + fch->g[sorted_indexes[i]]) % fch->m; + //DEBUGP("key:%s keylen:%u index: %u h2:%u bucketsize:%u\n", key, keylen, index, h2, bucketsize); + if (map_table[index] >= filled_count) { + cmph_uint32 y = map_table[index]; + cmph_uint32 ry = random_table[y]; + random_table[y] = random_table[filled_count]; + random_table[filled_count] = ry; + map_table[random_table[y]] = y; + map_table[random_table[filled_count]] = filled_count; + filled_count++; + counter ++; + } + else { + restart = 1; // true + filled_count = filled_count - counter; + counter = 0; + break; + } + j = (j + 1) % bucketsize; + } while(j % bucketsize != INDEX); + } + //getchar(); + } + } while(restart && (searching_iterations < 10) && (iteration_to_generate_h2 < 1000)); + free(map_table); + free(random_table); + return restart; +} + + + +cmph_t *fch_new(cmph_config_t *mph, double c) +{ + cmph_t *mphf = NULL; + fch_data_t *fchf = NULL; + cmph_uint32 iterations = 100; + cmph_uint8 restart_mapping = 0; + fch_buckets_t * buckets = NULL; + cmph_uint32 * sorted_indexes = NULL; + fch_config_data_t *fch = (fch_config_data_t *)mph->data; + fch->m = mph->key_source->nkeys; + //DEBUGP("m: %f\n", fch->m); + if (c <= 2) c = 2.6; // validating restrictions over parameter c. + fch->c = c; + //DEBUGP("c: %f\n", fch->c); + fch->h1 = NULL; + fch->h2 = NULL; + fch->g = NULL; + do + { + if (mph->verbosity) + { + fprintf(stderr, "Entering mapping step for mph creation of %u keys\n", fch->m); + } + if (buckets) fch_buckets_destroy(buckets); + buckets = mapping(mph); + if (mph->verbosity) + { + fprintf(stderr, "Starting ordering step\n"); + } + if (sorted_indexes) free (sorted_indexes); + sorted_indexes = ordering(buckets); + if (mph->verbosity) + { + fprintf(stderr, "Starting searching step.\n"); + } + restart_mapping = searching(fch, buckets, sorted_indexes); + iterations--; + + } while(restart_mapping && iterations > 0); + if (buckets) fch_buckets_destroy(buckets); + if (sorted_indexes) free (sorted_indexes); + if (iterations == 0) return NULL; + mphf = (cmph_t *)malloc(sizeof(cmph_t)); + mphf->algo = mph->algo; + fchf = (fch_data_t *)malloc(sizeof(fch_data_t)); + fchf->g = fch->g; + fch->g = NULL; //transfer memory ownership + fchf->h1 = fch->h1; + fch->h1 = NULL; //transfer memory ownership + fchf->h2 = fch->h2; + fch->h2 = NULL; //transfer memory ownership + fchf->p2 = fch->p2; + fchf->p1 = fch->p1; + fchf->b = fch->b; + fchf->c = fch->c; + fchf->m = fch->m; + mphf->data = fchf; + mphf->size = fch->m; + //DEBUGP("Successfully generated minimal perfect hash\n"); + if (mph->verbosity) + { + fprintf(stderr, "Successfully generated minimal perfect hash function\n"); + } + return mphf; +} + +int fch_dump(cmph_t *mphf, FILE *fd) +{ + char *buf = NULL; + cmph_uint32 buflen; + register size_t nbytes; + + fch_data_t *data = (fch_data_t *)mphf->data; + __cmph_dump(mphf, fd); + + hash_state_dump(data->h1, &buf, &buflen); + //DEBUGP("Dumping hash state with %u bytes to disk\n", buflen); + nbytes = fwrite(&buflen, sizeof(cmph_uint32), (size_t)1, fd); + nbytes = fwrite(buf, (size_t)buflen, (size_t)1, fd); + free(buf); + + hash_state_dump(data->h2, &buf, &buflen); + //DEBUGP("Dumping hash state with %u bytes to disk\n", buflen); + nbytes = fwrite(&buflen, sizeof(cmph_uint32), (size_t)1, fd); + nbytes = fwrite(buf, (size_t)buflen, (size_t)1, fd); + free(buf); + + nbytes = fwrite(&(data->m), sizeof(cmph_uint32), (size_t)1, fd); + nbytes = fwrite(&(data->c), sizeof(double), (size_t)1, fd); + nbytes = fwrite(&(data->b), sizeof(cmph_uint32), (size_t)1, fd); + nbytes = fwrite(&(data->p1), sizeof(double), (size_t)1, fd); + nbytes = fwrite(&(data->p2), sizeof(double), (size_t)1, fd); + nbytes = fwrite(data->g, sizeof(cmph_uint32)*(data->b), (size_t)1, fd); + #ifdef DEBUG + cmph_uint32 i; + fprintf(stderr, "G: "); + for (i = 0; i < data->b; ++i) fprintf(stderr, "%u ", data->g[i]); + fprintf(stderr, "\n"); + #endif + return 1; +} + +void fch_load(FILE *f, cmph_t *mphf) +{ + char *buf = NULL; + cmph_uint32 buflen; + register size_t nbytes; + fch_data_t *fch = (fch_data_t *)malloc(sizeof(fch_data_t)); + + //DEBUGP("Loading fch mphf\n"); + mphf->data = fch; + //DEBUGP("Reading h1\n"); + fch->h1 = NULL; + nbytes = fread(&buflen, sizeof(cmph_uint32), (size_t)1, f); + //DEBUGP("Hash state of h1 has %u bytes\n", buflen); + buf = (char *)malloc((size_t)buflen); + nbytes = fread(buf, (size_t)buflen, (size_t)1, f); + fch->h1 = hash_state_load(buf, buflen); + free(buf); + + //DEBUGP("Loading fch mphf\n"); + mphf->data = fch; + //DEBUGP("Reading h2\n"); + fch->h2 = NULL; + nbytes = fread(&buflen, sizeof(cmph_uint32), (size_t)1, f); + //DEBUGP("Hash state of h2 has %u bytes\n", buflen); + buf = (char *)malloc((size_t)buflen); + nbytes = fread(buf, (size_t)buflen, (size_t)1, f); + fch->h2 = hash_state_load(buf, buflen); + free(buf); + + + //DEBUGP("Reading m and n\n"); + nbytes = fread(&(fch->m), sizeof(cmph_uint32), (size_t)1, f); + nbytes = fread(&(fch->c), sizeof(double), (size_t)1, f); + nbytes = fread(&(fch->b), sizeof(cmph_uint32), (size_t)1, f); + nbytes = fread(&(fch->p1), sizeof(double), (size_t)1, f); + nbytes = fread(&(fch->p2), sizeof(double), (size_t)1, f); + + fch->g = (cmph_uint32 *)malloc(sizeof(cmph_uint32)*fch->b); + nbytes = fread(fch->g, fch->b*sizeof(cmph_uint32), (size_t)1, f); + #ifdef DEBUG + cmph_uint32 i; + fprintf(stderr, "G: "); + for (i = 0; i < fch->b; ++i) fprintf(stderr, "%u ", fch->g[i]); + fprintf(stderr, "\n"); + #endif + return; +} + +cmph_uint32 fch_search(cmph_t *mphf, const char *key, cmph_uint32 keylen) +{ + fch_data_t *fch = mphf->data; + cmph_uint32 h1 = hash(fch->h1, key, keylen) % fch->m; + cmph_uint32 h2 = hash(fch->h2, key, keylen) % fch->m; + h1 = mixh10h11h12 (fch->b, fch->p1, fch->p2, h1); + //DEBUGP("key: %s h1: %u h2: %u g[h1]: %u\n", key, h1, h2, fch->g[h1]); + return (h2 + fch->g[h1]) % fch->m; +} +void fch_destroy(cmph_t *mphf) +{ + fch_data_t *data = (fch_data_t *)mphf->data; + free(data->g); + hash_state_destroy(data->h1); + hash_state_destroy(data->h2); + free(data); + free(mphf); +} + +/** \fn void fch_pack(cmph_t *mphf, void *packed_mphf); + * \brief Support the ability to pack a perfect hash function into a preallocated contiguous memory space pointed by packed_mphf. + * \param mphf pointer to the resulting mphf + * \param packed_mphf pointer to the contiguous memory area used to store the resulting mphf. The size of packed_mphf must be at least cmph_packed_size() + */ +void fch_pack(cmph_t *mphf, void *packed_mphf) +{ + fch_data_t *data = (fch_data_t *)mphf->data; + cmph_uint8 * ptr = packed_mphf; + + // packing h1 type + CMPH_HASH h1_type = hash_get_type(data->h1); + *((cmph_uint32 *) ptr) = h1_type; + ptr += sizeof(cmph_uint32); + + // packing h1 + hash_state_pack(data->h1, ptr); + ptr += hash_state_packed_size(h1_type); + + // packing h2 type + CMPH_HASH h2_type = hash_get_type(data->h2); + *((cmph_uint32 *) ptr) = h2_type; + ptr += sizeof(cmph_uint32); + + // packing h2 + hash_state_pack(data->h2, ptr); + ptr += hash_state_packed_size(h2_type); + + // packing m + *((cmph_uint32 *) ptr) = data->m; + ptr += sizeof(data->m); + + // packing b + *((cmph_uint32 *) ptr) = data->b; + ptr += sizeof(data->b); + + // packing p1 + *((cmph_uint64 *)ptr) = (cmph_uint64)data->p1; + ptr += sizeof(data->p1); + + // packing p2 + *((cmph_uint64 *)ptr) = (cmph_uint64)data->p2; + ptr += sizeof(data->p2); + + // packing g + memcpy(ptr, data->g, sizeof(cmph_uint32)*(data->b)); +} + +/** \fn cmph_uint32 fch_packed_size(cmph_t *mphf); + * \brief Return the amount of space needed to pack mphf. + * \param mphf pointer to a mphf + * \return the size of the packed function or zero for failures + */ +cmph_uint32 fch_packed_size(cmph_t *mphf) +{ + fch_data_t *data = (fch_data_t *)mphf->data; + CMPH_HASH h1_type = hash_get_type(data->h1); + CMPH_HASH h2_type = hash_get_type(data->h2); + + return (cmph_uint32)(sizeof(CMPH_ALGO) + hash_state_packed_size(h1_type) + hash_state_packed_size(h2_type) + + 4*sizeof(cmph_uint32) + 2*sizeof(double) + sizeof(cmph_uint32)*(data->b)); +} + + +/** cmph_uint32 fch_search(void *packed_mphf, const char *key, cmph_uint32 keylen); + * \brief Use the packed mphf to do a search. + * \param packed_mphf pointer to the packed mphf + * \param key key to be hashed + * \param keylen key legth in bytes + * \return The mphf value + */ +cmph_uint32 fch_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen) +{ + register cmph_uint8 *h1_ptr = packed_mphf; + register CMPH_HASH h1_type = *((cmph_uint32 *)h1_ptr); + h1_ptr += 4; + + register cmph_uint8 *h2_ptr = h1_ptr + hash_state_packed_size(h1_type); + register CMPH_HASH h2_type = *((cmph_uint32 *)h2_ptr); + h2_ptr += 4; + + register cmph_uint32 *g_ptr = (cmph_uint32 *)(h2_ptr + hash_state_packed_size(h2_type)); + + register cmph_uint32 m = *g_ptr++; + + register cmph_uint32 b = *g_ptr++; + + register double p1 = (double)(*((cmph_uint64 *)g_ptr)); + g_ptr += 2; + + register double p2 = (double)(*((cmph_uint64 *)g_ptr)); + g_ptr += 2; + + register cmph_uint32 h1 = hash_packed(h1_ptr, h1_type, key, keylen) % m; + register cmph_uint32 h2 = hash_packed(h2_ptr, h2_type, key, keylen) % m; + + h1 = mixh10h11h12 (b, p1, p2, h1); + return (h2 + g_ptr[h1]) % m; +} + diff --git a/cmph/fch.h b/cmph/fch.h new file mode 100644 index 000000000..ec4f0f5b0 --- /dev/null +++ b/cmph/fch.h @@ -0,0 +1,48 @@ +#ifndef __CMPH_FCH_H__ +#define __CMPH_FCH_H__ + +#include "cmph.h" + +typedef struct __fch_data_t fch_data_t; +typedef struct __fch_config_data_t fch_config_data_t; + +/* Parameters calculation */ +cmph_uint32 fch_calc_b(double c, cmph_uint32 m); +double fch_calc_p1(cmph_uint32 m); +double fch_calc_p2(cmph_uint32 b); +cmph_uint32 mixh10h11h12(cmph_uint32 b, double p1, double p2, cmph_uint32 initial_index); + +fch_config_data_t *fch_config_new(); +void fch_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs); +void fch_config_destroy(cmph_config_t *mph); +cmph_t *fch_new(cmph_config_t *mph, double c); + +void fch_load(FILE *f, cmph_t *mphf); +int fch_dump(cmph_t *mphf, FILE *f); +void fch_destroy(cmph_t *mphf); +cmph_uint32 fch_search(cmph_t *mphf, const char *key, cmph_uint32 keylen); + +/** \fn void fch_pack(cmph_t *mphf, void *packed_mphf); + * \brief Support the ability to pack a perfect hash function into a preallocated contiguous memory space pointed by packed_mphf. + * \param mphf pointer to the resulting mphf + * \param packed_mphf pointer to the contiguous memory area used to store the resulting mphf. The size of packed_mphf must be at least cmph_packed_size() + */ +void fch_pack(cmph_t *mphf, void *packed_mphf); + +/** \fn cmph_uint32 fch_packed_size(cmph_t *mphf); + * \brief Return the amount of space needed to pack mphf. + * \param mphf pointer to a mphf + * \return the size of the packed function or zero for failures + */ +cmph_uint32 fch_packed_size(cmph_t *mphf); + +/** cmph_uint32 fch_search(void *packed_mphf, const char *key, cmph_uint32 keylen); + * \brief Use the packed mphf to do a search. + * \param packed_mphf pointer to the packed mphf + * \param key key to be hashed + * \param keylen key legth in bytes + * \return The mphf value + */ +cmph_uint32 fch_search_packed(void *packed_mphf, const char *key, cmph_uint32 keylen); + +#endif diff --git a/cmph/fch_buckets.c b/cmph/fch_buckets.c new file mode 100644 index 000000000..24b98e672 --- /dev/null +++ b/cmph/fch_buckets.c @@ -0,0 +1,214 @@ +#include "vqueue.h" +#include "fch_buckets.h" +#include +#include +#include +//#define DEBUG +#include "debug.h" + +typedef struct __fch_bucket_entry_t +{ + char * value; + cmph_uint32 length; +} fch_bucket_entry_t; + +typedef struct __fch_bucket_t +{ + fch_bucket_entry_t * entries; + cmph_uint32 capacity, size; +} fch_bucket_t; + + + +static void fch_bucket_new(fch_bucket_t *bucket) +{ + assert(bucket); + bucket->size = 0; + bucket->entries = NULL; + bucket->capacity = 0; +} + +static void fch_bucket_destroy(fch_bucket_t *bucket) +{ + cmph_uint32 i; + assert(bucket); + for (i = 0; i < bucket->size; i++) + { + free((bucket->entries + i)->value); + } + free(bucket->entries); +} + + +static void fch_bucket_reserve(fch_bucket_t *bucket, cmph_uint32 size) +{ + assert(bucket); + if (bucket->capacity < size) + { + cmph_uint32 new_capacity = bucket->capacity + 1; + DEBUGP("Increasing current capacity %u to %u\n", bucket->capacity, size); + while (new_capacity < size) + { + new_capacity *= 2; + } + bucket->entries = (fch_bucket_entry_t *)realloc(bucket->entries, sizeof(fch_bucket_entry_t)*new_capacity); + assert(bucket->entries); + bucket->capacity = new_capacity; + DEBUGP("Increased\n"); + } +} + +static void fch_bucket_insert(fch_bucket_t *bucket, char *val, cmph_uint32 val_length) +{ + assert(bucket); + fch_bucket_reserve(bucket, bucket->size + 1); + (bucket->entries + bucket->size)->value = val; + (bucket->entries + bucket->size)->length = val_length; + ++(bucket->size); +} + + +static cmph_uint8 fch_bucket_is_empty(fch_bucket_t *bucket) +{ + assert(bucket); + return (cmph_uint8)(bucket->size == 0); +} + +static cmph_uint32 fch_bucket_size(fch_bucket_t *bucket) +{ + assert(bucket); + return bucket->size; +} + +static char * fch_bucket_get_key(fch_bucket_t *bucket, cmph_uint32 index_key) +{ + assert(bucket); assert(index_key < bucket->size); + return (bucket->entries + index_key)->value; +} + +static cmph_uint32 fch_bucket_get_length(fch_bucket_t *bucket, cmph_uint32 index_key) +{ + assert(bucket); assert(index_key < bucket->size); + return (bucket->entries + index_key)->length; +} + +static void fch_bucket_print(fch_bucket_t * bucket, cmph_uint32 index) +{ + cmph_uint32 i; + assert(bucket); + fprintf(stderr, "Printing bucket %u ...\n", index); + for (i = 0; i < bucket->size; i++) + { + fprintf(stderr, " key: %s\n", (bucket->entries + i)->value); + } +} + +////////////////////////////////////////////////////////////////////////////////////// + +struct __fch_buckets_t +{ + fch_bucket_t * values; + cmph_uint32 nbuckets, max_size; + +}; + +fch_buckets_t * fch_buckets_new(cmph_uint32 nbuckets) +{ + cmph_uint32 i; + fch_buckets_t *buckets = (fch_buckets_t *)malloc(sizeof(fch_buckets_t)); + assert(buckets); + buckets->values = (fch_bucket_t *)calloc((size_t)nbuckets, sizeof(fch_bucket_t)); + for (i = 0; i < nbuckets; i++) fch_bucket_new(buckets->values + i); + assert(buckets->values); + buckets->nbuckets = nbuckets; + buckets->max_size = 0; + return buckets; +} + +cmph_uint8 fch_buckets_is_empty(fch_buckets_t * buckets, cmph_uint32 index) +{ + assert(index < buckets->nbuckets); + return fch_bucket_is_empty(buckets->values + index); +} + +void fch_buckets_insert(fch_buckets_t * buckets, cmph_uint32 index, char * key, cmph_uint32 length) +{ + assert(index < buckets->nbuckets); + fch_bucket_insert(buckets->values + index, key, length); + if (fch_bucket_size(buckets->values + index) > buckets->max_size) + { + buckets->max_size = fch_bucket_size(buckets->values + index); + } +} + +cmph_uint32 fch_buckets_get_size(fch_buckets_t * buckets, cmph_uint32 index) +{ + assert(index < buckets->nbuckets); + return fch_bucket_size(buckets->values + index); +} + + +char * fch_buckets_get_key(fch_buckets_t * buckets, cmph_uint32 index, cmph_uint32 index_key) +{ + assert(index < buckets->nbuckets); + return fch_bucket_get_key(buckets->values + index, index_key); +} + +cmph_uint32 fch_buckets_get_keylength(fch_buckets_t * buckets, cmph_uint32 index, cmph_uint32 index_key) +{ + assert(index < buckets->nbuckets); + return fch_bucket_get_length(buckets->values + index, index_key); +} + +cmph_uint32 fch_buckets_get_max_size(fch_buckets_t * buckets) +{ + return buckets->max_size; +} + +cmph_uint32 fch_buckets_get_nbuckets(fch_buckets_t * buckets) +{ + return buckets->nbuckets; +} + +cmph_uint32 * fch_buckets_get_indexes_sorted_by_size(fch_buckets_t * buckets) +{ + int i = 0; + cmph_uint32 sum = 0, value; + cmph_uint32 *nbuckets_size = (cmph_uint32 *) calloc((size_t)buckets->max_size + 1, sizeof(cmph_uint32)); + cmph_uint32 * sorted_indexes = (cmph_uint32 *) calloc((size_t)buckets->nbuckets, sizeof(cmph_uint32)); + + // collect how many buckets for each size. + for(i = 0; i < buckets->nbuckets; i++) nbuckets_size[fch_bucket_size(buckets->values + i)] ++; + + // calculating offset considering a decreasing order of buckets size. + value = nbuckets_size[buckets->max_size]; + nbuckets_size[buckets->max_size] = sum; + for(i = (int)buckets->max_size - 1; i >= 0; i--) + { + sum += value; + value = nbuckets_size[i]; + nbuckets_size[i] = sum; + + } + for(i = 0; i < buckets->nbuckets; i++) + { + sorted_indexes[nbuckets_size[fch_bucket_size(buckets->values + i)]] = (cmph_uint32)i; + nbuckets_size[fch_bucket_size(buckets->values + i)] ++; + } + free(nbuckets_size); + return sorted_indexes; +} + +void fch_buckets_print(fch_buckets_t * buckets) +{ + cmph_uint32 i; + for (i = 0; i < buckets->nbuckets; i++) fch_bucket_print(buckets->values + i, i); +} + +void fch_buckets_destroy(fch_buckets_t * buckets) +{ + cmph_uint32 i; + for (i = 0; i < buckets->nbuckets; i++) fch_bucket_destroy(buckets->values + i); + free(buckets->values); + free(buckets); +} diff --git a/cmph/fch_buckets.h b/cmph/fch_buckets.h new file mode 100644 index 000000000..2a1b8b2a6 --- /dev/null +++ b/cmph/fch_buckets.h @@ -0,0 +1,30 @@ +#ifndef __CMPH_FCH_BUCKETS_H__ +#define __CMPH_FCH_BUCKETS_H__ + +#include "cmph_types.h" +typedef struct __fch_buckets_t fch_buckets_t; + +fch_buckets_t * fch_buckets_new(cmph_uint32 nbuckets); + +cmph_uint8 fch_buckets_is_empty(fch_buckets_t * buckets, cmph_uint32 index); + +void fch_buckets_insert(fch_buckets_t * buckets, cmph_uint32 index, char * key, cmph_uint32 length); + +cmph_uint32 fch_buckets_get_size(fch_buckets_t * buckets, cmph_uint32 index); + +char * fch_buckets_get_key(fch_buckets_t * buckets, cmph_uint32 index, cmph_uint32 index_key); + +cmph_uint32 fch_buckets_get_keylength(fch_buckets_t * buckets, cmph_uint32 index, cmph_uint32 index_key); + +// returns the size of biggest bucket. +cmph_uint32 fch_buckets_get_max_size(fch_buckets_t * buckets); + +// returns the number of buckets. +cmph_uint32 fch_buckets_get_nbuckets(fch_buckets_t * buckets); + +cmph_uint32 * fch_buckets_get_indexes_sorted_by_size(fch_buckets_t * buckets); + +void fch_buckets_print(fch_buckets_t * buckets); + +void fch_buckets_destroy(fch_buckets_t * buckets); +#endif diff --git a/cmph/fch_structs.h b/cmph/fch_structs.h new file mode 100755 index 000000000..fcd1555ea --- /dev/null +++ b/cmph/fch_structs.h @@ -0,0 +1,30 @@ +#ifndef __CMPH_FCH_STRUCTS_H__ +#define __CMPH_FCH_STRUCTS_H__ + +#include "hash_state.h" + +struct __fch_data_t +{ + cmph_uint32 m; // words count + double c; // constant c + cmph_uint32 b; // parameter b = ceil(c*m/(log(m)/log(2) + 1)). Don't need to be stored + double p1; // constant p1 = ceil(0.6*m). Don't need to be stored + double p2; // constant p2 = ceil(0.3*b). Don't need to be stored + cmph_uint32 *g; // g function. + hash_state_t *h1; // h10 function. + hash_state_t *h2; // h20 function. +}; + +struct __fch_config_data_t +{ + CMPH_HASH hashfuncs[2]; + cmph_uint32 m; // words count + double c; // constant c + cmph_uint32 b; // parameter b = ceil(c*m/(log(m)/log(2) + 1)). Don't need to be stored + double p1; // constant p1 = ceil(0.6*m). Don't need to be stored + double p2; // constant p2 = ceil(0.3*b). Don't need to be stored + cmph_uint32 *g; // g function. + hash_state_t *h1; // h10 function. + hash_state_t *h2; // h20 function. +}; +#endif diff --git a/cmph/fnv_hash.c b/cmph/fnv_hash.c new file mode 100644 index 000000000..aeaca8ff4 --- /dev/null +++ b/cmph/fnv_hash.c @@ -0,0 +1,53 @@ +#include "fnv_hash.h" +#include + +fnv_state_t *fnv_state_new() +{ + fnv_state_t *state = (fnv_state_t *)malloc(sizeof(fnv_state_t)); + state->hashfunc = CMPH_HASH_FNV; + return state; +} + +void fnv_state_destroy(fnv_state_t *state) +{ + free(state); +} + +cmph_uint32 fnv_hash(fnv_state_t *state, const char *k, cmph_uint32 keylen) +{ + const unsigned char *bp = (const unsigned char *)k; + const unsigned char *be = bp + keylen; + static unsigned int hval = 0; + + while (bp < be) + { + + //hval *= 0x01000193; good for non-gcc compiler + hval += (hval << 1) + (hval << 4) + (hval << 7) + (hval << 8) + (hval << 24); //good for gcc + + hval ^= *bp++; + } + return hval; +} + + +void fnv_state_dump(fnv_state_t *state, char **buf, cmph_uint32 *buflen) +{ + *buf = NULL; + *buflen = 0; + return; +} + +fnv_state_t * fnv_state_copy(fnv_state_t *src_state) +{ + fnv_state_t *dest_state = (fnv_state_t *)malloc(sizeof(fnv_state_t)); + dest_state->hashfunc = src_state->hashfunc; + return dest_state; +} + +fnv_state_t *fnv_state_load(const char *buf, cmph_uint32 buflen) +{ + fnv_state_t *state = (fnv_state_t *)malloc(sizeof(fnv_state_t)); + state->hashfunc = CMPH_HASH_FNV; + return state; +} diff --git a/cmph/fnv_hash.h b/cmph/fnv_hash.h new file mode 100644 index 000000000..7f5794657 --- /dev/null +++ b/cmph/fnv_hash.h @@ -0,0 +1,18 @@ +#ifndef __FNV_HASH_H__ +#define __FNV_HASH_H__ + +#include "hash.h" + +typedef struct __fnv_state_t +{ + CMPH_HASH hashfunc; +} fnv_state_t; + +fnv_state_t *fnv_state_new(); +cmph_uint32 fnv_hash(fnv_state_t *state, const char *k, cmph_uint32 keylen); +void fnv_state_dump(fnv_state_t *state, char **buf, cmph_uint32 *buflen); +fnv_state_t *fnv_state_copy(fnv_state_t *src_state); +fnv_state_t *fnv_state_load(const char *buf, cmph_uint32 buflen); +void fnv_state_destroy(fnv_state_t *state); + +#endif diff --git a/cmph/graph.c b/cmph/graph.c new file mode 100644 index 000000000..c29fd8b9c --- /dev/null +++ b/cmph/graph.c @@ -0,0 +1,338 @@ +#include "graph.h" + +#include +#include +#include +#include +#include +#include "vstack.h" +#include "bitbool.h" + +//#define DEBUG +#include "debug.h" + +/* static const cmph_uint8 bitmask[8] = { 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 }; */ +/* #define GETBIT(array, i) (array[(i) / 8] & bitmask[(i) % 8]) */ +/* #define SETBIT(array, i) (array[(i) / 8] |= bitmask[(i) % 8]) */ +/* #define UNSETBIT(array, i) (array[(i) / 8] &= (~(bitmask[(i) % 8]))) */ + +#define abs_edge(e, i) (e % g->nedges + i * g->nedges) + +struct __graph_t +{ + cmph_uint32 nnodes; + cmph_uint32 nedges; + cmph_uint32 *edges; + cmph_uint32 *first; + cmph_uint32 *next; + cmph_uint8 *critical_nodes; /* included -- Fabiano*/ + cmph_uint32 ncritical_nodes; /* included -- Fabiano*/ + cmph_uint32 cedges; + int shrinking; +}; + +static cmph_uint32 EMPTY = UINT_MAX; + +graph_t *graph_new(cmph_uint32 nnodes, cmph_uint32 nedges) +{ + graph_t *graph = (graph_t *)malloc(sizeof(graph_t)); + if (!graph) return NULL; + + graph->edges = (cmph_uint32 *)malloc(sizeof(cmph_uint32) * 2 * nedges); + graph->next = (cmph_uint32 *)malloc(sizeof(cmph_uint32) * 2 * nedges); + graph->first = (cmph_uint32 *)malloc(sizeof(cmph_uint32) * nnodes); + graph->critical_nodes = NULL; /* included -- Fabiano*/ + graph->ncritical_nodes = 0; /* included -- Fabiano*/ + graph->nnodes = nnodes; + graph->nedges = nedges; + + graph_clear_edges(graph); + return graph; +} + + +void graph_destroy(graph_t *graph) +{ + DEBUGP("Destroying graph\n"); + free(graph->edges); + free(graph->first); + free(graph->next); + free(graph->critical_nodes); /* included -- Fabiano*/ + free(graph); + return; +} + +void graph_print(graph_t *g) +{ + cmph_uint32 i, e; + for (i = 0; i < g->nnodes; ++i) + { + DEBUGP("Printing edges connected to %u\n", i); + e = g->first[i]; + if (e != EMPTY) + { + printf("%u -> %u\n", g->edges[abs_edge(e, 0)], g->edges[abs_edge(e, 1)]); + while ((e = g->next[e]) != EMPTY) + { + printf("%u -> %u\n", g->edges[abs_edge(e, 0)], g->edges[abs_edge(e, 1)]); + } + } + + } + return; +} + +void graph_add_edge(graph_t *g, cmph_uint32 v1, cmph_uint32 v2) +{ + cmph_uint32 e = g->cedges; + + assert(v1 < g->nnodes); + assert(v2 < g->nnodes); + assert(e < g->nedges); + assert(!g->shrinking); + + g->next[e] = g->first[v1]; + g->first[v1] = e; + g->edges[e] = v2; + + g->next[e + g->nedges] = g->first[v2]; + g->first[v2] = e + g->nedges; + g->edges[e + g->nedges] = v1; + + ++(g->cedges); +} + +static int check_edge(graph_t *g, cmph_uint32 e, cmph_uint32 v1, cmph_uint32 v2) +{ + DEBUGP("Checking edge %u %u looking for %u %u\n", g->edges[abs_edge(e, 0)], g->edges[abs_edge(e, 1)], v1, v2); + if (g->edges[abs_edge(e, 0)] == v1 && g->edges[abs_edge(e, 1)] == v2) return 1; + if (g->edges[abs_edge(e, 0)] == v2 && g->edges[abs_edge(e, 1)] == v1) return 1; + return 0; +} + +cmph_uint32 graph_edge_id(graph_t *g, cmph_uint32 v1, cmph_uint32 v2) +{ + cmph_uint32 e; + e = g->first[v1]; + assert(e != EMPTY); + if (check_edge(g, e, v1, v2)) return abs_edge(e, 0); + do + { + e = g->next[e]; + assert(e != EMPTY); + } + while (!check_edge(g, e, v1, v2)); + return abs_edge(e, 0); +} +static void del_edge_point(graph_t *g, cmph_uint32 v1, cmph_uint32 v2) +{ + cmph_uint32 e, prev; + + DEBUGP("Deleting edge point %u %u\n", v1, v2); + e = g->first[v1]; + if (check_edge(g, e, v1, v2)) + { + g->first[v1] = g->next[e]; + //g->edges[e] = EMPTY; + DEBUGP("Deleted\n"); + return; + } + DEBUGP("Checking linked list\n"); + do + { + prev = e; + e = g->next[e]; + assert(e != EMPTY); + } + while (!check_edge(g, e, v1, v2)); + + g->next[prev] = g->next[e]; + //g->edges[e] = EMPTY; + DEBUGP("Deleted\n"); +} + + +void graph_del_edge(graph_t *g, cmph_uint32 v1, cmph_uint32 v2) +{ + g->shrinking = 1; + del_edge_point(g, v1, v2); + del_edge_point(g, v2, v1); +} + +void graph_clear_edges(graph_t *g) +{ + cmph_uint32 i; + for (i = 0; i < g->nnodes; ++i) g->first[i] = EMPTY; + for (i = 0; i < g->nedges*2; ++i) + { + g->edges[i] = EMPTY; + g->next[i] = EMPTY; + } + g->cedges = 0; + g->shrinking = 0; +} + +static cmph_uint8 find_degree1_edge(graph_t *g, cmph_uint32 v, cmph_uint8 *deleted, cmph_uint32 *e) +{ + cmph_uint32 edge = g->first[v]; + cmph_uint8 found = 0; + DEBUGP("Checking degree of vertex %u\n", v); + if (edge == EMPTY) return 0; + else if (!(GETBIT(deleted, abs_edge(edge, 0)))) + { + found = 1; + *e = edge; + } + while(1) + { + edge = g->next[edge]; + if (edge == EMPTY) break; + if (GETBIT(deleted, abs_edge(edge, 0))) continue; + if (found) return 0; + DEBUGP("Found first edge\n"); + *e = edge; + found = 1; + } + return found; +} + +static void cyclic_del_edge(graph_t *g, cmph_uint32 v, cmph_uint8 *deleted) +{ + + cmph_uint32 e = 0; + cmph_uint8 degree1; + cmph_uint32 v1 = v; + cmph_uint32 v2 = 0; + + degree1 = find_degree1_edge(g, v1, deleted, &e); + if (!degree1) return; + while(1) + { + DEBUGP("Deleting edge %u (%u->%u)\n", e, g->edges[abs_edge(e, 0)], g->edges[abs_edge(e, 1)]); + SETBIT(deleted, abs_edge(e, 0)); + + v2 = g->edges[abs_edge(e, 0)]; + if (v2 == v1) v2 = g->edges[abs_edge(e, 1)]; + + DEBUGP("Checking if second endpoint %u has degree 1\n", v2); + degree1 = find_degree1_edge(g, v2, deleted, &e); + if (degree1) + { + DEBUGP("Inspecting vertex %u\n", v2); + v1 = v2; + } + else break; + } +} + +int graph_is_cyclic(graph_t *g) +{ + cmph_uint32 i; + cmph_uint32 v; + cmph_uint8 *deleted = (cmph_uint8 *)malloc((g->nedges*sizeof(cmph_uint8))/8 + 1); + size_t deleted_len = g->nedges/8 + 1; + memset(deleted, 0, deleted_len); + + DEBUGP("Looking for cycles in graph with %u vertices and %u edges\n", g->nnodes, g->nedges); + for (v = 0; v < g->nnodes; ++v) + { + cyclic_del_edge(g, v, deleted); + } + for (i = 0; i < g->nedges; ++i) + { + if (!(GETBIT(deleted, i))) + { + DEBUGP("Edge %u %u->%u was not deleted\n", i, g->edges[i], g->edges[i + g->nedges]); + free(deleted); + return 1; + } + } + free(deleted); + return 0; +} + +cmph_uint8 graph_node_is_critical(graph_t * g, cmph_uint32 v) /* included -- Fabiano */ +{ + return (cmph_uint8)GETBIT(g->critical_nodes,v); +} + +void graph_obtain_critical_nodes(graph_t *g) /* included -- Fabiano*/ +{ + cmph_uint32 i; + cmph_uint32 v; + cmph_uint8 *deleted = (cmph_uint8 *)malloc((g->nedges*sizeof(cmph_uint8))/8+1); + size_t deleted_len = g->nedges/8 + 1; + memset(deleted, 0, deleted_len); + free(g->critical_nodes); + g->critical_nodes = (cmph_uint8 *)malloc((g->nnodes*sizeof(cmph_uint8))/8 + 1); + g->ncritical_nodes = 0; + memset(g->critical_nodes, 0, (g->nnodes*sizeof(cmph_uint8))/8 + 1); + DEBUGP("Looking for the 2-core in graph with %u vertices and %u edges\n", g->nnodes, g->nedges); + for (v = 0; v < g->nnodes; ++v) + { + cyclic_del_edge(g, v, deleted); + } + + for (i = 0; i < g->nedges; ++i) + { + if (!(GETBIT(deleted,i))) + { + DEBUGP("Edge %u %u->%u belongs to the 2-core\n", i, g->edges[i], g->edges[i + g->nedges]); + if(!(GETBIT(g->critical_nodes,g->edges[i]))) + { + g->ncritical_nodes ++; + SETBIT(g->critical_nodes,g->edges[i]); + } + if(!(GETBIT(g->critical_nodes,g->edges[i + g->nedges]))) + { + g->ncritical_nodes ++; + SETBIT(g->critical_nodes,g->edges[i + g->nedges]); + } + } + } + free(deleted); +} + +cmph_uint8 graph_contains_edge(graph_t *g, cmph_uint32 v1, cmph_uint32 v2) /* included -- Fabiano*/ +{ + cmph_uint32 e; + e = g->first[v1]; + if(e == EMPTY) return 0; + if (check_edge(g, e, v1, v2)) return 1; + do + { + e = g->next[e]; + if(e == EMPTY) return 0; + } + while (!check_edge(g, e, v1, v2)); + return 1; +} + +cmph_uint32 graph_vertex_id(graph_t *g, cmph_uint32 e, cmph_uint32 id) /* included -- Fabiano*/ +{ + return (g->edges[e + id*g->nedges]); +} + +cmph_uint32 graph_ncritical_nodes(graph_t *g) /* included -- Fabiano*/ +{ + return g->ncritical_nodes; +} + +graph_iterator_t graph_neighbors_it(graph_t *g, cmph_uint32 v) +{ + graph_iterator_t it; + it.vertex = v; + it.edge = g->first[v]; + return it; +} +cmph_uint32 graph_next_neighbor(graph_t *g, graph_iterator_t* it) +{ + cmph_uint32 ret; + if(it->edge == EMPTY) return GRAPH_NO_NEIGHBOR; + if (g->edges[it->edge] == it->vertex) ret = g->edges[it->edge + g->nedges]; + else ret = g->edges[it->edge]; + it->edge = g->next[it->edge]; + return ret; +} + + diff --git a/cmph/graph.h b/cmph/graph.h new file mode 100644 index 000000000..e1b5de6f6 --- /dev/null +++ b/cmph/graph.h @@ -0,0 +1,40 @@ +#ifndef _CMPH_GRAPH_H__ +#define _CMPH_GRAPH_H__ + +#include +#include "cmph_types.h" + +#define GRAPH_NO_NEIGHBOR UINT_MAX + +typedef struct __graph_t graph_t; +typedef struct __graph_iterator_t graph_iterator_t; +struct __graph_iterator_t +{ + cmph_uint32 vertex; + cmph_uint32 edge; +}; + + + +graph_t *graph_new(cmph_uint32 nnodes, cmph_uint32 nedges); +void graph_destroy(graph_t *graph); + +void graph_add_edge(graph_t *g, cmph_uint32 v1, cmph_uint32 v2); +void graph_del_edge(graph_t *g, cmph_uint32 v1, cmph_uint32 v2); +void graph_clear_edges(graph_t *g); +cmph_uint32 graph_edge_id(graph_t *g, cmph_uint32 v1, cmph_uint32 v2); +cmph_uint8 graph_contains_edge(graph_t *g, cmph_uint32 v1, cmph_uint32 v2); + +graph_iterator_t graph_neighbors_it(graph_t *g, cmph_uint32 v); +cmph_uint32 graph_next_neighbor(graph_t *g, graph_iterator_t* it); + +void graph_obtain_critical_nodes(graph_t *g); /* included -- Fabiano*/ +cmph_uint8 graph_node_is_critical(graph_t * g, cmph_uint32 v); /* included -- Fabiano */ +cmph_uint32 graph_ncritical_nodes(graph_t *g); /* included -- Fabiano*/ +cmph_uint32 graph_vertex_id(graph_t *g, cmph_uint32 e, cmph_uint32 id); /* included -- Fabiano*/ + +int graph_is_cyclic(graph_t *g); + +void graph_print(graph_t *); + +#endif diff --git a/cmph/hash.c b/cmph/hash.c new file mode 100644 index 000000000..be86d6e75 --- /dev/null +++ b/cmph/hash.c @@ -0,0 +1,216 @@ +#include "hash_state.h" +#include +#include +#include +#include + +//#define DEBUG +#include "debug.h" + +const char *cmph_hash_names[] = { "jenkins", NULL }; + +hash_state_t *hash_state_new(CMPH_HASH hashfunc, cmph_uint32 hashsize) +{ + hash_state_t *state = NULL; + switch (hashfunc) + { + case CMPH_HASH_JENKINS: + DEBUGP("Jenkins function - %u\n", hashsize); + state = (hash_state_t *)jenkins_state_new(hashsize); + DEBUGP("Jenkins function created\n"); + break; + default: + assert(0); + } + state->hashfunc = hashfunc; + return state; +} +cmph_uint32 hash(hash_state_t *state, const char *key, cmph_uint32 keylen) +{ + switch (state->hashfunc) + { + case CMPH_HASH_JENKINS: + return jenkins_hash((jenkins_state_t *)state, key, keylen); + default: + assert(0); + } + assert(0); + return 0; +} + +void hash_vector(hash_state_t *state, const char *key, cmph_uint32 keylen, cmph_uint32 * hashes) +{ + switch (state->hashfunc) + { + case CMPH_HASH_JENKINS: + jenkins_hash_vector_((jenkins_state_t *)state, key, keylen, hashes); + break; + default: + assert(0); + } +} + + +void hash_state_dump(hash_state_t *state, char **buf, cmph_uint32 *buflen) +{ + char *algobuf; + size_t len; + switch (state->hashfunc) + { + case CMPH_HASH_JENKINS: + jenkins_state_dump((jenkins_state_t *)state, &algobuf, buflen); + if (*buflen == UINT_MAX) return; + break; + default: + assert(0); + } + *buf = (char *)malloc(strlen(cmph_hash_names[state->hashfunc]) + 1 + *buflen); + memcpy(*buf, cmph_hash_names[state->hashfunc], strlen(cmph_hash_names[state->hashfunc]) + 1); + DEBUGP("Algobuf is %u\n", *(cmph_uint32 *)algobuf); + len = *buflen; + memcpy(*buf + strlen(cmph_hash_names[state->hashfunc]) + 1, algobuf, len); + *buflen = (cmph_uint32)strlen(cmph_hash_names[state->hashfunc]) + 1 + *buflen; + free(algobuf); + return; +} + +hash_state_t * hash_state_copy(hash_state_t *src_state) +{ + hash_state_t *dest_state = NULL; + switch (src_state->hashfunc) + { + case CMPH_HASH_JENKINS: + dest_state = (hash_state_t *)jenkins_state_copy((jenkins_state_t *)src_state); + break; + default: + assert(0); + } + dest_state->hashfunc = src_state->hashfunc; + return dest_state; +} + +hash_state_t *hash_state_load(const char *buf, cmph_uint32 buflen) +{ + cmph_uint32 i; + cmph_uint32 offset; + CMPH_HASH hashfunc = CMPH_HASH_COUNT; + for (i = 0; i < CMPH_HASH_COUNT; ++i) + { + if (strcmp(buf, cmph_hash_names[i]) == 0) + { + hashfunc = i; + break; + } + } + if (hashfunc == CMPH_HASH_COUNT) return NULL; + offset = (cmph_uint32)strlen(cmph_hash_names[hashfunc]) + 1; + switch (hashfunc) + { + case CMPH_HASH_JENKINS: + return (hash_state_t *)jenkins_state_load(buf + offset, buflen - offset); + default: + return NULL; + } + return NULL; +} +void hash_state_destroy(hash_state_t *state) +{ + switch (state->hashfunc) + { + case CMPH_HASH_JENKINS: + jenkins_state_destroy((jenkins_state_t *)state); + break; + default: + assert(0); + } + return; +} + +/** \fn void hash_state_pack(hash_state_t *state, void *hash_packed) + * \brief Support the ability to pack a hash function into a preallocated contiguous memory space pointed by hash_packed. + * \param state points to the hash function + * \param hash_packed pointer to the contiguous memory area used to store the hash function. The size of hash_packed must be at least hash_state_packed_size() + * + * Support the ability to pack a hash function into a preallocated contiguous memory space pointed by hash_packed. + * However, the hash function type must be packed outside. + */ +void hash_state_pack(hash_state_t *state, void *hash_packed) +{ + switch (state->hashfunc) + { + case CMPH_HASH_JENKINS: + // pack the jenkins hash function + jenkins_state_pack((jenkins_state_t *)state, hash_packed); + break; + default: + assert(0); + } + return; +} + +/** \fn cmph_uint32 hash_state_packed_size(CMPH_HASH hashfunc) + * \brief Return the amount of space needed to pack a hash function. + * \param hashfunc function type + * \return the size of the packed function or zero for failures + */ +cmph_uint32 hash_state_packed_size(CMPH_HASH hashfunc) +{ + cmph_uint32 size = 0; + switch (hashfunc) + { + case CMPH_HASH_JENKINS: + size += jenkins_state_packed_size(); + break; + default: + assert(0); + } + return size; +} + +/** \fn cmph_uint32 hash_packed(void *hash_packed, CMPH_HASH hashfunc, const char *k, cmph_uint32 keylen) + * \param hash_packed is a pointer to a contiguous memory area + * \param hashfunc is the type of the hash function packed in hash_packed + * \param key is a pointer to a key + * \param keylen is the key length + * \return an integer that represents a hash value of 32 bits. + */ +cmph_uint32 hash_packed(void *hash_packed, CMPH_HASH hashfunc, const char *k, cmph_uint32 keylen) +{ + switch (hashfunc) + { + case CMPH_HASH_JENKINS: + return jenkins_hash_packed(hash_packed, k, keylen); + default: + assert(0); + } + assert(0); + return 0; +} + +/** \fn hash_vector_packed(void *hash_packed, CMPH_HASH hashfunc, const char *k, cmph_uint32 keylen, cmph_uint32 * hashes) + * \param hash_packed is a pointer to a contiguous memory area + * \param key is a pointer to a key + * \param keylen is the key length + * \param hashes is a pointer to a memory large enough to fit three 32-bit integers. + */ +void hash_vector_packed(void *hash_packed, CMPH_HASH hashfunc, const char *k, cmph_uint32 keylen, cmph_uint32 * hashes) +{ + switch (hashfunc) + { + case CMPH_HASH_JENKINS: + jenkins_hash_vector_packed(hash_packed, k, keylen, hashes); + break; + default: + assert(0); + } +} + + +/** \fn CMPH_HASH hash_get_type(hash_state_t *state); + * \param state is a pointer to a hash_state_t structure + * \return the hash function type pointed by state + */ +CMPH_HASH hash_get_type(hash_state_t *state) +{ + return state->hashfunc; +} diff --git a/cmph/hash.h b/cmph/hash.h new file mode 100644 index 000000000..0ec4ce1c7 --- /dev/null +++ b/cmph/hash.h @@ -0,0 +1,76 @@ +#ifndef __CMPH_HASH_H__ +#define __CMPH_HASH_H__ + +#include "cmph_types.h" + +typedef union __hash_state_t hash_state_t; + +hash_state_t *hash_state_new(CMPH_HASH, cmph_uint32 hashsize); + +/** \fn cmph_uint32 hash(hash_state_t *state, const char *key, cmph_uint32 keylen); + * \param state is a pointer to a hash_state_t structure + * \param key is a pointer to a key + * \param keylen is the key length + * \return an integer that represents a hash value of 32 bits. + */ +cmph_uint32 hash(hash_state_t *state, const char *key, cmph_uint32 keylen); + +/** \fn void hash_vector(hash_state_t *state, const char *key, cmph_uint32 keylen, cmph_uint32 * hashes); + * \param state is a pointer to a hash_state_t structure + * \param key is a pointer to a key + * \param keylen is the key length + * \param hashes is a pointer to a memory large enough to fit three 32-bit integers. + */ +void hash_vector(hash_state_t *state, const char *key, cmph_uint32 keylen, cmph_uint32 * hashes); + +void hash_state_dump(hash_state_t *state, char **buf, cmph_uint32 *buflen); + +hash_state_t * hash_state_copy(hash_state_t *src_state); + +hash_state_t *hash_state_load(const char *buf, cmph_uint32 buflen); + +void hash_state_destroy(hash_state_t *state); + +/** \fn void hash_state_pack(hash_state_t *state, void *hash_packed); + * \brief Support the ability to pack a hash function into a preallocated contiguous memory space pointed by hash_packed. + * \param state points to the hash function + * \param hash_packed pointer to the contiguous memory area used to store the hash function. The size of hash_packed must be at least hash_state_packed_size() + * + * Support the ability to pack a hash function into a preallocated contiguous memory space pointed by hash_packed. + * However, the hash function type must be packed outside. + */ +void hash_state_pack(hash_state_t *state, void *hash_packed); + +/** \fn cmph_uint32 hash_packed(void *hash_packed, CMPH_HASH hashfunc, const char *k, cmph_uint32 keylen); + * \param hash_packed is a pointer to a contiguous memory area + * \param hashfunc is the type of the hash function packed in hash_packed + * \param key is a pointer to a key + * \param keylen is the key length + * \return an integer that represents a hash value of 32 bits. + */ +cmph_uint32 hash_packed(void *hash_packed, CMPH_HASH hashfunc, const char *k, cmph_uint32 keylen); + +/** \fn cmph_uint32 hash_state_packed_size(CMPH_HASH hashfunc) + * \brief Return the amount of space needed to pack a hash function. + * \param hashfunc function type + * \return the size of the packed function or zero for failures + */ +cmph_uint32 hash_state_packed_size(CMPH_HASH hashfunc); + + +/** \fn hash_vector_packed(void *hash_packed, CMPH_HASH hashfunc, const char *k, cmph_uint32 keylen, cmph_uint32 * hashes); + * \param hash_packed is a pointer to a contiguous memory area + * \param key is a pointer to a key + * \param keylen is the key length + * \param hashes is a pointer to a memory large enough to fit three 32-bit integers. + */ +void hash_vector_packed(void *hash_packed, CMPH_HASH hashfunc, const char *k, cmph_uint32 keylen, cmph_uint32 * hashes); + + +/** \fn CMPH_HASH hash_get_type(hash_state_t *state); + * \param state is a pointer to a hash_state_t structure + * \return the hash function type pointed by state + */ +CMPH_HASH hash_get_type(hash_state_t *state); + +#endif diff --git a/cmph/hash_state.h b/cmph/hash_state.h new file mode 100644 index 000000000..1b567dca1 --- /dev/null +++ b/cmph/hash_state.h @@ -0,0 +1,12 @@ +#ifndef __HASH_STATE_H__ +#define __HASH_STATE_H__ + +#include "hash.h" +#include "jenkins_hash.h" +union __hash_state_t +{ + CMPH_HASH hashfunc; + jenkins_state_t jenkins; +}; + +#endif diff --git a/cmph/hashtree.c b/cmph/hashtree.c new file mode 100644 index 000000000..2f3567e55 --- /dev/null +++ b/cmph/hashtree.c @@ -0,0 +1,289 @@ +#include "graph.h" +#include "hashtree.h" +#include "cmph_structs.h" +#include "hastree_structs.h" +#include "hash.h" +#include "bitbool.h" + +#include +#include +#include +#include +#include + +//#define DEBUG +#include "debug.h" + +hashtree_config_data_t *hashtree_config_new() +{ + hashtree_config_data_t *hashtree; + hashtree = (hashtree_config_data_t *)malloc(sizeof(hashtree_config_data_t)); + if (!hashtree) return NULL; + memset(hashtree, 0, sizeof(hashtree_config_data_t)); + hashtree->hashfuncs[0] = CMPH_HASH_JENKINS; + hashtree->hashfuncs[1] = CMPH_HASH_JENKINS; + hashtree->hashfuncs[2] = CMPH_HASH_JENKINS; + hashtree->memory = 32 * 1024 * 1024; + return hashtree; +} +void hashtree_config_destroy(cmph_config_t *mph) +{ + hashtree_config_data_t *data = (hashtree_config_data_t *)mph->data; + DEBUGP("Destroying algorithm dependent data\n"); + free(data); +} + +void hashtree_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs) +{ + hashtree_config_data_t *hashtree = (hashtree_config_data_t *)mph->data; + CMPH_HASH *hashptr = hashfuncs; + cmph_uint32 i = 0; + while(*hashptr != CMPH_HASH_COUNT) + { + if (i >= 3) break; //hashtree only uses three hash functions + hashtree->hashfuncs[i] = *hashptr; + ++i, ++hashptr; + } +} + +cmph_t *hashtree_new(cmph_config_t *mph, double c) +{ + cmph_t *mphf = NULL; + hashtree_data_t *hashtreef = NULL; + + cmph_uint32 i; + cmph_uint32 iterations = 20; + cmph_uint8 *visited = NULL; + hashtree_config_data_t *hashtree = (hashtree_config_data_t *)mph->data; + hashtree->m = mph->key_source->nkeys; + hashtree->n = ceil(c * mph->key_source->nkeys); + DEBUGP("m (edges): %u n (vertices): %u c: %f\n", hashtree->m, hashtree->n, c); + hashtree->graph = graph_new(hashtree->n, hashtree->m); + DEBUGP("Created graph\n"); + + hashtree->hashes = (hash_state_t **)malloc(sizeof(hash_state_t *)*3); + for(i = 0; i < 3; ++i) hashtree->hashes[i] = NULL; + //Mapping step + if (mph->verbosity) + { + fprintf(stderr, "Entering mapping step for mph creation of %u keys with graph sized %u\n", hashtree->m, hashtree->n); + } + while(1) + { + int ok; + hashtree->hashes[0] = hash_state_new(hashtree->hashfuncs[0], hashtree->n); + hashtree->hashes[1] = hash_state_new(hashtree->hashfuncs[1], hashtree->n); + ok = hashtree_gen_edges(mph); + if (!ok) + { + --iterations; + hash_state_destroy(hashtree->hashes[0]); + hashtree->hashes[0] = NULL; + hash_state_destroy(hashtree->hashes[1]); + hashtree->hashes[1] = NULL; + DEBUGP("%u iterations remaining\n", iterations); + if (mph->verbosity) + { + fprintf(stderr, "Acyclic graph creation failure - %u iterations remaining\n", iterations); + } + if (iterations == 0) break; + } + else break; + } + if (iterations == 0) + { + graph_destroy(hashtree->graph); + return NULL; + } + + //Assignment step + if (mph->verbosity) + { + fprintf(stderr, "Starting assignment step\n"); + } + DEBUGP("Assignment step\n"); + visited = (char *)malloc(hashtree->n/8 + 1); + memset(visited, 0, hashtree->n/8 + 1); + free(hashtree->g); + hashtree->g = (cmph_uint32 *)malloc(hashtree->n * sizeof(cmph_uint32)); + assert(hashtree->g); + for (i = 0; i < hashtree->n; ++i) + { + if (!GETBIT(visited,i)) + { + hashtree->g[i] = 0; + hashtree_traverse(hashtree, visited, i); + } + } + graph_destroy(hashtree->graph); + free(visited); + hashtree->graph = NULL; + + mphf = (cmph_t *)malloc(sizeof(cmph_t)); + mphf->algo = mph->algo; + hashtreef = (hashtree_data_t *)malloc(sizeof(hashtree_data_t)); + hashtreef->g = hashtree->g; + hashtree->g = NULL; //transfer memory ownership + hashtreef->hashes = hashtree->hashes; + hashtree->hashes = NULL; //transfer memory ownership + hashtreef->n = hashtree->n; + hashtreef->m = hashtree->m; + mphf->data = hashtreef; + mphf->size = hashtree->m; + DEBUGP("Successfully generated minimal perfect hash\n"); + if (mph->verbosity) + { + fprintf(stderr, "Successfully generated minimal perfect hash function\n"); + } + return mphf; +} + +static void hashtree_traverse(hashtree_config_data_t *hashtree, cmph_uint8 *visited, cmph_uint32 v) +{ + + graph_iterator_t it = graph_neighbors_it(hashtree->graph, v); + cmph_uint32 neighbor = 0; + SETBIT(visited,v); + + DEBUGP("Visiting vertex %u\n", v); + while((neighbor = graph_next_neighbor(hashtree->graph, &it)) != GRAPH_NO_NEIGHBOR) + { + DEBUGP("Visiting neighbor %u\n", neighbor); + if(GETBIT(visited,neighbor)) continue; + DEBUGP("Visiting neighbor %u\n", neighbor); + DEBUGP("Visiting edge %u->%u with id %u\n", v, neighbor, graph_edge_id(hashtree->graph, v, neighbor)); + hashtree->g[neighbor] = graph_edge_id(hashtree->graph, v, neighbor) - hashtree->g[v]; + DEBUGP("g is %u (%u - %u mod %u)\n", hashtree->g[neighbor], graph_edge_id(hashtree->graph, v, neighbor), hashtree->g[v], hashtree->m); + hashtree_traverse(hashtree, visited, neighbor); + } +} + +static int hashtree_gen_edges(cmph_config_t *mph) +{ + cmph_uint32 e; + hashtree_config_data_t *hashtree = (hashtree_config_data_t *)mph->data; + int cycles = 0; + + DEBUGP("Generating edges for %u vertices with hash functions %s and %s\n", hashtree->n, cmph_hash_names[hashtree->hashfuncs[0]], cmph_hash_names[hashtree->hashfuncs[1]]); + graph_clear_edges(hashtree->graph); + mph->key_source->rewind(mph->key_source->data); + for (e = 0; e < mph->key_source->nkeys; ++e) + { + cmph_uint32 h1, h2; + cmph_uint32 keylen; + char *key; + mph->key_source->read(mph->key_source->data, &key, &keylen); + h1 = hash(hashtree->hashes[0], key, keylen) % hashtree->n; + h2 = hash(hashtree->hashes[1], key, keylen) % hashtree->n; + if (h1 == h2) if (++h2 >= hashtree->n) h2 = 0; + if (h1 == h2) + { + if (mph->verbosity) fprintf(stderr, "Self loop for key %u\n", e); + mph->key_source->dispose(mph->key_source->data, key, keylen); + return 0; + } + DEBUGP("Adding edge: %u -> %u for key %s\n", h1, h2, key); + mph->key_source->dispose(mph->key_source->data, key, keylen); + graph_add_edge(hashtree->graph, h1, h2); + } + cycles = graph_is_cyclic(hashtree->graph); + if (mph->verbosity && cycles) fprintf(stderr, "Cyclic graph generated\n"); + DEBUGP("Looking for cycles: %u\n", cycles); + + return ! cycles; +} + +int hashtree_dump(cmph_t *mphf, FILE *fd) +{ + char *buf = NULL; + cmph_uint32 buflen; + cmph_uint32 two = 2; //number of hash functions + hashtree_data_t *data = (hashtree_data_t *)mphf->data; + __cmph_dump(mphf, fd); + + fwrite(&two, sizeof(cmph_uint32), 1, fd); + hash_state_dump(data->hashes[0], &buf, &buflen); + DEBUGP("Dumping hash state with %u bytes to disk\n", buflen); + fwrite(&buflen, sizeof(cmph_uint32), 1, fd); + fwrite(buf, buflen, 1, fd); + free(buf); + + hash_state_dump(data->hashes[1], &buf, &buflen); + DEBUGP("Dumping hash state with %u bytes to disk\n", buflen); + fwrite(&buflen, sizeof(cmph_uint32), 1, fd); + fwrite(buf, buflen, 1, fd); + free(buf); + + fwrite(&(data->n), sizeof(cmph_uint32), 1, fd); + fwrite(&(data->m), sizeof(cmph_uint32), 1, fd); + + fwrite(data->g, sizeof(cmph_uint32)*data->n, 1, fd); + #ifdef DEBUG + fprintf(stderr, "G: "); + for (i = 0; i < data->n; ++i) fprintf(stderr, "%u ", data->g[i]); + fprintf(stderr, "\n"); + #endif + return 1; +} + +void hashtree_load(FILE *f, cmph_t *mphf) +{ + cmph_uint32 nhashes; + char *buf = NULL; + cmph_uint32 buflen; + cmph_uint32 i; + hashtree_data_t *hashtree = (hashtree_data_t *)malloc(sizeof(hashtree_data_t)); + + DEBUGP("Loading hashtree mphf\n"); + mphf->data = hashtree; + fread(&nhashes, sizeof(cmph_uint32), 1, f); + hashtree->hashes = (hash_state_t **)malloc(sizeof(hash_state_t *)*(nhashes + 1)); + hashtree->hashes[nhashes] = NULL; + DEBUGP("Reading %u hashes\n", nhashes); + for (i = 0; i < nhashes; ++i) + { + hash_state_t *state = NULL; + fread(&buflen, sizeof(cmph_uint32), 1, f); + DEBUGP("Hash state has %u bytes\n", buflen); + buf = (char *)malloc(buflen); + fread(buf, buflen, 1, f); + state = hash_state_load(buf, buflen); + hashtree->hashes[i] = state; + free(buf); + } + + DEBUGP("Reading m and n\n"); + fread(&(hashtree->n), sizeof(cmph_uint32), 1, f); + fread(&(hashtree->m), sizeof(cmph_uint32), 1, f); + + hashtree->g = (cmph_uint32 *)malloc(sizeof(cmph_uint32)*hashtree->n); + fread(hashtree->g, hashtree->n*sizeof(cmph_uint32), 1, f); + #ifdef DEBUG + fprintf(stderr, "G: "); + for (i = 0; i < hashtree->n; ++i) fprintf(stderr, "%u ", hashtree->g[i]); + fprintf(stderr, "\n"); + #endif + return; +} + + +cmph_uint32 hashtree_search(cmph_t *mphf, const char *key, cmph_uint32 keylen) +{ + hashtree_data_t *hashtree = mphf->data; + cmph_uint32 h1 = hash(hashtree->hashes[0], key, keylen) % hashtree->n; + cmph_uint32 h2 = hash(hashtree->hashes[1], key, keylen) % hashtree->n; + DEBUGP("key: %s h1: %u h2: %u\n", key, h1, h2); + if (h1 == h2 && ++h2 >= hashtree->n) h2 = 0; + DEBUGP("key: %s g[h1]: %u g[h2]: %u edges: %u\n", key, hashtree->g[h1], hashtree->g[h2], hashtree->m); + return (hashtree->g[h1] + hashtree->g[h2]) % hashtree->m; +} +void hashtree_destroy(cmph_t *mphf) +{ + hashtree_data_t *data = (hashtree_data_t *)mphf->data; + free(data->g); + hash_state_destroy(data->hashes[0]); + hash_state_destroy(data->hashes[1]); + free(data->hashes); + free(data); + free(mphf); +} diff --git a/cmph/hashtree.h b/cmph/hashtree.h new file mode 100644 index 000000000..8bff67462 --- /dev/null +++ b/cmph/hashtree.h @@ -0,0 +1,19 @@ +#ifndef __CMPH_HASHTREE_H__ +#define __CMPH_HASHTREE_H__ + +#include "cmph.h" + +typedef struct __hashtree_data_t hashtree_data_t; +typedef struct __hashtree_config_data_t hashtree_config_data_t; + +hashtree_config_data_t *hashtree_config_new(); +void hashtree_config_set_hashfuncs(cmph_config_t *mph, CMPH_HASH *hashfuncs); +void hashtree_config_set_leaf_algo(cmph_config_t *mph, CMPH_ALGO leaf_algo); +void hashtree_config_destroy(cmph_config_t *mph); +cmph_t *hashtree_new(cmph_config_t *mph, double c); + +void hashtree_load(FILE *f, cmph_t *mphf); +int hashtree_dump(cmph_t *mphf, FILE *f); +void hashtree_destroy(cmph_t *mphf); +cmph_uint32 hashtree_search(cmph_t *mphf, const char *key, cmph_uint32 keylen); +#endif diff --git a/cmph/hashtree_structs.h b/cmph/hashtree_structs.h new file mode 100644 index 000000000..7258cd399 --- /dev/null +++ b/cmph/hashtree_structs.h @@ -0,0 +1,32 @@ +#ifndef __CMPH_HASHTREE_STRUCTS_H__ +#define __CMPH_HASHTREE_STRUCTS_H__ + +#include "hash_state.h" + +struct __hashtree_data_t +{ + cmph_uint32 m; //edges (words) count + double c; //constant c + cmph_uint8 *size; //size[i] stores the number of edges represented by g[i] + cmph_uint32 **g; + cmph_uint32 k; //number of components + hash_state_t **h1; + hash_state_t **h2; + hash_state_t *h3; +}; + +struct __hashtree_config_data_t +{ + CMPH_ALGO leaf_algo; + CMPH_HASH hashfuncs[3]; + cmph_uint32 m; //edges (words) count + cmph_uint8 *size; //size[i] stores the number of edges represented by g[i] + cmph_uint32 *offset; //offset[i] stores the sum size[0] + ... size[i - 1] + cmph_uint32 k; //number of components + cmph_uint32 memory; + hash_state_t **h1; + hash_state_t **h2; + hash_state_t *h3; +}; + +#endif diff --git a/cmph/jenkins_hash.c b/cmph/jenkins_hash.c new file mode 100644 index 000000000..f5233a5a5 --- /dev/null +++ b/cmph/jenkins_hash.c @@ -0,0 +1,297 @@ +#include "jenkins_hash.h" +#include +#ifdef WIN32 +#define _USE_MATH_DEFINES //For M_LOG2E +#endif +#include +#include +#include + +//#define DEBUG +#include "debug.h" + +#define hashsize(n) ((cmph_uint32)1<<(n)) +#define hashmask(n) (hashsize(n)-1) + + + +//#define NM2 /* Define this if you do not want power of 2 table sizes*/ + + +/* + -------------------------------------------------------------------- + mix -- mix 3 32-bit values reversibly. + For every delta with one or two bits set, and the deltas of all three + high bits or all three low bits, whether the original value of a,b,c + is almost all zero or is uniformly distributed, + * If mix() is run forward or backward, at least 32 bits in a,b,c + have at least 1/4 probability of changing. + * If mix() is run forward, every bit of c will change between 1/3 and + 2/3 of the time. (Well, 22/100 and 78/100 for some 2-bit deltas.) + mix() was built out of 36 single-cycle latency instructions in a + structure that could supported 2x parallelism, like so: + a -= b; + a -= c; x = (c>>13); + b -= c; a ^= x; + b -= a; x = (a<<8); + c -= a; b ^= x; + c -= b; x = (b>>13); + ... + Unfortunately, superscalar Pentiums and Sparcs can't take advantage + of that parallelism. They've also turned some of those single-cycle + latency instructions into multi-cycle latency instructions. Still, + this is the fastest good hash I could find. There were about 2^^68 + to choose from. I only looked at a billion or so. + -------------------------------------------------------------------- + */ +#define mix(a,b,c) \ +{ \ + a -= b; a -= c; a ^= (c>>13); \ + b -= c; b -= a; b ^= (a<<8); \ + c -= a; c -= b; c ^= (b>>13); \ + a -= b; a -= c; a ^= (c>>12); \ + b -= c; b -= a; b ^= (a<<16); \ + c -= a; c -= b; c ^= (b>>5); \ + a -= b; a -= c; a ^= (c>>3); \ + b -= c; b -= a; b ^= (a<<10); \ + c -= a; c -= b; c ^= (b>>15); \ +} + +/* + -------------------------------------------------------------------- + hash() -- hash a variable-length key into a 32-bit value +k : the key (the unaligned variable-length array of bytes) +len : the length of the key, counting by bytes +initval : can be any 4-byte value +Returns a 32-bit value. Every bit of the key affects every bit of +the return value. Every 1-bit and 2-bit delta achieves avalanche. +About 6*len+35 instructions. + +The best hash table sizes are powers of 2. There is no need to do +mod a prime (mod is sooo slow!). If you need less than 32 bits, +use a bitmask. For example, if you need only 10 bits, do +h = (h & hashmask(10)); +In which case, the hash table should have hashsize(10) elements. + +If you are hashing n strings (cmph_uint8 **)k, do it like this: +for (i=0, h=0; iseed = ((cmph_uint32)rand() % size); + return state; +} +void jenkins_state_destroy(jenkins_state_t *state) +{ + free(state); +} + + +inline void __jenkins_hash_vector(cmph_uint32 seed, const char *k, cmph_uint32 keylen, cmph_uint32 * hashes) +{ + register cmph_uint32 len, length; + + /* Set up the internal state */ + length = keylen; + len = length; + hashes[0] = hashes[1] = 0x9e3779b9; /* the golden ratio; an arbitrary value */ + hashes[2] = seed; /* the previous hash value - seed in our case */ + + /*---------------------------------------- handle most of the key */ + while (len >= 12) + { + hashes[0] += ((cmph_uint32)k[0] +((cmph_uint32)k[1]<<8) +((cmph_uint32)k[2]<<16) +((cmph_uint32)k[3]<<24)); + hashes[1] += ((cmph_uint32)k[4] +((cmph_uint32)k[5]<<8) +((cmph_uint32)k[6]<<16) +((cmph_uint32)k[7]<<24)); + hashes[2] += ((cmph_uint32)k[8] +((cmph_uint32)k[9]<<8) +((cmph_uint32)k[10]<<16)+((cmph_uint32)k[11]<<24)); + mix(hashes[0],hashes[1],hashes[2]); + k += 12; len -= 12; + } + + /*------------------------------------- handle the last 11 bytes */ + hashes[2] += length; + switch(len) /* all the case statements fall through */ + { + case 11: + hashes[2] +=((cmph_uint32)k[10]<<24); + case 10: + hashes[2] +=((cmph_uint32)k[9]<<16); + case 9 : + hashes[2] +=((cmph_uint32)k[8]<<8); + /* the first byte of hashes[2] is reserved for the length */ + case 8 : + hashes[1] +=((cmph_uint32)k[7]<<24); + case 7 : + hashes[1] +=((cmph_uint32)k[6]<<16); + case 6 : + hashes[1] +=((cmph_uint32)k[5]<<8); + case 5 : + hashes[1] +=(cmph_uint8) k[4]; + case 4 : + hashes[0] +=((cmph_uint32)k[3]<<24); + case 3 : + hashes[0] +=((cmph_uint32)k[2]<<16); + case 2 : + hashes[0] +=((cmph_uint32)k[1]<<8); + case 1 : + hashes[0] +=(cmph_uint8)k[0]; + /* case 0: nothing left to add */ + } + + mix(hashes[0],hashes[1],hashes[2]); +} + +cmph_uint32 jenkins_hash(jenkins_state_t *state, const char *k, cmph_uint32 keylen) +{ + cmph_uint32 hashes[3]; + __jenkins_hash_vector(state->seed, k, keylen, hashes); + return hashes[2]; +/* cmph_uint32 a, b, c; + cmph_uint32 len, length; + + // Set up the internal state + length = keylen; + len = length; + a = b = 0x9e3779b9; // the golden ratio; an arbitrary value + c = state->seed; // the previous hash value - seed in our case + + // handle most of the key + while (len >= 12) + { + a += (k[0] +((cmph_uint32)k[1]<<8) +((cmph_uint32)k[2]<<16) +((cmph_uint32)k[3]<<24)); + b += (k[4] +((cmph_uint32)k[5]<<8) +((cmph_uint32)k[6]<<16) +((cmph_uint32)k[7]<<24)); + c += (k[8] +((cmph_uint32)k[9]<<8) +((cmph_uint32)k[10]<<16)+((cmph_uint32)k[11]<<24)); + mix(a,b,c); + k += 12; len -= 12; + } + + // handle the last 11 bytes + c += length; + switch(len) /// all the case statements fall through + { + case 11: + c +=((cmph_uint32)k[10]<<24); + case 10: + c +=((cmph_uint32)k[9]<<16); + case 9 : + c +=((cmph_uint32)k[8]<<8); + // the first byte of c is reserved for the length + case 8 : + b +=((cmph_uint32)k[7]<<24); + case 7 : + b +=((cmph_uint32)k[6]<<16); + case 6 : + b +=((cmph_uint32)k[5]<<8); + case 5 : + b +=k[4]; + case 4 : + a +=((cmph_uint32)k[3]<<24); + case 3 : + a +=((cmph_uint32)k[2]<<16); + case 2 : + a +=((cmph_uint32)k[1]<<8); + case 1 : + a +=k[0]; + // case 0: nothing left to add + } + + mix(a,b,c); + + /// report the result + + return c; + */ +} + +void jenkins_hash_vector_(jenkins_state_t *state, const char *k, cmph_uint32 keylen, cmph_uint32 * hashes) +{ + __jenkins_hash_vector(state->seed, k, keylen, hashes); +} + +void jenkins_state_dump(jenkins_state_t *state, char **buf, cmph_uint32 *buflen) +{ + *buflen = sizeof(cmph_uint32); + *buf = (char *)malloc(sizeof(cmph_uint32)); + if (!*buf) + { + *buflen = UINT_MAX; + return; + } + memcpy(*buf, &(state->seed), sizeof(cmph_uint32)); + DEBUGP("Dumped jenkins state with seed %u\n", state->seed); + return; +} + +jenkins_state_t *jenkins_state_copy(jenkins_state_t *src_state) +{ + jenkins_state_t *dest_state = (jenkins_state_t *)malloc(sizeof(jenkins_state_t)); + dest_state->hashfunc = src_state->hashfunc; + dest_state->seed = src_state->seed; + return dest_state; +} + +jenkins_state_t *jenkins_state_load(const char *buf, cmph_uint32 buflen) +{ + jenkins_state_t *state = (jenkins_state_t *)malloc(sizeof(jenkins_state_t)); + state->seed = *(cmph_uint32 *)buf; + state->hashfunc = CMPH_HASH_JENKINS; + DEBUGP("Loaded jenkins state with seed %u\n", state->seed); + return state; +} + + +/** \fn void jenkins_state_pack(jenkins_state_t *state, void *jenkins_packed); + * \brief Support the ability to pack a jenkins function into a preallocated contiguous memory space pointed by jenkins_packed. + * \param state points to the jenkins function + * \param jenkins_packed pointer to the contiguous memory area used to store the jenkins function. The size of jenkins_packed must be at least jenkins_state_packed_size() + */ +void jenkins_state_pack(jenkins_state_t *state, void *jenkins_packed) +{ + if (state && jenkins_packed) + { + memcpy(jenkins_packed, &(state->seed), sizeof(cmph_uint32)); + } +} + +/** \fn cmph_uint32 jenkins_state_packed_size(jenkins_state_t *state); + * \brief Return the amount of space needed to pack a jenkins function. + * \return the size of the packed function or zero for failures + */ +cmph_uint32 jenkins_state_packed_size() +{ + return sizeof(cmph_uint32); +} + + +/** \fn cmph_uint32 jenkins_hash_packed(void *jenkins_packed, const char *k, cmph_uint32 keylen); + * \param jenkins_packed is a pointer to a contiguous memory area + * \param key is a pointer to a key + * \param keylen is the key length + * \return an integer that represents a hash value of 32 bits. + */ +cmph_uint32 jenkins_hash_packed(void *jenkins_packed, const char *k, cmph_uint32 keylen) +{ + cmph_uint32 hashes[3]; + __jenkins_hash_vector(*((cmph_uint32 *)jenkins_packed), k, keylen, hashes); + return hashes[2]; +} + +/** \fn jenkins_hash_vector_packed(void *jenkins_packed, const char *k, cmph_uint32 keylen, cmph_uint32 * hashes); + * \param jenkins_packed is a pointer to a contiguous memory area + * \param key is a pointer to a key + * \param keylen is the key length + * \param hashes is a pointer to a memory large enough to fit three 32-bit integers. + */ +void jenkins_hash_vector_packed(void *jenkins_packed, const char *k, cmph_uint32 keylen, cmph_uint32 * hashes) +{ + __jenkins_hash_vector(*((cmph_uint32 *)jenkins_packed), k, keylen, hashes); +} diff --git a/cmph/jenkins_hash.h b/cmph/jenkins_hash.h new file mode 100644 index 000000000..8e8b9173e --- /dev/null +++ b/cmph/jenkins_hash.h @@ -0,0 +1,65 @@ +#ifndef __JEKINS_HASH_H__ +#define __JEKINS_HASH_H__ + +#include "hash.h" + +typedef struct __jenkins_state_t +{ + CMPH_HASH hashfunc; + cmph_uint32 seed; +} jenkins_state_t; + +jenkins_state_t *jenkins_state_new(cmph_uint32 size); //size of hash table + +/** \fn cmph_uint32 jenkins_hash(jenkins_state_t *state, const char *k, cmph_uint32 keylen); + * \param state is a pointer to a jenkins_state_t structure + * \param key is a pointer to a key + * \param keylen is the key length + * \return an integer that represents a hash value of 32 bits. + */ +cmph_uint32 jenkins_hash(jenkins_state_t *state, const char *k, cmph_uint32 keylen); + +/** \fn void jenkins_hash_vector_(jenkins_state_t *state, const char *k, cmph_uint32 keylen, cmph_uint32 * hashes); + * \param state is a pointer to a jenkins_state_t structure + * \param key is a pointer to a key + * \param keylen is the key length + * \param hashes is a pointer to a memory large enough to fit three 32-bit integers. + */ +void jenkins_hash_vector_(jenkins_state_t *state, const char *k, cmph_uint32 keylen, cmph_uint32 * hashes); + +void jenkins_state_dump(jenkins_state_t *state, char **buf, cmph_uint32 *buflen); +jenkins_state_t *jenkins_state_copy(jenkins_state_t *src_state); +jenkins_state_t *jenkins_state_load(const char *buf, cmph_uint32 buflen); +void jenkins_state_destroy(jenkins_state_t *state); + +/** \fn void jenkins_state_pack(jenkins_state_t *state, void *jenkins_packed); + * \brief Support the ability to pack a jenkins function into a preallocated contiguous memory space pointed by jenkins_packed. + * \param state points to the jenkins function + * \param jenkins_packed pointer to the contiguous memory area used to store the jenkins function. The size of jenkins_packed must be at least jenkins_state_packed_size() + */ +void jenkins_state_pack(jenkins_state_t *state, void *jenkins_packed); + +/** \fn cmph_uint32 jenkins_state_packed_size(); + * \brief Return the amount of space needed to pack a jenkins function. + * \return the size of the packed function or zero for failures + */ +cmph_uint32 jenkins_state_packed_size(); + + +/** \fn cmph_uint32 jenkins_hash_packed(void *jenkins_packed, const char *k, cmph_uint32 keylen); + * \param jenkins_packed is a pointer to a contiguous memory area + * \param key is a pointer to a key + * \param keylen is the key length + * \return an integer that represents a hash value of 32 bits. + */ +cmph_uint32 jenkins_hash_packed(void *jenkins_packed, const char *k, cmph_uint32 keylen); + +/** \fn jenkins_hash_vector_packed(void *jenkins_packed, const char *k, cmph_uint32 keylen, cmph_uint32 * hashes); + * \param jenkins_packed is a pointer to a contiguous memory area + * \param key is a pointer to a key + * \param keylen is the key length + * \param hashes is a pointer to a memory large enough to fit three 32-bit integers. + */ +void jenkins_hash_vector_packed(void *jenkins_packed, const char *k, cmph_uint32 keylen, cmph_uint32 * hashes); + +#endif diff --git a/cmph/main.c b/cmph/main.c new file mode 100644 index 000000000..f739b325f --- /dev/null +++ b/cmph/main.c @@ -0,0 +1,342 @@ +#ifdef WIN32 +#include "wingetopt.h" +#else +#include +#endif +#include +#include +#include +#include +#include +#include +#include +#include "cmph.h" +#include "hash.h" + +#ifdef WIN32 +#define VERSION "0.8" +#else +#include "config.h" +#endif + + +void usage(const char *prg) +{ + fprintf(stderr, "usage: %s [-v] [-h] [-V] [-k nkeys] [-f hash_function] [-g [-c algorithm_dependent_value][-s seed] ] [-a algorithm] [-M memory_in_MB] [-b algorithm_dependent_value] [-t keys_per_bin] [-d tmp_dir] [-m file.mph] keysfile\n", prg); +} +void usage_long(const char *prg) +{ + cmph_uint32 i; + fprintf(stderr, "usage: %s [-v] [-h] [-V] [-k nkeys] [-f hash_function] [-g [-c algorithm_dependent_value][-s seed] ] [-a algorithm] [-M memory_in_MB] [-b algorithm_dependent_value] [-t keys_per_bin] [-d tmp_dir] [-m file.mph] keysfile\n", prg); + fprintf(stderr, "Minimum perfect hashing tool\n\n"); + fprintf(stderr, " -h\t print this help message\n"); + fprintf(stderr, " -c\t c value determines:\n"); + fprintf(stderr, " \t * the number of vertices in the graph for the algorithms BMZ and CHM\n"); + fprintf(stderr, " \t * the number of bits per key required in the FCH algorithm\n"); + fprintf(stderr, " \t * the load factor in the CHD_PH algorithm\n"); + fprintf(stderr, " -a\t algorithm - valid values are\n"); + for (i = 0; i < CMPH_COUNT; ++i) fprintf(stderr, " \t * %s\n", cmph_names[i]); + fprintf(stderr, " -f\t hash function (may be used multiple times) - valid values are\n"); + for (i = 0; i < CMPH_HASH_COUNT; ++i) fprintf(stderr, " \t * %s\n", cmph_hash_names[i]); + fprintf(stderr, " -V\t print version number and exit\n"); + fprintf(stderr, " -v\t increase verbosity (may be used multiple times)\n"); + fprintf(stderr, " -k\t number of keys\n"); + fprintf(stderr, " -g\t generation mode\n"); + fprintf(stderr, " -s\t random seed\n"); + fprintf(stderr, " -m\t minimum perfect hash function file \n"); + fprintf(stderr, " -M\t main memory availability (in MB) used in BRZ algorithm \n"); + fprintf(stderr, " -d\t temporary directory used in BRZ algorithm \n"); + fprintf(stderr, " -b\t the meaning of this parameter depends on the algorithm selected in the -a option:\n"); + fprintf(stderr, " \t * For BRZ it is used to make the maximal number of keys in a bucket lower than 256.\n"); + fprintf(stderr, " \t In this case its value should be an integer in the range [64,175]. Default is 128.\n\n"); + fprintf(stderr, " \t * For BDZ it is used to determine the size of some precomputed rank\n"); + fprintf(stderr, " \t information and its value should be an integer in the range [3,10]. Default\n"); + fprintf(stderr, " \t is 7. The larger is this value, the more compact are the resulting functions\n"); + fprintf(stderr, " \t and the slower are them at evaluation time.\n\n"); + fprintf(stderr, " \t * For CHD and CHD_PH it is used to set the average number of keys per bucket\n"); + fprintf(stderr, " \t and its value should be an integer in the range [1,32]. Default is 4. The\n"); + fprintf(stderr, " \t larger is this value, the slower is the construction of the functions.\n"); + fprintf(stderr, " \t This parameter has no effect for other algorithms.\n\n"); + fprintf(stderr, " -t\t set the number of keys per bin for a t-perfect hashing function. A t-perfect\n"); + fprintf(stderr, " \t hash function allows at most t collisions in a given bin. This parameter applies\n"); + fprintf(stderr, " \t only to the CHD and CHD_PH algorithms. Its value should be an integer in the\n"); + fprintf(stderr, " \t range [1,128]. Defaul is 1\n"); + fprintf(stderr, " keysfile\t line separated file with keys\n"); +} + +int main(int argc, char **argv) +{ + cmph_uint32 verbosity = 0; + char generate = 0; + char *mphf_file = NULL; + FILE *mphf_fd = stdout; + const char *keys_file = NULL; + FILE *keys_fd; + cmph_uint32 nkeys = UINT_MAX; + cmph_uint32 seed = UINT_MAX; + CMPH_HASH *hashes = NULL; + cmph_uint32 nhashes = 0; + cmph_uint32 i; + CMPH_ALGO mph_algo = CMPH_CHM; + double c = 0; + cmph_config_t *config = NULL; + cmph_t *mphf = NULL; + char * tmp_dir = NULL; + cmph_io_adapter_t *source; + cmph_uint32 memory_availability = 0; + cmph_uint32 b = 0; + cmph_uint32 keys_per_bin = 1; + while (1) + { + char ch = (char)getopt(argc, argv, "hVvgc:k:a:M:b:t:f:m:d:s:"); + if (ch == -1) break; + switch (ch) + { + case 's': + { + char *cptr; + seed = (cmph_uint32)strtoul(optarg, &cptr, 10); + if(*cptr != 0) { + fprintf(stderr, "Invalid seed %s\n", optarg); + exit(1); + } + } + break; + case 'c': + { + char *endptr; + c = strtod(optarg, &endptr); + if(*endptr != 0) { + fprintf(stderr, "Invalid c value %s\n", optarg); + exit(1); + } + } + break; + case 'g': + generate = 1; + break; + case 'k': + { + char *endptr; + nkeys = (cmph_uint32)strtoul(optarg, &endptr, 10); + if(*endptr != 0) { + fprintf(stderr, "Invalid number of keys %s\n", optarg); + exit(1); + } + } + break; + case 'm': + mphf_file = strdup(optarg); + break; + case 'd': + tmp_dir = strdup(optarg); + break; + case 'M': + { + char *cptr; + memory_availability = (cmph_uint32)strtoul(optarg, &cptr, 10); + if(*cptr != 0) { + fprintf(stderr, "Invalid memory availability %s\n", optarg); + exit(1); + } + } + break; + case 'b': + { + char *cptr; + b = (cmph_uint32)strtoul(optarg, &cptr, 10); + if(*cptr != 0) { + fprintf(stderr, "Parameter b was not found: %s\n", optarg); + exit(1); + } + } + break; + case 't': + { + char *cptr; + keys_per_bin = (cmph_uint32)strtoul(optarg, &cptr, 10); + if(*cptr != 0) { + fprintf(stderr, "Parameter t was not found: %s\n", optarg); + exit(1); + } + } + break; + case 'v': + ++verbosity; + break; + case 'V': + printf("%s\n", VERSION); + return 0; + case 'h': + usage_long(argv[0]); + return 0; + case 'a': + { + char valid = 0; + for (i = 0; i < CMPH_COUNT; ++i) + { + if (strcmp(cmph_names[i], optarg) == 0) + { + mph_algo = i; + valid = 1; + break; + } + } + if (!valid) + { + fprintf(stderr, "Invalid mph algorithm: %s. It is not available in version %s\n", optarg, VERSION); + return -1; + } + } + break; + case 'f': + { + char valid = 0; + for (i = 0; i < CMPH_HASH_COUNT; ++i) + { + if (strcmp(cmph_hash_names[i], optarg) == 0) + { + hashes = (CMPH_HASH *)realloc(hashes, sizeof(CMPH_HASH) * ( nhashes + 2 )); + hashes[nhashes] = i; + hashes[nhashes + 1] = CMPH_HASH_COUNT; + ++nhashes; + valid = 1; + break; + } + } + if (!valid) + { + fprintf(stderr, "Invalid hash function: %s\n", optarg); + return -1; + } + } + break; + default: + usage(argv[0]); + return 1; + } + } + + if (optind != argc - 1) + { + usage(argv[0]); + return 1; + } + keys_file = argv[optind]; + + if (seed == UINT_MAX) seed = (cmph_uint32)time(NULL); + srand(seed); + int ret = 0; + if (mphf_file == NULL) + { + mphf_file = (char *)malloc(strlen(keys_file) + 5); + memcpy(mphf_file, keys_file, strlen(keys_file)); + memcpy(mphf_file + strlen(keys_file), ".mph\0", (size_t)5); + } + + keys_fd = fopen(keys_file, "r"); + + if (keys_fd == NULL) + { + fprintf(stderr, "Unable to open file %s: %s\n", keys_file, strerror(errno)); + return -1; + } + + if (seed == UINT_MAX) seed = (cmph_uint32)time(NULL); + if(nkeys == UINT_MAX) source = cmph_io_nlfile_adapter(keys_fd); + else source = cmph_io_nlnkfile_adapter(keys_fd, nkeys); + if (generate) + { + //Create mphf + mphf_fd = fopen(mphf_file, "w"); + config = cmph_config_new(source); + cmph_config_set_algo(config, mph_algo); + if (nhashes) cmph_config_set_hashfuncs(config, hashes); + cmph_config_set_verbosity(config, verbosity); + cmph_config_set_tmp_dir(config, (cmph_uint8 *) tmp_dir); + cmph_config_set_mphf_fd(config, mphf_fd); + cmph_config_set_memory_availability(config, memory_availability); + cmph_config_set_b(config, b); + cmph_config_set_keys_per_bin(config, keys_per_bin); + + //if((mph_algo == CMPH_BMZ || mph_algo == CMPH_BRZ) && c >= 2.0) c=1.15; + if(mph_algo == CMPH_BMZ && c >= 2.0) c=1.15; + if (c != 0) cmph_config_set_graphsize(config, c); + mphf = cmph_new(config); + + cmph_config_destroy(config); + if (mphf == NULL) + { + fprintf(stderr, "Unable to create minimum perfect hashing function\n"); + //cmph_config_destroy(config); + free(mphf_file); + return -1; + } + + if (mphf_fd == NULL) + { + fprintf(stderr, "Unable to open output file %s: %s\n", mphf_file, strerror(errno)); + free(mphf_file); + return -1; + } + cmph_dump(mphf, mphf_fd); + cmph_destroy(mphf); + fclose(mphf_fd); + } + else + { + cmph_uint8 * hashtable = NULL; + mphf_fd = fopen(mphf_file, "r"); + if (mphf_fd == NULL) + { + fprintf(stderr, "Unable to open input file %s: %s\n", mphf_file, strerror(errno)); + free(mphf_file); + return -1; + } + mphf = cmph_load(mphf_fd); + fclose(mphf_fd); + if (!mphf) + { + fprintf(stderr, "Unable to parser input file %s\n", mphf_file); + free(mphf_file); + return -1; + } + cmph_uint32 siz = cmph_size(mphf); + hashtable = (cmph_uint8*)calloc(siz, sizeof(cmph_uint8)); + memset(hashtable, 0,(size_t) siz); + //check all keys + for (i = 0; i < source->nkeys; ++i) + { + cmph_uint32 h; + char *buf; + cmph_uint32 buflen = 0; + source->read(source->data, &buf, &buflen); + h = cmph_search(mphf, buf, buflen); + if (!(h < siz)) + { + fprintf(stderr, "Unknown key %*s in the input.\n", buflen, buf); + ret = 1; + } else if(hashtable[h] >= keys_per_bin) + { + fprintf(stderr, "More than %u keys were mapped to bin %u\n", keys_per_bin, h); + fprintf(stderr, "Duplicated or unknown key %*s in the input\n", buflen, buf); + ret = 1; + } else hashtable[h]++; + + if (verbosity) + { + printf("%s -> %u\n", buf, h); + } + source->dispose(source->data, buf, buflen); + } + + cmph_destroy(mphf); + free(hashtable); + } + fclose(keys_fd); + free(mphf_file); + free(tmp_dir); + cmph_io_nlfile_adapter_destroy(source); + return ret; + +} diff --git a/cmph/miller_rabin.c b/cmph/miller_rabin.c new file mode 100644 index 000000000..17d0ed344 --- /dev/null +++ b/cmph/miller_rabin.c @@ -0,0 +1,67 @@ +#include "miller_rabin.h" + +static inline cmph_uint64 int_pow(cmph_uint64 a, cmph_uint64 d, cmph_uint64 n) +{ + cmph_uint64 a_pow = a; + cmph_uint64 res = 1; + while(d > 0) + { + if((d & 1) == 1) + res =(((cmph_uint64)res) * a_pow) % n; + a_pow = (((cmph_uint64)a_pow) * a_pow) % n; + d /= 2; + }; + return res; +}; + +static inline cmph_uint8 check_witness(cmph_uint64 a_exp_d, cmph_uint64 n, cmph_uint64 s) +{ + cmph_uint64 i; + cmph_uint64 a_exp = a_exp_d; + if(a_exp == 1 || a_exp == (n - 1)) + return 1; + for(i = 1; i < s; i++) + { + a_exp = (((cmph_uint64)a_exp) * a_exp) % n; + if(a_exp == (n - 1)) + return 1; + }; + return 0; +}; + +cmph_uint8 check_primality(cmph_uint64 n) +{ + cmph_uint64 a, d, s, a_exp_d; + if((n % 2) == 0) + return 0; + if((n % 3) == 0) + return 0; + if((n % 5) == 0) + return 0; + if((n % 7 ) == 0) + return 0; + //we decompoe the number n - 1 into 2^s*d + s = 0; + d = n - 1; + do + { + s++; + d /= 2; + }while((d % 2) == 0); + + a = 2; + a_exp_d = int_pow(a, d, n); + if(check_witness(a_exp_d, n, s) == 0) + return 0; + a = 7; + a_exp_d = int_pow(a, d, n); + if(check_witness(a_exp_d, n, s) == 0) + return 0; + a = 61; + a_exp_d = int_pow(a, d, n); + if(check_witness(a_exp_d, n, s) == 0) + return 0; + return 1; +}; + + diff --git a/cmph/miller_rabin.h b/cmph/miller_rabin.h new file mode 100644 index 000000000..42dc6ce5e --- /dev/null +++ b/cmph/miller_rabin.h @@ -0,0 +1,5 @@ +#ifndef _CMPH_MILLER_RABIN_H__ +#define _CMPH_MILLER_RABIN_H__ +#include "cmph_types.h" +cmph_uint8 check_primality(cmph_uint64 n); +#endif diff --git a/cmph/sdbm_hash.c b/cmph/sdbm_hash.c new file mode 100644 index 000000000..2f706c9ff --- /dev/null +++ b/cmph/sdbm_hash.c @@ -0,0 +1,49 @@ +#include "sdbm_hash.h" +#include + +sdbm_state_t *sdbm_state_new() +{ + sdbm_state_t *state = (sdbm_state_t *)malloc(sizeof(sdbm_state_t)); + state->hashfunc = CMPH_HASH_SDBM; + return state; +} + +void sdbm_state_destroy(sdbm_state_t *state) +{ + free(state); +} + +cmph_uint32 sdbm_hash(sdbm_state_t *state, const char *k, cmph_uint32 keylen) +{ + register cmph_uint32 hash = 0; + const unsigned char *ptr = (unsigned char *)k; + cmph_uint32 i = 0; + + while(i < keylen) { + hash = *ptr + (hash << 6) + (hash << 16) - hash; + ++ptr, ++i; + } + return hash; +} + + +void sdbm_state_dump(sdbm_state_t *state, char **buf, cmph_uint32 *buflen) +{ + *buf = NULL; + *buflen = 0; + return; +} + +sdbm_state_t *sdbm_state_copy(sdbm_state_t *src_state) +{ + sdbm_state_t *dest_state = (sdbm_state_t *)malloc(sizeof(sdbm_state_t)); + dest_state->hashfunc = src_state->hashfunc; + return dest_state; +} + +sdbm_state_t *sdbm_state_load(const char *buf, cmph_uint32 buflen) +{ + sdbm_state_t *state = (sdbm_state_t *)malloc(sizeof(sdbm_state_t)); + state->hashfunc = CMPH_HASH_SDBM; + return state; +} diff --git a/cmph/sdbm_hash.h b/cmph/sdbm_hash.h new file mode 100644 index 000000000..f44b2f15a --- /dev/null +++ b/cmph/sdbm_hash.h @@ -0,0 +1,18 @@ +#ifndef __SDBM_HASH_H__ +#define __SDBM_HASH_H__ + +#include "hash.h" + +typedef struct __sdbm_state_t +{ + CMPH_HASH hashfunc; +} sdbm_state_t; + +sdbm_state_t *sdbm_state_new(); +cmph_uint32 sdbm_hash(sdbm_state_t *state, const char *k, cmph_uint32 keylen); +void sdbm_state_dump(sdbm_state_t *state, char **buf, cmph_uint32 *buflen); +sdbm_state_t *sdbm_state_copy(sdbm_state_t *src_state); +sdbm_state_t *sdbm_state_load(const char *buf, cmph_uint32 buflen); +void sdbm_state_destroy(sdbm_state_t *state); + +#endif diff --git a/cmph/select.c b/cmph/select.c new file mode 100644 index 000000000..fec4b7ada --- /dev/null +++ b/cmph/select.c @@ -0,0 +1,337 @@ +#include +#include +#include +#include +#include +#include "select_lookup_tables.h" +#include "select.h" + +//#define DEBUG +#include "debug.h" + +#ifndef STEP_SELECT_TABLE +#define STEP_SELECT_TABLE 128 +#endif + +#ifndef NBITS_STEP_SELECT_TABLE +#define NBITS_STEP_SELECT_TABLE 7 +#endif + +#ifndef MASK_STEP_SELECT_TABLE +#define MASK_STEP_SELECT_TABLE 0x7f // 0x7f = 127 +#endif + +static inline void select_insert_0(cmph_uint32 * buffer) +{ + (*buffer) >>= 1; +}; + +static inline void select_insert_1(cmph_uint32 * buffer) +{ + (*buffer) >>= 1; + (*buffer) |= 0x80000000; +}; + +void select_init(select_t * sel) +{ + sel->n = 0; + sel->m = 0; + sel->bits_vec = 0; + sel->select_table = 0; +}; + +cmph_uint32 select_get_space_usage(select_t * sel) +{ + register cmph_uint32 nbits; + register cmph_uint32 vec_size; + register cmph_uint32 sel_table_size; + register cmph_uint32 space_usage; + + nbits = sel->n + sel->m; + vec_size = (nbits + 31) >> 5; + sel_table_size = (sel->n >> NBITS_STEP_SELECT_TABLE) + 1; // (sel->n >> NBITS_STEP_SELECT_TABLE) = (sel->n/STEP_SELECT_TABLE) + + space_usage = 2 * sizeof(cmph_uint32) * 8; // n and m + space_usage += vec_size * (cmph_uint32) sizeof(cmph_uint32) * 8; + space_usage += sel_table_size * (cmph_uint32)sizeof(cmph_uint32) * 8; + return space_usage; +} + +void select_destroy(select_t * sel) +{ + free(sel->bits_vec); + free(sel->select_table); + sel->bits_vec = 0; + sel->select_table = 0; +}; + +static inline void select_generate_sel_table(select_t * sel) +{ + register cmph_uint8 * bits_table = (cmph_uint8 *)sel->bits_vec; + register cmph_uint32 part_sum, old_part_sum; + register cmph_uint32 vec_idx, one_idx, sel_table_idx; + + part_sum = vec_idx = one_idx = sel_table_idx = 0; + + for(;;) + { + // FABIANO: Should'n it be one_idx >= sel->n + if(one_idx >= sel->n) + break; + do + { + old_part_sum = part_sum; + part_sum += rank_lookup_table[bits_table[vec_idx]]; + vec_idx++; + } while (part_sum <= one_idx); + + sel->select_table[sel_table_idx] = select_lookup_table[bits_table[vec_idx - 1]][one_idx - old_part_sum] + ((vec_idx - 1) << 3); // ((vec_idx - 1) << 3) = ((vec_idx - 1) * 8) + one_idx += STEP_SELECT_TABLE ; + sel_table_idx++; + }; +}; + +void select_generate(select_t * sel, cmph_uint32 * keys_vec, cmph_uint32 n, cmph_uint32 m) +{ + register cmph_uint32 i, j, idx; + cmph_uint32 buffer = 0; + + register cmph_uint32 nbits; + register cmph_uint32 vec_size; + register cmph_uint32 sel_table_size; + sel->n = n; + sel->m = m; // n values in the range [0,m-1] + + nbits = sel->n + sel->m; + vec_size = (nbits + 31) >> 5; // (nbits + 31) >> 5 = (nbits + 31)/32 + + sel_table_size = (sel->n >> NBITS_STEP_SELECT_TABLE) + 1; // (sel->n >> NBITS_STEP_SELECT_TABLE) = (sel->n/STEP_SELECT_TABLE) + + if(sel->bits_vec) + { + free(sel->bits_vec); + } + sel->bits_vec = (cmph_uint32 *)calloc(vec_size, sizeof(cmph_uint32)); + + if(sel->select_table) + { + free(sel->select_table); + } + sel->select_table = (cmph_uint32 *)calloc(sel_table_size, sizeof(cmph_uint32)); + + + + idx = i = j = 0; + + for(;;) + { + while(keys_vec[j]==i) + { + select_insert_1(&buffer); + idx++; + + if((idx & 0x1f) == 0 ) // (idx & 0x1f) = idx % 32 + sel->bits_vec[(idx >> 5) - 1] = buffer; // (idx >> 5) = idx/32 + j++; + + if(j == sel->n) + goto loop_end; + + //assert(keys_vec[j] < keys_vec[j-1]); + } + + if(i == sel->m) + break; + + while(keys_vec[j] > i) + { + select_insert_0(&buffer); + idx++; + + if((idx & 0x1f) == 0 ) // (idx & 0x1f) = idx % 32 + sel->bits_vec[(idx >> 5) - 1] = buffer; // (idx >> 5) = idx/32 + i++; + }; + + }; + loop_end: + if((idx & 0x1f) != 0 ) // (idx & 0x1f) = idx % 32 + { + buffer >>= 32 - (idx & 0x1f); + sel->bits_vec[ (idx - 1) >> 5 ] = buffer; + }; + + select_generate_sel_table(sel); +}; + +static inline cmph_uint32 _select_query(cmph_uint8 * bits_table, cmph_uint32 * select_table, cmph_uint32 one_idx) +{ + register cmph_uint32 vec_bit_idx ,vec_byte_idx; + register cmph_uint32 part_sum, old_part_sum; + + vec_bit_idx = select_table[one_idx >> NBITS_STEP_SELECT_TABLE]; // one_idx >> NBITS_STEP_SELECT_TABLE = one_idx/STEP_SELECT_TABLE + vec_byte_idx = vec_bit_idx >> 3; // vec_bit_idx / 8 + + one_idx &= MASK_STEP_SELECT_TABLE; // one_idx %= STEP_SELECT_TABLE == one_idx &= MASK_STEP_SELECT_TABLE + one_idx += rank_lookup_table[bits_table[vec_byte_idx] & ((1 << (vec_bit_idx & 0x7)) - 1)]; + part_sum = 0; + + do + { + old_part_sum = part_sum; + part_sum += rank_lookup_table[bits_table[vec_byte_idx]]; + vec_byte_idx++; + + }while (part_sum <= one_idx); + + return select_lookup_table[bits_table[vec_byte_idx - 1]][one_idx - old_part_sum] + ((vec_byte_idx-1) << 3); +} + +cmph_uint32 select_query(select_t * sel, cmph_uint32 one_idx) +{ + return _select_query((cmph_uint8 *)sel->bits_vec, sel->select_table, one_idx); +}; + + +static inline cmph_uint32 _select_next_query(cmph_uint8 * bits_table, cmph_uint32 vec_bit_idx) +{ + register cmph_uint32 vec_byte_idx, one_idx; + register cmph_uint32 part_sum, old_part_sum; + + vec_byte_idx = vec_bit_idx >> 3; + + one_idx = rank_lookup_table[bits_table[vec_byte_idx] & ((1U << (vec_bit_idx & 0x7)) - 1U)] + 1U; + part_sum = 0; + + do + { + old_part_sum = part_sum; + part_sum += rank_lookup_table[bits_table[vec_byte_idx]]; + vec_byte_idx++; + + }while (part_sum <= one_idx); + + return select_lookup_table[bits_table[(vec_byte_idx - 1)]][(one_idx - old_part_sum)] + ((vec_byte_idx - 1) << 3); +} + +cmph_uint32 select_next_query(select_t * sel, cmph_uint32 vec_bit_idx) +{ + return _select_next_query((cmph_uint8 *)sel->bits_vec, vec_bit_idx); +}; + +void select_dump(select_t *sel, char **buf, cmph_uint32 *buflen) +{ + register cmph_uint32 nbits = sel->n + sel->m; + register cmph_uint32 vec_size = ((nbits + 31) >> 5) * (cmph_uint32)sizeof(cmph_uint32); // (nbits + 31) >> 5 = (nbits + 31)/32 + register cmph_uint32 sel_table_size = ((sel->n >> NBITS_STEP_SELECT_TABLE) + 1) * (cmph_uint32)sizeof(cmph_uint32); // (sel->n >> NBITS_STEP_SELECT_TABLE) = (sel->n/STEP_SELECT_TABLE) + register cmph_uint32 pos = 0; + + *buflen = 2*(cmph_uint32)sizeof(cmph_uint32) + vec_size + sel_table_size; + + *buf = (char *)calloc(*buflen, sizeof(char)); + + if (!*buf) + { + *buflen = UINT_MAX; + return; + } + + memcpy(*buf, &(sel->n), sizeof(cmph_uint32)); + pos += (cmph_uint32)sizeof(cmph_uint32); + memcpy(*buf + pos, &(sel->m), sizeof(cmph_uint32)); + pos += (cmph_uint32)sizeof(cmph_uint32); + memcpy(*buf + pos, sel->bits_vec, vec_size); + pos += vec_size; + memcpy(*buf + pos, sel->select_table, sel_table_size); + + DEBUGP("Dumped select structure with size %u bytes\n", *buflen); +} + +void select_load(select_t * sel, const char *buf, cmph_uint32 buflen) +{ + register cmph_uint32 pos = 0; + register cmph_uint32 nbits = 0; + register cmph_uint32 vec_size = 0; + register cmph_uint32 sel_table_size = 0; + + memcpy(&(sel->n), buf, sizeof(cmph_uint32)); + pos += (cmph_uint32)sizeof(cmph_uint32); + memcpy(&(sel->m), buf + pos, sizeof(cmph_uint32)); + pos += (cmph_uint32)sizeof(cmph_uint32); + + nbits = sel->n + sel->m; + vec_size = ((nbits + 31) >> 5) * (cmph_uint32)sizeof(cmph_uint32); // (nbits + 31) >> 5 = (nbits + 31)/32 + sel_table_size = ((sel->n >> NBITS_STEP_SELECT_TABLE) + 1) * (cmph_uint32)sizeof(cmph_uint32); // (sel->n >> NBITS_STEP_SELECT_TABLE) = (sel->n/STEP_SELECT_TABLE) + + if(sel->bits_vec) + { + free(sel->bits_vec); + } + sel->bits_vec = (cmph_uint32 *)calloc(vec_size/sizeof(cmph_uint32), sizeof(cmph_uint32)); + + if(sel->select_table) + { + free(sel->select_table); + } + sel->select_table = (cmph_uint32 *)calloc(sel_table_size/sizeof(cmph_uint32), sizeof(cmph_uint32)); + + memcpy(sel->bits_vec, buf + pos, vec_size); + pos += vec_size; + memcpy(sel->select_table, buf + pos, sel_table_size); + + DEBUGP("Loaded select structure with size %u bytes\n", buflen); +} + + +/** \fn void select_pack(select_t *sel, void *sel_packed); + * \brief Support the ability to pack a select structure function into a preallocated contiguous memory space pointed by sel_packed. + * \param sel points to the select structure + * \param sel_packed pointer to the contiguous memory area used to store the select structure. The size of sel_packed must be at least @see select_packed_size + */ +void select_pack(select_t *sel, void *sel_packed) +{ + if (sel && sel_packed) + { + char *buf = NULL; + cmph_uint32 buflen = 0; + select_dump(sel, &buf, &buflen); + memcpy(sel_packed, buf, buflen); + free(buf); + } +} + + +/** \fn cmph_uint32 select_packed_size(select_t *sel); + * \brief Return the amount of space needed to pack a select structure. + * \return the size of the packed select structure or zero for failures + */ +cmph_uint32 select_packed_size(select_t *sel) +{ + register cmph_uint32 nbits = sel->n + sel->m; + register cmph_uint32 vec_size = ((nbits + 31) >> 5) * (cmph_uint32)sizeof(cmph_uint32); // (nbits + 31) >> 5 = (nbits + 31)/32 + register cmph_uint32 sel_table_size = ((sel->n >> NBITS_STEP_SELECT_TABLE) + 1) * (cmph_uint32)sizeof(cmph_uint32); // (sel->n >> NBITS_STEP_SELECT_TABLE) = (sel->n/STEP_SELECT_TABLE) + return 2*(cmph_uint32)sizeof(cmph_uint32) + vec_size + sel_table_size; +} + + + +cmph_uint32 select_query_packed(void * sel_packed, cmph_uint32 one_idx) +{ + register cmph_uint32 *ptr = (cmph_uint32 *)sel_packed; + register cmph_uint32 n = *ptr++; + register cmph_uint32 m = *ptr++; + register cmph_uint32 nbits = n + m; + register cmph_uint32 vec_size = (nbits + 31) >> 5; // (nbits + 31) >> 5 = (nbits + 31)/32 + register cmph_uint8 * bits_vec = (cmph_uint8 *)ptr; + register cmph_uint32 * select_table = ptr + vec_size; + + return _select_query(bits_vec, select_table, one_idx); +} + + +cmph_uint32 select_next_query_packed(void * sel_packed, cmph_uint32 vec_bit_idx) +{ + register cmph_uint8 * bits_vec = (cmph_uint8 *)sel_packed; + bits_vec += 8; // skipping n and m + return _select_next_query(bits_vec, vec_bit_idx); +} diff --git a/cmph/select.h b/cmph/select.h new file mode 100644 index 000000000..a31eb0f22 --- /dev/null +++ b/cmph/select.h @@ -0,0 +1,61 @@ +#ifndef __CMPH_SELECT_H__ +#define __CMPH_SELECT_H__ + +#include "cmph_types.h" + +struct _select_t +{ + cmph_uint32 n,m; + cmph_uint32 * bits_vec; + cmph_uint32 * select_table; +}; + +typedef struct _select_t select_t; + +void select_init(select_t * sel); + +void select_destroy(select_t * sel); + +void select_generate(select_t * sel, cmph_uint32 * keys_vec, cmph_uint32 n, cmph_uint32 m); + +cmph_uint32 select_query(select_t * sel, cmph_uint32 one_idx); + +cmph_uint32 select_next_query(select_t * sel, cmph_uint32 vec_bit_idx); + +cmph_uint32 select_get_space_usage(select_t * sel); + +void select_dump(select_t *sel, char **buf, cmph_uint32 *buflen); + +void select_load(select_t * sel, const char *buf, cmph_uint32 buflen); + + +/** \fn void select_pack(select_t *sel, void *sel_packed); + * \brief Support the ability to pack a select structure into a preallocated contiguous memory space pointed by sel_packed. + * \param sel points to the select structure + * \param sel_packed pointer to the contiguous memory area used to store the select structure. The size of sel_packed must be at least @see select_packed_size + */ +void select_pack(select_t *sel, void *sel_packed); + +/** \fn cmph_uint32 select_packed_size(select_t *sel); + * \brief Return the amount of space needed to pack a select structure. + * \return the size of the packed select structure or zero for failures + */ +cmph_uint32 select_packed_size(select_t *sel); + + +/** \fn cmph_uint32 select_query_packed(void * sel_packed, cmph_uint32 one_idx); + * \param sel_packed is a pointer to a contiguous memory area + * \param one_idx is the rank for which we want to calculate the inverse function select + * \return an integer that represents the select value of rank idx. + */ +cmph_uint32 select_query_packed(void * sel_packed, cmph_uint32 one_idx); + + +/** \fn cmph_uint32 select_next_query_packed(void * sel_packed, cmph_uint32 vec_bit_idx); + * \param sel_packed is a pointer to a contiguous memory area + * \param vec_bit_idx is a value prior computed by @see select_query_packed + * \return an integer that represents the next select value greater than @see vec_bit_idx. + */ +cmph_uint32 select_next_query_packed(void * sel_packed, cmph_uint32 vec_bit_idx); + +#endif diff --git a/cmph/select_lookup_tables.h b/cmph/select_lookup_tables.h new file mode 100644 index 000000000..efd595ed5 --- /dev/null +++ b/cmph/select_lookup_tables.h @@ -0,0 +1,170 @@ +#ifndef SELECT_LOOKUP_TABLES +#define SELECT_LOOKUP_TABLES + +#include "cmph_types.h" + +/* +rank_lookup_table[i] simply gives the number of bits set to one in the byte of value i. +For example if i = 01010101 in binary then we have : +rank_lookup_table[i] = 4 +*/ + +static cmph_uint8 rank_lookup_table[256] ={ + 0 , 1 , 1 , 2 , 1 , 2 , 2 , 3 , 1 , 2 , 2 , 3 , 2 , 3 , 3 , 4 +, 1 , 2 , 2 , 3 , 2 , 3 , 3 , 4 , 2 , 3 , 3 , 4 , 3 , 4 , 4 , 5 +, 1 , 2 , 2 , 3 , 2 , 3 , 3 , 4 , 2 , 3 , 3 , 4 , 3 , 4 , 4 , 5 +, 2 , 3 , 3 , 4 , 3 , 4 , 4 , 5 , 3 , 4 , 4 , 5 , 4 , 5 , 5 , 6 +, 1 , 2 , 2 , 3 , 2 , 3 , 3 , 4 , 2 , 3 , 3 , 4 , 3 , 4 , 4 , 5 +, 2 , 3 , 3 , 4 , 3 , 4 , 4 , 5 , 3 , 4 , 4 , 5 , 4 , 5 , 5 , 6 +, 2 , 3 , 3 , 4 , 3 , 4 , 4 , 5 , 3 , 4 , 4 , 5 , 4 , 5 , 5 , 6 +, 3 , 4 , 4 , 5 , 4 , 5 , 5 , 6 , 4 , 5 , 5 , 6 , 5 , 6 , 6 , 7 +, 1 , 2 , 2 , 3 , 2 , 3 , 3 , 4 , 2 , 3 , 3 , 4 , 3 , 4 , 4 , 5 +, 2 , 3 , 3 , 4 , 3 , 4 , 4 , 5 , 3 , 4 , 4 , 5 , 4 , 5 , 5 , 6 +, 2 , 3 , 3 , 4 , 3 , 4 , 4 , 5 , 3 , 4 , 4 , 5 , 4 , 5 , 5 , 6 +, 3 , 4 , 4 , 5 , 4 , 5 , 5 , 6 , 4 , 5 , 5 , 6 , 5 , 6 , 6 , 7 +, 2 , 3 , 3 , 4 , 3 , 4 , 4 , 5 , 3 , 4 , 4 , 5 , 4 , 5 , 5 , 6 +, 3 , 4 , 4 , 5 , 4 , 5 , 5 , 6 , 4 , 5 , 5 , 6 , 5 , 6 , 6 , 7 +, 3 , 4 , 4 , 5 , 4 , 5 , 5 , 6 , 4 , 5 , 5 , 6 , 5 , 6 , 6 , 7 +, 4 , 5 , 5 , 6 , 5 , 6 , 6 , 7 , 5 , 6 , 6 , 7 , 6 , 7 , 7 , 8 + }; + +/* +select_lookup_table[i][j] simply gives the index of the j'th bit set to one in the byte of value i. +For example if i=01010101 in binary then we have : +select_lookup_table[i][0] = 0, the first bit set to one is at position 0 +select_lookup_table[i][1] = 2, the second bit set to one is at position 2 +select_lookup_table[i][2] = 4, the third bit set to one is at position 4 +select_lookup_table[i][3] = 6, the fourth bit set to one is at position 6 +select_lookup_table[i][4] = 255, there is no more than 4 bits set to one in i, so we return escape value 255. +*/ +static cmph_uint8 select_lookup_table[256][8]={ +{ 255 , 255 , 255 , 255 , 255 , 255 , 255 , 255 } , { 0 , 255 , 255 , 255 , 255 , 255 , 255 , 255 } , +{ 1 , 255 , 255 , 255 , 255 , 255 , 255 , 255 } , { 0 , 1 , 255 , 255 , 255 , 255 , 255 , 255 } , +{ 2 , 255 , 255 , 255 , 255 , 255 , 255 , 255 } , { 0 , 2 , 255 , 255 , 255 , 255 , 255 , 255 } , +{ 1 , 2 , 255 , 255 , 255 , 255 , 255 , 255 } , { 0 , 1 , 2 , 255 , 255 , 255 , 255 , 255 } , +{ 3 , 255 , 255 , 255 , 255 , 255 , 255 , 255 } , { 0 , 3 , 255 , 255 , 255 , 255 , 255 , 255 } , +{ 1 , 3 , 255 , 255 , 255 , 255 , 255 , 255 } , { 0 , 1 , 3 , 255 , 255 , 255 , 255 , 255 } , +{ 2 , 3 , 255 , 255 , 255 , 255 , 255 , 255 } , { 0 , 2 , 3 , 255 , 255 , 255 , 255 , 255 } , +{ 1 , 2 , 3 , 255 , 255 , 255 , 255 , 255 } , { 0 , 1 , 2 , 3 , 255 , 255 , 255 , 255 } , +{ 4 , 255 , 255 , 255 , 255 , 255 , 255 , 255 } , { 0 , 4 , 255 , 255 , 255 , 255 , 255 , 255 } , +{ 1 , 4 , 255 , 255 , 255 , 255 , 255 , 255 } , { 0 , 1 , 4 , 255 , 255 , 255 , 255 , 255 } , +{ 2 , 4 , 255 , 255 , 255 , 255 , 255 , 255 } , { 0 , 2 , 4 , 255 , 255 , 255 , 255 , 255 } , +{ 1 , 2 , 4 , 255 , 255 , 255 , 255 , 255 } , { 0 , 1 , 2 , 4 , 255 , 255 , 255 , 255 } , +{ 3 , 4 , 255 , 255 , 255 , 255 , 255 , 255 } , { 0 , 3 , 4 , 255 , 255 , 255 , 255 , 255 } , +{ 1 , 3 , 4 , 255 , 255 , 255 , 255 , 255 } , { 0 , 1 , 3 , 4 , 255 , 255 , 255 , 255 } , +{ 2 , 3 , 4 , 255 , 255 , 255 , 255 , 255 } , { 0 , 2 , 3 , 4 , 255 , 255 , 255 , 255 } , +{ 1 , 2 , 3 , 4 , 255 , 255 , 255 , 255 } , { 0 , 1 , 2 , 3 , 4 , 255 , 255 , 255 } , +{ 5 , 255 , 255 , 255 , 255 , 255 , 255 , 255 } , { 0 , 5 , 255 , 255 , 255 , 255 , 255 , 255 } , +{ 1 , 5 , 255 , 255 , 255 , 255 , 255 , 255 } , { 0 , 1 , 5 , 255 , 255 , 255 , 255 , 255 } , +{ 2 , 5 , 255 , 255 , 255 , 255 , 255 , 255 } , { 0 , 2 , 5 , 255 , 255 , 255 , 255 , 255 } , +{ 1 , 2 , 5 , 255 , 255 , 255 , 255 , 255 } , { 0 , 1 , 2 , 5 , 255 , 255 , 255 , 255 } , +{ 3 , 5 , 255 , 255 , 255 , 255 , 255 , 255 } , { 0 , 3 , 5 , 255 , 255 , 255 , 255 , 255 } , +{ 1 , 3 , 5 , 255 , 255 , 255 , 255 , 255 } , { 0 , 1 , 3 , 5 , 255 , 255 , 255 , 255 } , +{ 2 , 3 , 5 , 255 , 255 , 255 , 255 , 255 } , { 0 , 2 , 3 , 5 , 255 , 255 , 255 , 255 } , +{ 1 , 2 , 3 , 5 , 255 , 255 , 255 , 255 } , { 0 , 1 , 2 , 3 , 5 , 255 , 255 , 255 } , +{ 4 , 5 , 255 , 255 , 255 , 255 , 255 , 255 } , { 0 , 4 , 5 , 255 , 255 , 255 , 255 , 255 } , +{ 1 , 4 , 5 , 255 , 255 , 255 , 255 , 255 } , { 0 , 1 , 4 , 5 , 255 , 255 , 255 , 255 } , +{ 2 , 4 , 5 , 255 , 255 , 255 , 255 , 255 } , { 0 , 2 , 4 , 5 , 255 , 255 , 255 , 255 } , +{ 1 , 2 , 4 , 5 , 255 , 255 , 255 , 255 } , { 0 , 1 , 2 , 4 , 5 , 255 , 255 , 255 } , +{ 3 , 4 , 5 , 255 , 255 , 255 , 255 , 255 } , { 0 , 3 , 4 , 5 , 255 , 255 , 255 , 255 } , +{ 1 , 3 , 4 , 5 , 255 , 255 , 255 , 255 } , { 0 , 1 , 3 , 4 , 5 , 255 , 255 , 255 } , +{ 2 , 3 , 4 , 5 , 255 , 255 , 255 , 255 } , { 0 , 2 , 3 , 4 , 5 , 255 , 255 , 255 } , +{ 1 , 2 , 3 , 4 , 5 , 255 , 255 , 255 } , { 0 , 1 , 2 , 3 , 4 , 5 , 255 , 255 } , +{ 6 , 255 , 255 , 255 , 255 , 255 , 255 , 255 } , { 0 , 6 , 255 , 255 , 255 , 255 , 255 , 255 } , +{ 1 , 6 , 255 , 255 , 255 , 255 , 255 , 255 } , { 0 , 1 , 6 , 255 , 255 , 255 , 255 , 255 } , +{ 2 , 6 , 255 , 255 , 255 , 255 , 255 , 255 } , { 0 , 2 , 6 , 255 , 255 , 255 , 255 , 255 } , +{ 1 , 2 , 6 , 255 , 255 , 255 , 255 , 255 } , { 0 , 1 , 2 , 6 , 255 , 255 , 255 , 255 } , +{ 3 , 6 , 255 , 255 , 255 , 255 , 255 , 255 } , { 0 , 3 , 6 , 255 , 255 , 255 , 255 , 255 } , +{ 1 , 3 , 6 , 255 , 255 , 255 , 255 , 255 } , { 0 , 1 , 3 , 6 , 255 , 255 , 255 , 255 } , +{ 2 , 3 , 6 , 255 , 255 , 255 , 255 , 255 } , { 0 , 2 , 3 , 6 , 255 , 255 , 255 , 255 } , +{ 1 , 2 , 3 , 6 , 255 , 255 , 255 , 255 } , { 0 , 1 , 2 , 3 , 6 , 255 , 255 , 255 } , +{ 4 , 6 , 255 , 255 , 255 , 255 , 255 , 255 } , { 0 , 4 , 6 , 255 , 255 , 255 , 255 , 255 } , +{ 1 , 4 , 6 , 255 , 255 , 255 , 255 , 255 } , { 0 , 1 , 4 , 6 , 255 , 255 , 255 , 255 } , +{ 2 , 4 , 6 , 255 , 255 , 255 , 255 , 255 } , { 0 , 2 , 4 , 6 , 255 , 255 , 255 , 255 } , +{ 1 , 2 , 4 , 6 , 255 , 255 , 255 , 255 } , { 0 , 1 , 2 , 4 , 6 , 255 , 255 , 255 } , +{ 3 , 4 , 6 , 255 , 255 , 255 , 255 , 255 } , { 0 , 3 , 4 , 6 , 255 , 255 , 255 , 255 } , +{ 1 , 3 , 4 , 6 , 255 , 255 , 255 , 255 } , { 0 , 1 , 3 , 4 , 6 , 255 , 255 , 255 } , +{ 2 , 3 , 4 , 6 , 255 , 255 , 255 , 255 } , { 0 , 2 , 3 , 4 , 6 , 255 , 255 , 255 } , +{ 1 , 2 , 3 , 4 , 6 , 255 , 255 , 255 } , { 0 , 1 , 2 , 3 , 4 , 6 , 255 , 255 } , +{ 5 , 6 , 255 , 255 , 255 , 255 , 255 , 255 } , { 0 , 5 , 6 , 255 , 255 , 255 , 255 , 255 } , +{ 1 , 5 , 6 , 255 , 255 , 255 , 255 , 255 } , { 0 , 1 , 5 , 6 , 255 , 255 , 255 , 255 } , +{ 2 , 5 , 6 , 255 , 255 , 255 , 255 , 255 } , { 0 , 2 , 5 , 6 , 255 , 255 , 255 , 255 } , +{ 1 , 2 , 5 , 6 , 255 , 255 , 255 , 255 } , { 0 , 1 , 2 , 5 , 6 , 255 , 255 , 255 } , +{ 3 , 5 , 6 , 255 , 255 , 255 , 255 , 255 } , { 0 , 3 , 5 , 6 , 255 , 255 , 255 , 255 } , +{ 1 , 3 , 5 , 6 , 255 , 255 , 255 , 255 } , { 0 , 1 , 3 , 5 , 6 , 255 , 255 , 255 } , +{ 2 , 3 , 5 , 6 , 255 , 255 , 255 , 255 } , { 0 , 2 , 3 , 5 , 6 , 255 , 255 , 255 } , +{ 1 , 2 , 3 , 5 , 6 , 255 , 255 , 255 } , { 0 , 1 , 2 , 3 , 5 , 6 , 255 , 255 } , +{ 4 , 5 , 6 , 255 , 255 , 255 , 255 , 255 } , { 0 , 4 , 5 , 6 , 255 , 255 , 255 , 255 } , +{ 1 , 4 , 5 , 6 , 255 , 255 , 255 , 255 } , { 0 , 1 , 4 , 5 , 6 , 255 , 255 , 255 } , +{ 2 , 4 , 5 , 6 , 255 , 255 , 255 , 255 } , { 0 , 2 , 4 , 5 , 6 , 255 , 255 , 255 } , +{ 1 , 2 , 4 , 5 , 6 , 255 , 255 , 255 } , { 0 , 1 , 2 , 4 , 5 , 6 , 255 , 255 } , +{ 3 , 4 , 5 , 6 , 255 , 255 , 255 , 255 } , { 0 , 3 , 4 , 5 , 6 , 255 , 255 , 255 } , +{ 1 , 3 , 4 , 5 , 6 , 255 , 255 , 255 } , { 0 , 1 , 3 , 4 , 5 , 6 , 255 , 255 } , +{ 2 , 3 , 4 , 5 , 6 , 255 , 255 , 255 } , { 0 , 2 , 3 , 4 , 5 , 6 , 255 , 255 } , +{ 1 , 2 , 3 , 4 , 5 , 6 , 255 , 255 } , { 0 , 1 , 2 , 3 , 4 , 5 , 6 , 255 } , +{ 7 , 255 , 255 , 255 , 255 , 255 , 255 , 255 } , { 0 , 7 , 255 , 255 , 255 , 255 , 255 , 255 } , +{ 1 , 7 , 255 , 255 , 255 , 255 , 255 , 255 } , { 0 , 1 , 7 , 255 , 255 , 255 , 255 , 255 } , +{ 2 , 7 , 255 , 255 , 255 , 255 , 255 , 255 } , { 0 , 2 , 7 , 255 , 255 , 255 , 255 , 255 } , +{ 1 , 2 , 7 , 255 , 255 , 255 , 255 , 255 } , { 0 , 1 , 2 , 7 , 255 , 255 , 255 , 255 } , +{ 3 , 7 , 255 , 255 , 255 , 255 , 255 , 255 } , { 0 , 3 , 7 , 255 , 255 , 255 , 255 , 255 } , +{ 1 , 3 , 7 , 255 , 255 , 255 , 255 , 255 } , { 0 , 1 , 3 , 7 , 255 , 255 , 255 , 255 } , +{ 2 , 3 , 7 , 255 , 255 , 255 , 255 , 255 } , { 0 , 2 , 3 , 7 , 255 , 255 , 255 , 255 } , +{ 1 , 2 , 3 , 7 , 255 , 255 , 255 , 255 } , { 0 , 1 , 2 , 3 , 7 , 255 , 255 , 255 } , +{ 4 , 7 , 255 , 255 , 255 , 255 , 255 , 255 } , { 0 , 4 , 7 , 255 , 255 , 255 , 255 , 255 } , +{ 1 , 4 , 7 , 255 , 255 , 255 , 255 , 255 } , { 0 , 1 , 4 , 7 , 255 , 255 , 255 , 255 } , +{ 2 , 4 , 7 , 255 , 255 , 255 , 255 , 255 } , { 0 , 2 , 4 , 7 , 255 , 255 , 255 , 255 } , +{ 1 , 2 , 4 , 7 , 255 , 255 , 255 , 255 } , { 0 , 1 , 2 , 4 , 7 , 255 , 255 , 255 } , +{ 3 , 4 , 7 , 255 , 255 , 255 , 255 , 255 } , { 0 , 3 , 4 , 7 , 255 , 255 , 255 , 255 } , +{ 1 , 3 , 4 , 7 , 255 , 255 , 255 , 255 } , { 0 , 1 , 3 , 4 , 7 , 255 , 255 , 255 } , +{ 2 , 3 , 4 , 7 , 255 , 255 , 255 , 255 } , { 0 , 2 , 3 , 4 , 7 , 255 , 255 , 255 } , +{ 1 , 2 , 3 , 4 , 7 , 255 , 255 , 255 } , { 0 , 1 , 2 , 3 , 4 , 7 , 255 , 255 } , +{ 5 , 7 , 255 , 255 , 255 , 255 , 255 , 255 } , { 0 , 5 , 7 , 255 , 255 , 255 , 255 , 255 } , +{ 1 , 5 , 7 , 255 , 255 , 255 , 255 , 255 } , { 0 , 1 , 5 , 7 , 255 , 255 , 255 , 255 } , +{ 2 , 5 , 7 , 255 , 255 , 255 , 255 , 255 } , { 0 , 2 , 5 , 7 , 255 , 255 , 255 , 255 } , +{ 1 , 2 , 5 , 7 , 255 , 255 , 255 , 255 } , { 0 , 1 , 2 , 5 , 7 , 255 , 255 , 255 } , +{ 3 , 5 , 7 , 255 , 255 , 255 , 255 , 255 } , { 0 , 3 , 5 , 7 , 255 , 255 , 255 , 255 } , +{ 1 , 3 , 5 , 7 , 255 , 255 , 255 , 255 } , { 0 , 1 , 3 , 5 , 7 , 255 , 255 , 255 } , +{ 2 , 3 , 5 , 7 , 255 , 255 , 255 , 255 } , { 0 , 2 , 3 , 5 , 7 , 255 , 255 , 255 } , +{ 1 , 2 , 3 , 5 , 7 , 255 , 255 , 255 } , { 0 , 1 , 2 , 3 , 5 , 7 , 255 , 255 } , +{ 4 , 5 , 7 , 255 , 255 , 255 , 255 , 255 } , { 0 , 4 , 5 , 7 , 255 , 255 , 255 , 255 } , +{ 1 , 4 , 5 , 7 , 255 , 255 , 255 , 255 } , { 0 , 1 , 4 , 5 , 7 , 255 , 255 , 255 } , +{ 2 , 4 , 5 , 7 , 255 , 255 , 255 , 255 } , { 0 , 2 , 4 , 5 , 7 , 255 , 255 , 255 } , +{ 1 , 2 , 4 , 5 , 7 , 255 , 255 , 255 } , { 0 , 1 , 2 , 4 , 5 , 7 , 255 , 255 } , +{ 3 , 4 , 5 , 7 , 255 , 255 , 255 , 255 } , { 0 , 3 , 4 , 5 , 7 , 255 , 255 , 255 } , +{ 1 , 3 , 4 , 5 , 7 , 255 , 255 , 255 } , { 0 , 1 , 3 , 4 , 5 , 7 , 255 , 255 } , +{ 2 , 3 , 4 , 5 , 7 , 255 , 255 , 255 } , { 0 , 2 , 3 , 4 , 5 , 7 , 255 , 255 } , +{ 1 , 2 , 3 , 4 , 5 , 7 , 255 , 255 } , { 0 , 1 , 2 , 3 , 4 , 5 , 7 , 255 } , +{ 6 , 7 , 255 , 255 , 255 , 255 , 255 , 255 } , { 0 , 6 , 7 , 255 , 255 , 255 , 255 , 255 } , +{ 1 , 6 , 7 , 255 , 255 , 255 , 255 , 255 } , { 0 , 1 , 6 , 7 , 255 , 255 , 255 , 255 } , +{ 2 , 6 , 7 , 255 , 255 , 255 , 255 , 255 } , { 0 , 2 , 6 , 7 , 255 , 255 , 255 , 255 } , +{ 1 , 2 , 6 , 7 , 255 , 255 , 255 , 255 } , { 0 , 1 , 2 , 6 , 7 , 255 , 255 , 255 } , +{ 3 , 6 , 7 , 255 , 255 , 255 , 255 , 255 } , { 0 , 3 , 6 , 7 , 255 , 255 , 255 , 255 } , +{ 1 , 3 , 6 , 7 , 255 , 255 , 255 , 255 } , { 0 , 1 , 3 , 6 , 7 , 255 , 255 , 255 } , +{ 2 , 3 , 6 , 7 , 255 , 255 , 255 , 255 } , { 0 , 2 , 3 , 6 , 7 , 255 , 255 , 255 } , +{ 1 , 2 , 3 , 6 , 7 , 255 , 255 , 255 } , { 0 , 1 , 2 , 3 , 6 , 7 , 255 , 255 } , +{ 4 , 6 , 7 , 255 , 255 , 255 , 255 , 255 } , { 0 , 4 , 6 , 7 , 255 , 255 , 255 , 255 } , +{ 1 , 4 , 6 , 7 , 255 , 255 , 255 , 255 } , { 0 , 1 , 4 , 6 , 7 , 255 , 255 , 255 } , +{ 2 , 4 , 6 , 7 , 255 , 255 , 255 , 255 } , { 0 , 2 , 4 , 6 , 7 , 255 , 255 , 255 } , +{ 1 , 2 , 4 , 6 , 7 , 255 , 255 , 255 } , { 0 , 1 , 2 , 4 , 6 , 7 , 255 , 255 } , +{ 3 , 4 , 6 , 7 , 255 , 255 , 255 , 255 } , { 0 , 3 , 4 , 6 , 7 , 255 , 255 , 255 } , +{ 1 , 3 , 4 , 6 , 7 , 255 , 255 , 255 } , { 0 , 1 , 3 , 4 , 6 , 7 , 255 , 255 } , +{ 2 , 3 , 4 , 6 , 7 , 255 , 255 , 255 } , { 0 , 2 , 3 , 4 , 6 , 7 , 255 , 255 } , +{ 1 , 2 , 3 , 4 , 6 , 7 , 255 , 255 } , { 0 , 1 , 2 , 3 , 4 , 6 , 7 , 255 } , +{ 5 , 6 , 7 , 255 , 255 , 255 , 255 , 255 } , { 0 , 5 , 6 , 7 , 255 , 255 , 255 , 255 } , +{ 1 , 5 , 6 , 7 , 255 , 255 , 255 , 255 } , { 0 , 1 , 5 , 6 , 7 , 255 , 255 , 255 } , +{ 2 , 5 , 6 , 7 , 255 , 255 , 255 , 255 } , { 0 , 2 , 5 , 6 , 7 , 255 , 255 , 255 } , +{ 1 , 2 , 5 , 6 , 7 , 255 , 255 , 255 } , { 0 , 1 , 2 , 5 , 6 , 7 , 255 , 255 } , +{ 3 , 5 , 6 , 7 , 255 , 255 , 255 , 255 } , { 0 , 3 , 5 , 6 , 7 , 255 , 255 , 255 } , +{ 1 , 3 , 5 , 6 , 7 , 255 , 255 , 255 } , { 0 , 1 , 3 , 5 , 6 , 7 , 255 , 255 } , +{ 2 , 3 , 5 , 6 , 7 , 255 , 255 , 255 } , { 0 , 2 , 3 , 5 , 6 , 7 , 255 , 255 } , +{ 1 , 2 , 3 , 5 , 6 , 7 , 255 , 255 } , { 0 , 1 , 2 , 3 , 5 , 6 , 7 , 255 } , +{ 4 , 5 , 6 , 7 , 255 , 255 , 255 , 255 } , { 0 , 4 , 5 , 6 , 7 , 255 , 255 , 255 } , +{ 1 , 4 , 5 , 6 , 7 , 255 , 255 , 255 } , { 0 , 1 , 4 , 5 , 6 , 7 , 255 , 255 } , +{ 2 , 4 , 5 , 6 , 7 , 255 , 255 , 255 } , { 0 , 2 , 4 , 5 , 6 , 7 , 255 , 255 } , +{ 1 , 2 , 4 , 5 , 6 , 7 , 255 , 255 } , { 0 , 1 , 2 , 4 , 5 , 6 , 7 , 255 } , +{ 3 , 4 , 5 , 6 , 7 , 255 , 255 , 255 } , { 0 , 3 , 4 , 5 , 6 , 7 , 255 , 255 } , +{ 1 , 3 , 4 , 5 , 6 , 7 , 255 , 255 } , { 0 , 1 , 3 , 4 , 5 , 6 , 7 , 255 } , +{ 2 , 3 , 4 , 5 , 6 , 7 , 255 , 255 } , { 0 , 2 , 3 , 4 , 5 , 6 , 7 , 255 } , +{ 1 , 2 , 3 , 4 , 5 , 6 , 7 , 255 } , { 0 , 1 , 2 , 3 , 4 , 5 , 6 , 7 } }; + +#endif diff --git a/cmph/vqueue.c b/cmph/vqueue.c new file mode 100644 index 000000000..0619dd7cc --- /dev/null +++ b/cmph/vqueue.c @@ -0,0 +1,51 @@ +#include "vqueue.h" +#include +#include +#include +struct __vqueue_t +{ + cmph_uint32 * values; + cmph_uint32 beg, end, capacity; +}; + +vqueue_t * vqueue_new(cmph_uint32 capacity) +{ + size_t capacity_plus_one = capacity + 1; + vqueue_t *q = (vqueue_t *)malloc(sizeof(vqueue_t)); + assert(q); + q->values = (cmph_uint32 *)calloc(capacity_plus_one, sizeof(cmph_uint32)); + q->beg = q->end = 0; + q->capacity = (cmph_uint32) capacity_plus_one; + return q; +} + +cmph_uint8 vqueue_is_empty(vqueue_t * q) +{ + return (cmph_uint8)(q->beg == q->end); +} + +void vqueue_insert(vqueue_t * q, cmph_uint32 val) +{ + assert((q->end + 1)%q->capacity != q->beg); // Is queue full? + q->end = (q->end + 1)%q->capacity; + q->values[q->end] = val; +} + +cmph_uint32 vqueue_remove(vqueue_t * q) +{ + assert(!vqueue_is_empty(q)); // Is queue empty? + q->beg = (q->beg + 1)%q->capacity; + return q->values[q->beg]; +} + +void vqueue_print(vqueue_t * q) +{ + cmph_uint32 i; + for (i = q->beg; i != q->end; i = (i + 1)%q->capacity) + fprintf(stderr, "%u\n", q->values[(i + 1)%q->capacity]); +} + +void vqueue_destroy(vqueue_t *q) +{ + free(q->values); q->values = NULL; free(q); +} diff --git a/cmph/vqueue.h b/cmph/vqueue.h new file mode 100644 index 000000000..86fccab68 --- /dev/null +++ b/cmph/vqueue.h @@ -0,0 +1,18 @@ +#ifndef __CMPH_VQUEUE_H__ +#define __CMPH_VQUEUE_H__ + +#include "cmph_types.h" +typedef struct __vqueue_t vqueue_t; + +vqueue_t * vqueue_new(cmph_uint32 capacity); + +cmph_uint8 vqueue_is_empty(vqueue_t * q); + +void vqueue_insert(vqueue_t * q, cmph_uint32 val); + +cmph_uint32 vqueue_remove(vqueue_t * q); + +void vqueue_print(vqueue_t * q); + +void vqueue_destroy(vqueue_t * q); +#endif diff --git a/cmph/vstack.c b/cmph/vstack.c new file mode 100644 index 000000000..24555cd6e --- /dev/null +++ b/cmph/vstack.c @@ -0,0 +1,79 @@ +#include "vstack.h" + +#include +#include + +//#define DEBUG +#include "debug.h" + +struct __vstack_t +{ + cmph_uint32 pointer; + cmph_uint32 *values; + cmph_uint32 capacity; +}; + +vstack_t *vstack_new() +{ + vstack_t *stack = (vstack_t *)malloc(sizeof(vstack_t)); + assert(stack); + stack->pointer = 0; + stack->values = NULL; + stack->capacity = 0; + return stack; +} + +void vstack_destroy(vstack_t *stack) +{ + assert(stack); + free(stack->values); + free(stack); +} + +void vstack_push(vstack_t *stack, cmph_uint32 val) +{ + assert(stack); + vstack_reserve(stack, stack->pointer + 1); + stack->values[stack->pointer] = val; + ++(stack->pointer); +} +void vstack_pop(vstack_t *stack) +{ + assert(stack); + assert(stack->pointer > 0); + --(stack->pointer); +} + +cmph_uint32 vstack_top(vstack_t *stack) +{ + assert(stack); + assert(stack->pointer > 0); + return stack->values[(stack->pointer - 1)]; +} +int vstack_empty(vstack_t *stack) +{ + assert(stack); + return stack->pointer == 0; +} +cmph_uint32 vstack_size(vstack_t *stack) +{ + return stack->pointer; +} +void vstack_reserve(vstack_t *stack, cmph_uint32 size) +{ + assert(stack); + if (stack->capacity < size) + { + cmph_uint32 new_capacity = stack->capacity + 1; + DEBUGP("Increasing current capacity %u to %u\n", stack->capacity, size); + while (new_capacity < size) + { + new_capacity *= 2; + } + stack->values = (cmph_uint32 *)realloc(stack->values, sizeof(cmph_uint32)*new_capacity); + assert(stack->values); + stack->capacity = new_capacity; + DEBUGP("Increased\n"); + } +} + diff --git a/cmph/vstack.h b/cmph/vstack.h new file mode 100644 index 000000000..1cefaaffb --- /dev/null +++ b/cmph/vstack.h @@ -0,0 +1,18 @@ +#ifndef __CMPH_VSTACK_H__ +#define __CMPH_VSTACK_H__ + +#include "cmph_types.h" +typedef struct __vstack_t vstack_t; + +vstack_t *vstack_new(); +void vstack_destroy(vstack_t *stack); + +void vstack_push(vstack_t *stack, cmph_uint32 val); +cmph_uint32 vstack_top(vstack_t *stack); +void vstack_pop(vstack_t *stack); +int vstack_empty(vstack_t *stack); +cmph_uint32 vstack_size(vstack_t *stack); + +void vstack_reserve(vstack_t *stack, cmph_uint32 size); + +#endif diff --git a/cmph/wingetopt.c b/cmph/wingetopt.c new file mode 100644 index 000000000..c981d0f04 --- /dev/null +++ b/cmph/wingetopt.c @@ -0,0 +1,179 @@ +#ifdef WIN32 +/***************************************************************************** + * + * MODULE NAME : GETOPT.C + * + * COPYRIGHTS: + * This module contains code made available by IBM + * Corporation on an AS IS basis. Any one receiving the + * module is considered to be licensed under IBM copyrights + * to use the IBM-provided source code in any way he or she + * deems fit, including copying it, compiling it, modifying + * it, and redistributing it, with or without + * modifications. No license under any IBM patents or + * patent applications is to be implied from this copyright + * license. + * + * A user of the module should understand that IBM cannot + * provide technical support for the module and will not be + * responsible for any consequences of use of the program. + * + * Any notices, including this one, are not to be removed + * from the module without the prior written consent of + * IBM. + * + * AUTHOR: Original author: + * G. R. Blair (BOBBLAIR at AUSVM1) + * Internet: bobblair@bobblair.austin.ibm.com + * + * Extensively revised by: + * John Q. Walker II, Ph.D. (JOHHQ at RALVM6) + * Internet: johnq@ralvm6.vnet.ibm.com + * + *****************************************************************************/ + +/****************************************************************************** + * getopt() + * + * The getopt() function is a command line parser. It returns the next + * option character in argv that matches an option character in opstring. + * + * The argv argument points to an array of argc+1 elements containing argc + * pointers to character strings followed by a null pointer. + * + * The opstring argument points to a string of option characters; if an + * option character is followed by a colon, the option is expected to have + * an argument that may or may not be separated from it by white space. + * The external variable optarg is set to point to the start of the option + * argument on return from getopt(). + * + * The getopt() function places in optind the argv index of the next argument + * to be processed. The system initializes the external variable optind to + * 1 before the first call to getopt(). + * + * When all options have been processed (that is, up to the first nonoption + * argument), getopt() returns EOF. The special option "--" may be used to + * delimit the end of the options; EOF will be returned, and "--" will be + * skipped. + * + * The getopt() function returns a question mark (?) when it encounters an + * option character not included in opstring. This error message can be + * disabled by setting opterr to zero. Otherwise, it returns the option + * character that was detected. + * + * If the special option "--" is detected, or all options have been + * processed, EOF is returned. + * + * Options are marked by either a minus sign (-) or a slash (/). + * + * No errors are defined. + *****************************************************************************/ + +#include /* for EOF */ +#include /* for strchr() */ + +/* static (global) variables that are specified as exported by getopt() */ +extern char *optarg; /* pointer to the start of the option argument */ +extern int optind; /* number of the next argv[] to be evaluated */ +extern int opterr; /* non-zero if a question mark should be returned + when a non-valid option character is detected */ + +/* handle possible future character set concerns by putting this in a macro */ +#define _next_char(string) (char)(*(string+1)) + +int getopt(int argc, char *argv[], char *opstring) +{ + static char *pIndexPosition = NULL; /* place inside current argv string */ + char *pArgString = NULL; /* where to start from next */ + char *pOptString; /* the string in our program */ + + + if (pIndexPosition != NULL) { + /* we last left off inside an argv string */ + if (*(++pIndexPosition)) { + /* there is more to come in the most recent argv */ + pArgString = pIndexPosition; + } + } + + if (pArgString == NULL) { + /* we didn't leave off in the middle of an argv string */ + if (optind >= argc) { + /* more command-line arguments than the argument count */ + pIndexPosition = NULL; /* not in the middle of anything */ + return EOF; /* used up all command-line arguments */ + } + + /*--------------------------------------------------------------------- + * If the next argv[] is not an option, there can be no more options. + *-------------------------------------------------------------------*/ + pArgString = argv[optind++]; /* set this to the next argument ptr */ + + if (('/' != *pArgString) && /* doesn't start with a slash or a dash? */ + ('-' != *pArgString)) { + --optind; /* point to current arg once we're done */ + optarg = NULL; /* no argument follows the option */ + pIndexPosition = NULL; /* not in the middle of anything */ + return EOF; /* used up all the command-line flags */ + } + + /* check for special end-of-flags markers */ + if ((strcmp(pArgString, "-") == 0) || + (strcmp(pArgString, "--") == 0)) { + optarg = NULL; /* no argument follows the option */ + pIndexPosition = NULL; /* not in the middle of anything */ + return EOF; /* encountered the special flag */ + } + + pArgString++; /* look past the / or - */ + } + + if (':' == *pArgString) { /* is it a colon? */ + /*--------------------------------------------------------------------- + * Rare case: if opterr is non-zero, return a question mark; + * otherwise, just return the colon we're on. + *-------------------------------------------------------------------*/ + return (opterr ? (int)'?' : (int)':'); + } + else if ((pOptString = strchr(opstring, *pArgString)) == 0) { + /*--------------------------------------------------------------------- + * The letter on the command-line wasn't any good. + *-------------------------------------------------------------------*/ + optarg = NULL; /* no argument follows the option */ + pIndexPosition = NULL; /* not in the middle of anything */ + return (opterr ? (int)'?' : (int)*pArgString); + } + else { + /*--------------------------------------------------------------------- + * The letter on the command-line matches one we expect to see + *-------------------------------------------------------------------*/ + if (':' == _next_char(pOptString)) { /* is the next letter a colon? */ + /* It is a colon. Look for an argument string. */ + if ('\0' != _next_char(pArgString)) { /* argument in this argv? */ + optarg = &pArgString[1]; /* Yes, it is */ + } + else { + /*------------------------------------------------------------- + * The argument string must be in the next argv. + * But, what if there is none (bad input from the user)? + * In that case, return the letter, and optarg as NULL. + *-----------------------------------------------------------*/ + if (optind < argc) + optarg = argv[optind++]; + else { + optarg = NULL; + return (opterr ? (int)'?' : (int)*pArgString); + } + } + pIndexPosition = NULL; /* not in the middle of anything */ + } + else { + /* it's not a colon, so just return the letter */ + optarg = NULL; /* no argument follows the option */ + pIndexPosition = pArgString; /* point to the letter we're on */ + } + return (int)*pArgString; /* return the letter that matched */ + } +} + +#endif //WIN32 diff --git a/cmph/wingetopt.h b/cmph/wingetopt.h new file mode 100644 index 000000000..9596853d9 --- /dev/null +++ b/cmph/wingetopt.h @@ -0,0 +1,25 @@ +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef WIN32 + #include +#else + #ifndef _GETOPT_ + #define _GETOPT_ + + #include /* for EOF */ + #include /* for strchr() */ + + char *optarg = NULL; /* pointer to the start of the option argument */ + int optind = 1; /* number of the next argv[] to be evaluated */ + int opterr = 1; /* non-zero if a question mark should be returned */ + + int getopt(int argc, char *argv[], char *opstring); + #endif //_GETOPT_ +#endif //WIN32 + +#ifdef __cplusplus +} +#endif +