mirror of
git://git.sv.gnu.org/findutils.git
synced 2026-01-27 12:13:22 +01:00
215 lines
5.8 KiB
C
215 lines
5.8 KiB
C
/* code -- bigram- and front-encode filenames for locate
|
|
Copyright (C) 1994 Free Software Foundation, Inc.
|
|
|
|
This program is free software; you can redistribute it and/or modify
|
|
it under the terms of the GNU General Public License as published by
|
|
the Free Software Foundation; either version 2, or (at your option)
|
|
any later version.
|
|
|
|
This program is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
GNU General Public License for more details.
|
|
|
|
You should have received a copy of the GNU General Public License
|
|
along with this program; if not, write to the Free Software
|
|
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */
|
|
|
|
/* Compress a sorted list.
|
|
Works with `find' to encode a filename database to save space
|
|
and search time.
|
|
|
|
Usage:
|
|
|
|
bigram < file_list > bigrams
|
|
process-bigrams > most_common_bigrams
|
|
code most_common_bigrams < file_list > squeezed_list
|
|
|
|
Uses `front compression' (see ";login:", March 1983, p. 8).
|
|
The output begins with the 128 most common bigrams.
|
|
After that, the output format is, for each line,
|
|
an offset (from the previous line) differential count byte
|
|
followed by a (partially bigram-encoded) ASCII remainder.
|
|
The output lines have no terminating byte; the start of the next line
|
|
is indicated by its first byte having a value <= 30.
|
|
|
|
The encoding of the output bytes is:
|
|
|
|
0-28 likeliest differential counts + offset (14) to make nonnegative
|
|
30 escape code for out-of-range count to follow in next halfword
|
|
128-255 bigram codes (the 128 most common, as determined by `updatedb')
|
|
32-127 single character (printable) ASCII remainder
|
|
|
|
Written by James A. Woods <jwoods@adobe.com>.
|
|
Modified by David MacKenzie <djm@gnu.ai.mit.edu>. */
|
|
|
|
#include <config.h>
|
|
#include <stdio.h>
|
|
#include <sys/types.h>
|
|
|
|
#if defined(HAVE_STRING_H) || defined(STDC_HEADERS)
|
|
#include <string.h>
|
|
#else
|
|
#include <strings.h>
|
|
#endif
|
|
|
|
#ifdef STDC_HEADERS
|
|
#include <stdlib.h>
|
|
#endif
|
|
|
|
#include "locatedb.h"
|
|
|
|
char *xmalloc ();
|
|
|
|
/* The name this program was run with. */
|
|
char *program_name;
|
|
|
|
/* The 128 most common bigrams in the file list, padded with NULs
|
|
if there are fewer. */
|
|
static char bigrams[257] = {0};
|
|
|
|
/* Return the offset of PATTERN in STRING, or -1 if not found. */
|
|
|
|
static int
|
|
strindex (string, pattern)
|
|
char *string, *pattern;
|
|
{
|
|
register char *s;
|
|
|
|
for (s = string; *s != '\0'; s++)
|
|
/* Fast first char check. */
|
|
if (*s == *pattern)
|
|
{
|
|
register char *p2 = pattern + 1, *s2 = s + 1;
|
|
while (*p2 != '\0' && *p2 == *s2)
|
|
p2++, s2++;
|
|
if (*p2 == '\0')
|
|
return s2 - strlen (pattern) - string;
|
|
}
|
|
return -1;
|
|
}
|
|
|
|
/* Return the length of the longest common prefix of strings S1 and S2. */
|
|
|
|
static int
|
|
prefix_length (s1, s2)
|
|
char *s1, *s2;
|
|
{
|
|
register char *start;
|
|
|
|
for (start = s1; *s1 == *s2 && *s1 != '\0'; s1++, s2++)
|
|
;
|
|
return s1 - start;
|
|
}
|
|
|
|
void
|
|
main (argc, argv)
|
|
int argc;
|
|
char **argv;
|
|
{
|
|
char *path; /* The current input entry. */
|
|
char *oldpath; /* The previous input entry. */
|
|
size_t pathsize, oldpathsize; /* Amounts allocated for them. */
|
|
int count, oldcount, diffcount; /* Their prefix lengths & the difference. */
|
|
char bigram[3]; /* Bigram to search for in table. */
|
|
int code; /* Index of `bigram' in bigrams table. */
|
|
FILE *fp; /* Most common bigrams file. */
|
|
int line_len; /* Length of input line. */
|
|
|
|
program_name = argv[0];
|
|
|
|
bigram[2] = '\0';
|
|
|
|
if (argc != 2)
|
|
{
|
|
fprintf (stderr, "Usage: %s most_common_bigrams < list > coded_list\n",
|
|
argv[0]);
|
|
exit (2);
|
|
}
|
|
|
|
fp = fopen (argv[1], "r");
|
|
if (fp == NULL)
|
|
{
|
|
fprintf (stderr, "%s: ", argv[0]);
|
|
perror (argv[1]);
|
|
exit (1);
|
|
}
|
|
|
|
pathsize = oldpathsize = 1026; /* Increased as necessary by getstr. */
|
|
path = xmalloc (pathsize);
|
|
oldpath = xmalloc (oldpathsize);
|
|
|
|
/* Set to anything not starting with a slash, to force the first
|
|
prefix count to 0. */
|
|
strcpy (oldpath, " ");
|
|
oldcount = 0;
|
|
|
|
/* Copy the list of most common bigrams to the output,
|
|
padding with NULs if there are <128 of them. */
|
|
fgets (bigrams, 257, fp);
|
|
fwrite (bigrams, 1, 256, stdout);
|
|
fclose (fp);
|
|
|
|
while ((line_len = getstr (&path, &pathsize, stdin, '\n', 0)) > 0)
|
|
{
|
|
char *pp;
|
|
|
|
path[line_len - 1] = '\0'; /* Remove newline. */
|
|
|
|
/* Squelch unprintable chars in path so as not to botch decoding. */
|
|
for (pp = path; *pp != '\0'; pp++)
|
|
{
|
|
*pp &= 0177;
|
|
if (*pp < 040 || *pp == 0177)
|
|
*pp = '?';
|
|
}
|
|
|
|
count = prefix_length (oldpath, path);
|
|
diffcount = count - oldcount;
|
|
oldcount = count;
|
|
/* If the difference is small, it fits in one byte;
|
|
otherwise, two bytes plus a marker noting that fact. */
|
|
if (diffcount < -LOCATEDB_OLD_OFFSET || diffcount > LOCATEDB_OLD_OFFSET)
|
|
{
|
|
putc (LOCATEDB_OLD_ESCAPE, stdout);
|
|
putw (diffcount + LOCATEDB_OLD_OFFSET, stdout);
|
|
}
|
|
else
|
|
putc (diffcount + LOCATEDB_OLD_OFFSET, stdout);
|
|
|
|
/* Look for bigrams in the remainder of the path. */
|
|
for (pp = path + count; *pp != '\0'; pp += 2)
|
|
{
|
|
if (pp[1] == '\0')
|
|
{
|
|
/* No bigram is possible; only one char is left. */
|
|
putchar (*pp);
|
|
break;
|
|
}
|
|
bigram[0] = *pp;
|
|
bigram[1] = pp[1];
|
|
/* Linear search for specific bigram in string table. */
|
|
code = strindex (bigrams, bigram);
|
|
if (code % 2 == 0)
|
|
putchar ((code / 2) | 0200); /* It's a common bigram. */
|
|
else
|
|
fputs (bigram, stdout); /* Write the text as printable ASCII. */
|
|
}
|
|
|
|
{
|
|
/* Swap path and oldpath and their sizes. */
|
|
char *tmppath = oldpath;
|
|
size_t tmppathsize = oldpathsize;
|
|
oldpath = path;
|
|
oldpathsize = pathsize;
|
|
path = tmppath;
|
|
pathsize = tmppathsize;
|
|
}
|
|
}
|
|
|
|
free (path);
|
|
free (oldpath);
|
|
|
|
exit (0);
|
|
}
|