diff -u -r pigz-2.1.6.orig/pigz.c pigz-2.1.6/pigz.c
--- pigz-2.1.6.orig/pigz.c	2010-01-17 21:09:37.000000000 +0100
+++ pigz-2.1.6/pigz.c	2011-06-02 12:29:12.245689087 +0200
@@ -304,6 +304,11 @@
         } \
     } while (0)
 
+/* dictzip extra field layout: a fixed-size header followed by one 4-byte
+   little-endian compressed length per chunk */
+#define DZIP_HEADER_LEN 24
+#define DZIP_ENTRY_LEN 4
+
 /* globals (modified by main thread only when it's the only thread) */
 local int ind;              /* input file descriptor */
 local int outd;             /* output file descriptor */
@@ -322,11 +327,12 @@
 local int list;             /* true to list files instead of compress */
 local int first = 1;        /* true if we need to print listing header */
 local int decode;           /* 0 to compress, 1 to decompress, 2 to test */
+local int dictzip;          /* true to write random access dictionary */
 local int level;            /* compression level */
 local int rsync;            /* true for rsync blocking */
 local int procs;            /* maximum number of compression threads (>= 1) */
 local int dict;             /* true to initialize dictionary in each thread */
-local size_t size;          /* uncompressed input size per thread (>= 32K) */
+size_t size;                /* uncompressed input size per thread (>= 32K) */
 
 /* saved gzip/zip header data for decompression, testing, and listing */
 local time_t stamp;         /* time stamp from gzip header */
@@ -560,6 +566,44 @@
     return dos;
 }
 
+/* return the size in bytes of the regular file open on fd, or -1 if the size
+   cannot be determined -- fstat() is used rather than lseek() so that the
+   file offset, which any concurrent reader of fd shares, is never moved */
+long long get_file_size(int fd)
+{
+    struct stat st;
+
+    if (fstat(fd, &st) != 0 || (st.st_mode & S_IFMT) != S_IFREG)
+        return -1;
+    return (long long)st.st_size;
+}
+
+/* return the length in bytes of the dictzip extra field (including the two
+   XLEN bytes) for an input of file_len bytes at the current chunk size, or
+   -1 if file_len is negative (unknown) -- computed in long long so that a
+   large input cannot overflow the result */
+local long long dictzip_extra_len(long long file_len)
+{
+    if (file_len < 0)
+        return -1;
+    return DZIP_HEADER_LEN +
+           (file_len / (long long)size + 2) * DZIP_ENTRY_LEN;
+}
+
+/* return the dictzip extra field length for the current input file, bailing
+   out if the input size is unknown or the field would overflow the 16-bit
+   XLEN of the gzip header */
+local size_t dictzip_field_len(void)
+{
+    long long extra_len = dictzip_extra_len(get_file_size(ind));
+
+    if (extra_len < 0)
+        bail("cannot determine input size for dictzip\n", "");
+    if (extra_len >= (1L << 16) - 1)
+        bail("file too long for dictzip\n", "");
+    return (size_t)extra_len;
+}
+
 /* put a 4-byte integer into a byte array in LSB order or MSB order */
 #define PUT2L(a,b) (*(a)=(b)&0xff,(a)[1]=(b)>>8)
 #define PUT4L(a,b) (PUT2L(a,(b)&0xffff),PUT2L((a)+2,(b)>>16))
@@ -613,12 +657,29 @@
         head[0] = 31;
         head[1] = 139;
         head[2] = 8;                /* deflate */
-        head[3] = name != NULL ? 8 : 0;
+        head[3] = 0;
+        if (dictzip)
+            head[3] |= 4;
+        if (name != NULL)
+            head[3] |= 8;
         PUT4L(head + 4, mtime);
         head[8] = level == 9 ? 2 : (level == 1 ? 4 : 0);
         head[9] = 3;                /* unix */
         writen(outd, head, 10);
         len = 10;
+        if (dictzip) {
+            /* reserve zeroed space for the extra field; write_thread() seeks
+               back and overwrites it once the chunk lengths are known */
+            size_t extra_len = dictzip_field_len();
+            unsigned char *extra = malloc(extra_len);
+
+            if (extra == NULL)
+                bail("not enough memory", "");
+            memset(extra, 0, extra_len);
+            writen(outd, extra, extra_len);
+            free(extra);
+            len += extra_len;
+        }
         if (name != NULL)
             writen(outd, (unsigned char *)name, strlen(name) + 1);
         if (name != NULL)
@@ -1165,6 +1226,10 @@
     unsigned long ulen;             /* total uncompressed size (overflow ok) */
     unsigned long clen;             /* total compressed size (overflow ok) */
     unsigned long check;            /* check value of uncompressed data */
+    unsigned char *extra = NULL;    /* dictzip extra header field */
+    size_t extra_len = 0;           /* length of extra, including XLEN */
+    size_t dz_cur = 0;              /* number of chunk lengths recorded */
+    long long file_len = 0;         /* uncompressed input size for dictzip */
 
     (void)dummy;
 
@@ -1172,6 +1237,26 @@
     Trace(("-- write thread running"));
     head = put_header();
 
+    if (dictzip) {
+        file_len = get_file_size(ind);
+        extra_len = dictzip_field_len();
+        extra = malloc(extra_len);
+        if (extra == NULL)
+            bail("not enough memory", "");
+        memset(extra, 0, extra_len);
+
+        /* extra length (XLEN does not count its own two bytes) */
+        PUT2L(extra, extra_len - 2);
+        /* Random Access ID */
+        extra[2] = 'R';
+        extra[3] = 'A';
+        /* NOTE(review): bytes 4-5 are left zero here; RFC 1952 defines them
+           as the subfield LEN -- confirm what the reader expects */
+        /* RA version 99 */
+        /* XXX need to standardize! */
+        PUT2L(extra + 6, 99);
+    }
+
     /* process output of compress threads until end of input */
     ulen = clen = 0;
     check = CHECK(0L, Z_NULL, 0);
@@ -1191,6 +1276,19 @@
         ulen += (unsigned long)len;
         clen += (unsigned long)(job->out->len);
 
+        if (dictzip) {
+            /* record this chunk's compressed length in the next table slot */
+            size_t slot = DZIP_HEADER_LEN + dz_cur * DZIP_ENTRY_LEN;
+
+            if (slot + DZIP_ENTRY_LEN > extra_len) {
+                fprintf(stderr, " %lu > %lld\n", (unsigned long)dz_cur,
+                        file_len / (long long)size);
+                bail("input file too large\n", "");
+            }
+            PUT4L(extra + slot, job->out->len);
+            dz_cur++;
+        }
+
         /* write the compressed data and drop the output buffer */
         Trace(("-- writing #%ld", seq));
         writen(outd, job->out->buf, job->out->len);
@@ -1215,6 +1313,24 @@
     /* write trailer */
     put_trailer(ulen, clen, check, head);
 
+    if (dictzip) {
+        /* chunk length */
+        PUT4L(extra + 8, size);
+        /* chunk count */
+        PUT4L(extra + 12, dz_cur);
+        /* Target file size (64 bit) */
+        PUT4L(extra + 16, file_len);
+        PUT4L(extra + 20, file_len >> 32);
+        /* write extra header over the placeholder left by put_header() */
+        if (lseek(outd, 10, SEEK_SET) != 10)
+            bail("couldn't seek in output file\n", "");
+        writen(outd, extra, extra_len);
+        free(extra);
+        /* return to the end so that anything written later is appended */
+        if (lseek(outd, 0, SEEK_END) < 0)
+            bail("couldn't seek in output file\n", "");
+    }
+
     /* verify no more jobs, prepare for next use */
     possess(compress_have);
     assert(compress_head == NULL && peek_lock(compress_have) == 0);
@@ -2625,6 +2741,17 @@
         /* prepare gzip header information for compression */
         name = headis & 1 ? justname(in) : NULL;
         mtime = headis & 2 ? st.st_mtime : 0;
+
+        /* grow the chunk size until the dictzip chunk table fits in the
+           16-bit length of the gzip extra field */
+        if (dictzip) {
+            long long file_len = get_file_size(ind);
+
+            if (file_len < 0)
+                bail("cannot determine input size for dictzip\n", "");
+            while (dictzip_extra_len(file_len) >= (1L << 16) - 1)
+                size += 1 << 10;
+        }
     }
     SET_BINARY_MODE(ind);
 
@@ -2678,6 +2805,8 @@
         out = malloc(strlen("<stdout>") + 1);
         if (out == NULL)
             bail("not enough memory", "");
+        if (dictzip)
+            bail("need to have a seekable output for dictzip\n", "");
         strcpy(out, "<stdout>");
         outd = 1;
         if (!decode && !force && isatty(outd))
@@ -2796,6 +2925,7 @@
 "  -i, --independent    Compress blocks independently for damage recovery",
 "  -R, --rsyncable      Input-determined block locations for rsync",
 "  -d, --decompress     Decompress the compressed input",
+"  -e, --dictzip        Write dictzip random seek information in gzip header",
 "  -t, --test           Test the integrity of the compressed input",
 "  -l, --list           List the contents of the compressed input",
 "  -f, --force          Force overwrite, compress .gz, links, and to terminal",
@@ -2868,17 +2998,18 @@
     force = 0;                      /* don't overwrite, don't compress links */
     recurse = 0;                    /* don't go into directories */
     form = 0;                       /* use gzip format */
+    dictzip = 0;                    /* don't write dictzip information */
 }
 
 /* long options conversion to short options */
 local char *longopts[][2] = {
     {"LZW", "Z"}, {"ascii", "a"}, {"best", "9"}, {"bits", "Z"},
-    {"blocksize", "b"}, {"decompress", "d"}, {"fast", "1"}, {"force", "f"},
-    {"help", "h"}, {"independent", "i"}, {"keep", "k"}, {"license", "L"},
-    {"list", "l"}, {"name", "N"}, {"no-name", "n"}, {"no-time", "T"},
-    {"processes", "p"}, {"quiet", "q"}, {"recursive", "r"}, {"rsyncable", "R"},
-    {"silent", "q"}, {"stdout", "c"}, {"suffix", "S"}, {"test", "t"},
-    {"to-stdout", "c"}, {"uncompress", "d"}, {"verbose", "v"},
+    {"blocksize", "b"}, {"decompress", "d"}, {"dictzip", "e"}, {"fast", "1"},
+    {"force", "f"}, {"help", "h"}, {"independent", "i"}, {"keep", "k"},
+    {"license", "L"}, {"list", "l"}, {"name", "N"}, {"no-name", "n"},
+    {"no-time", "T"}, {"processes", "p"}, {"quiet", "q"}, {"recursive", "r"},
+    {"rsyncable", "R"}, {"silent", "q"}, {"stdout", "c"}, {"suffix", "S"},
+    {"test", "t"}, {"to-stdout", "c"}, {"uncompress", "d"}, {"verbose", "v"},
     {"version", "V"}, {"zip", "K"}, {"zlib", "z"}};
 #define NLOPTS (sizeof(longopts) / (sizeof(char *) << 1))
 
@@ -2984,6 +3115,7 @@
             case 'b':  get = 1;  break;
             case 'c':  pipeout = 1;  break;
             case 'd':  decode = 1;  headis = 0;  break;
+            case 'e':  dictzip = 1;  dict = 0;  break;
             case 'f':  force = 1;  break;
             case 'h':  help();  break;
             case 'i':  dict = 0;  break;
@@ -3112,6 +3244,15 @@
                 fprintf(stderr, "warning: output is concatenated zip files ");
                 fprintf(stderr, "-- pigz will not be able to extract\n");
             }
+
+            /* dictzip sanity check: the random access table lives in the
+               gzip extra field, so it cannot be written for other formats */
+            if (dictzip && form != 0) {
+                fprintf(stderr, "warning: dictzip only works on gzip files");
+                fprintf(stderr, " -- ignoring --dictzip\n");
+                dictzip = 0;
+            }
+
             process(strcmp(argv[n], "-") ? argv[n] : NULL);
             done++;
         }