pigz/pigz-dictzip.patch

diff -u -r pigz-2.1.6.orig/pigz.c pigz-2.1.6/pigz.c
--- pigz-2.1.6.orig/pigz.c	2010-01-17 21:09:37.000000000 +0100
+++ pigz-2.1.6/pigz.c	2011-06-02 12:29:12.245689087 +0200
@@ -304,6 +304,8 @@
         } \
     } while (0)

+#define DZIP_HEADER_LEN    24  /* bytes of fixed fields ahead of the chunk-size table */
+
 /* globals (modified by main thread only when it's the only thread) */
 local int ind;              /* input file descriptor */
 local int outd;             /* output file descriptor */
@@ -322,11 +324,12 @@
 local int list;             /* true to list files instead of compress */
 local int first = 1;        /* true if we need to print listing header */
 local int decode;           /* 0 to compress, 1 to decompress, 2 to test */
+local int dictzip;          /* true to write random access dictionary */
 local int level;            /* compression level */
 local int rsync;            /* true for rsync blocking */
 local int procs;            /* maximum number of compression threads (>= 1) */
 local int dict;             /* true to initialize dictionary in each thread */
-local size_t size;          /* uncompressed input size per thread (>= 32K) */
+size_t size;          /* uncompressed input size per thread (>= 32K); NOTE(review): `local` was dropped here -- confirm external linkage is wanted */

 /* saved gzip/zip header data for decompression, testing, and listing */
 local time_t stamp;                 /* time stamp from gzip header */
@@ -560,6 +563,17 @@
     return dos;
 }

+long long get_file_size(int fd)
+{
+    struct stat st;
+
+    /* size of the regular file on fd, or -1; fstat() leaves the file offset
+       untouched, so a thread reading fd concurrently is not disturbed */
+    if (fstat(fd, &st) != 0 || !S_ISREG(st.st_mode))
+        return -1;
+    return (long long)st.st_size;
+}
+
 /* put a 4-byte integer into a byte array in LSB order or MSB order */
 #define PUT2L(a,b) (*(a)=(b)&0xff,(a)[1]=(b)>>8)
 #define PUT4L(a,b) (PUT2L(a,(b)&0xffff),PUT2L((a)+2,(b)>>16))
@@ -613,12 +627,27 @@
         head[0] = 31;
         head[1] = 139;
         head[2] = 8;                /* deflate */
-        head[3] = name != NULL ? 8 : 0;
+        head[3] = 0;                /* gzip FLG byte, built up below */
+        if (dictzip)
+            head[3] |= 4;           /* FEXTRA: extra field follows */
+        if (name != NULL)
+            head[3] |= 8;           /* FNAME: file name follows */
         PUT4L(head + 4, mtime);
         head[8] = level == 9 ? 2 : (level == 1 ? 4 : 0);
         head[9] = 3;                /* unix */
         writen(outd, head, 10);
         len = 10;
+        if (dictzip) {
+            /* zeroed placeholder for the dictzip extra field, filled in later */
+            long long file_len = get_file_size(ind);
+            int extra_len = DZIP_HEADER_LEN + ((file_len / size) + 2) * sizeof(int);
+            char *extra = calloc(1, extra_len);
+            if (extra == NULL)
+                bail("not enough memory", "");
+            writen(outd, extra, extra_len);
+            free(extra);
+            len += extra_len;
+        }
         if (name != NULL)
             writen(outd, (unsigned char *)name, strlen(name) + 1);
         if (name != NULL)
@@ -1165,6 +1194,10 @@
     unsigned long ulen;             /* total uncompressed size (overflow ok) */
     unsigned long clen;             /* total compressed size (overflow ok) */
     unsigned long check;            /* check value of uncompressed data */
+    char *extra = NULL;             /* dictzip extra header field */
+    int dz_cur;                     /* index of next chunk-size slot in extra */
+    long long file_len = get_file_size(ind);    /* input size in bytes */
+    int extra_len = DZIP_HEADER_LEN + ((file_len / size) + 2) * sizeof(int);

     (void)dummy;

@@ -1172,6 +1205,25 @@
     Trace(("-- write thread running"));
     head = put_header();

+    if (dictzip) {
+        /* check the length before using it: the field length is stored in 16
+           bits, and an overflowed (negative) extra_len is caught here too */
+        if (extra_len >= ((1ULL << 16) - 1))
+            bail("file too long for dictzip\n", "");
+        extra = (char*)malloc(extra_len + 32);
+        if (extra == NULL)
+            bail("not enough memory", "");
+        memset(extra, 0, extra_len);
+        /* extra length: number of bytes following this 2-byte field */
+        PUT2L(extra, extra_len - 2);
+        /* Random Access ID */
+        extra[2] = 'R';
+        extra[3] = 'A';
+        /* RA version 99 (XXX need to standardize!) */
+        PUT2L(extra + 6, 99);
+        dz_cur = 0;
+    }
+
     /* process output of compress threads until end of input */
     ulen = clen = 0;
     check = CHECK(0L, Z_NULL, 0);
@@ -1191,6 +1243,15 @@
         ulen += (unsigned long)len;
         clen += (unsigned long)(job->out->len);

+        if (dictzip) {              /* record this block's compressed length */
+            if (((dz_cur * 4) + DZIP_HEADER_LEN + sizeof(int)) > extra_len) {
+                fprintf(stderr, " %d > %lld\n", dz_cur, (long long)(file_len / size));
+                bail ("input file too large\n", "");
+            }
+            PUT4L(extra + (dz_cur * 4) + DZIP_HEADER_LEN, job->out->len);
+            dz_cur++;
+        }
+
         /* write the compressed data and drop the output buffer */
         Trace(("-- writing #%ld", seq));
         writen(outd, job->out->buf, job->out->len);
@@ -1215,6 +1276,22 @@
     /* write trailer */
     put_trailer(ulen, clen, check, head);

+    if (dictzip) {
+        /* chunk length: the uncompressed input size handed to each thread */
+        PUT4L(extra + 8, size);
+        /* chunk count: number of compressed blocks actually recorded */
+        PUT4L(extra + 12, dz_cur);
+        /* (the count is dz_cur, not the (file_len / size) + 1 estimate) */
+        /* Target file size (64 bit), low 32 bits first */
+        PUT4L(extra + 16, file_len);
+        PUT4L(extra + 20, file_len >> 32);
+        /* write extra header at offset 10, just past the fixed gzip header */
+        if (lseek(outd, 10, SEEK_SET) != 10)
+            bail("couldn't seek in output file\n", "");
+        writen(outd, extra, extra_len);
+        free(extra);
+    }
+
     /* verify no more jobs, prepare for next use */
     possess(compress_have);
     assert(compress_head == NULL && peek_lock(compress_have) == 0);
@@ -2625,6 +2702,18 @@
         /* prepare gzip header information for compression */
         name = headis & 1 ? justname(in) : NULL;
         mtime = headis & 2 ? st.st_mtime : 0;
+
+        /* dictzip: grow the chunk size in 1K steps until the extra field fits */
+        if (dictzip) {
+            long long in_bytes = get_file_size(ind);
+            int need;
+            for (;;) {
+                need = DZIP_HEADER_LEN + ((in_bytes / size) + 2) * sizeof(int);
+                if (need < ((1 << 16) - 1))
+                    break;
+                size += 1 << 10;
+            }
+        }
     }
     SET_BINARY_MODE(ind);

@@ -2678,6 +2767,8 @@
         out = malloc(strlen("<stdout>") + 1);
         if (out == NULL)
             bail("not enough memory", "");
+        if (dictzip)                /* header is patched via lseek() later */
+            bail("need to have a seekable output for dictzip\n", "");
         strcpy(out, "<stdout>");
         outd = 1;
         if (!decode && !force && isatty(outd))
@@ -2796,6 +2887,7 @@
 "  -i, --independent    Compress blocks independently for damage recovery",
 "  -R, --rsyncable      Input-determined block locations for rsync",
 "  -d, --decompress     Decompress the compressed input",
+"  -e, --dictzip        Write dictzip random seek information in gzip header",
 "  -t, --test           Test the integrity of the compressed input",
 "  -l, --list           List the contents of the compressed input",
 "  -f, --force          Force overwrite, compress .gz, links, and to terminal",
@@ -2868,17 +2960,18 @@
     force = 0;                      /* don't overwrite, don't compress links */
     recurse = 0;                    /* don't go into directories */
     form = 0;                       /* use gzip format */
+    dictzip = 0;                    /* don't write dictzip information */
 }

 /* long options conversion to short options */
 local char *longopts[][2] = {
     {"LZW", "Z"}, {"ascii", "a"}, {"best", "9"}, {"bits", "Z"},
-    {"blocksize", "b"}, {"decompress", "d"}, {"fast", "1"}, {"force", "f"},
-    {"help", "h"}, {"independent", "i"}, {"keep", "k"}, {"license", "L"},
-    {"list", "l"}, {"name", "N"}, {"no-name", "n"}, {"no-time", "T"},
-    {"processes", "p"}, {"quiet", "q"}, {"recursive", "r"}, {"rsyncable", "R"},
-    {"silent", "q"}, {"stdout", "c"}, {"suffix", "S"}, {"test", "t"},
-    {"to-stdout", "c"}, {"uncompress", "d"}, {"verbose", "v"},
+    {"blocksize", "b"}, {"decompress", "d"}, {"dictzip", "e"}, {"fast", "1"},
+    {"force", "f"}, {"help", "h"}, {"independent", "i"}, {"keep", "k"},
+    {"license", "L"}, {"list", "l"}, {"name", "N"}, {"no-name", "n"},
+    {"no-time", "T"}, {"processes", "p"}, {"quiet", "q"}, {"recursive", "r"},
+    {"rsyncable", "R"}, {"silent", "q"}, {"stdout", "c"}, {"suffix", "S"},
+    {"test", "t"}, {"to-stdout", "c"}, {"uncompress", "d"}, {"verbose", "v"},
     {"version", "V"}, {"zip", "K"}, {"zlib", "z"}};
 #define NLOPTS (sizeof(longopts) / (sizeof(char *) << 1))

@@ -2984,6 +3077,7 @@
             case 'b':  get = 1;  break;
             case 'c':  pipeout = 1;  break;
             case 'd':  decode = 1;  headis = 0;  break;
+            case 'e':  dictzip = 1; dict = 0;  break;  /* also clears dict, as -i does */
             case 'f':  force = 1;  break;
             case 'h':  help();  break;
             case 'i':  dict = 0;  break;
@@ -3112,6 +3206,19 @@
                 fprintf(stderr, "warning: output is concatenated zip files ");
                 fprintf(stderr, "-- pigz will not be able to extract\n");
             }
+
+            /* dictzip needs the gzip format (form 0): warn and turn it off */
+            if (dictzip && form != 0) {
+                fprintf(stderr, "warning: dictzip only works on gzip files\n");
+                dictzip = 0;
+            }
+#if 0
+            if (dictzip && (size >= (64 << 10))) {
+                fprintf(stderr, "warning: dictzip needs chunks < 64k.\n");
+                size = (63 << 10);
+            }
+#endif
+
             process(strcmp(argv[n], "-") ? argv[n] : NULL);
             done++;
         }