kusano fc6ab3
/* zran.c -- example of zlib/gzip stream indexing and random access
kusano fc6ab3
 * Copyright (C) 2005, 2012 Mark Adler
kusano fc6ab3
 * For conditions of distribution and use, see copyright notice in zlib.h
kusano fc6ab3
   Version 1.1  29 Sep 2012  Mark Adler */
kusano fc6ab3
kusano fc6ab3
/* Version History:
kusano fc6ab3
 1.0  29 May 2005  First version
kusano fc6ab3
 1.1  29 Sep 2012  Fix memory reallocation error
kusano fc6ab3
 */
kusano fc6ab3
kusano fc6ab3
/* Illustrate the use of Z_BLOCK, inflatePrime(), and inflateSetDictionary()
kusano fc6ab3
   for random access of a compressed file.  A file containing a zlib or gzip
kusano fc6ab3
   stream is provided on the command line.  The compressed stream is decoded in
kusano fc6ab3
   its entirety, and an index built with access points about every SPAN bytes
kusano fc6ab3
   in the uncompressed output.  The compressed file is left open, and can then
kusano fc6ab3
   be read randomly, having to decompress on the average SPAN/2 uncompressed
kusano fc6ab3
   bytes before getting to the desired block of data.
kusano fc6ab3
kusano fc6ab3
   An access point can be created at the start of any deflate block, by saving
kusano fc6ab3
   the starting file offset and bit of that block, and the 32K bytes of
kusano fc6ab3
   uncompressed data that precede that block.  Also the uncompressed offset of
kusano fc6ab3
   that block is saved to provide a referece for locating a desired starting
kusano fc6ab3
   point in the uncompressed stream.  build_index() works by decompressing the
kusano fc6ab3
   input zlib or gzip stream a block at a time, and at the end of each block
kusano fc6ab3
   deciding if enough uncompressed data has gone by to justify the creation of
kusano fc6ab3
   a new access point.  If so, that point is saved in a data structure that
kusano fc6ab3
   grows as needed to accommodate the points.
kusano fc6ab3
kusano fc6ab3
   To use the index, an offset in the uncompressed data is provided, for which
kusano fc6ab3
   the latest accees point at or preceding that offset is located in the index.
kusano fc6ab3
   The input file is positioned to the specified location in the index, and if
kusano fc6ab3
   necessary the first few bits of the compressed data is read from the file.
kusano fc6ab3
   inflate is initialized with those bits and the 32K of uncompressed data, and
kusano fc6ab3
   the decompression then proceeds until the desired offset in the file is
kusano fc6ab3
   reached.  Then the decompression continues to read the desired uncompressed
kusano fc6ab3
   data from the file.
kusano fc6ab3
kusano fc6ab3
   Another approach would be to generate the index on demand.  In that case,
kusano fc6ab3
   requests for random access reads from the compressed data would try to use
kusano fc6ab3
   the index, but if a read far enough past the end of the index is required,
kusano fc6ab3
   then further index entries would be generated and added.
kusano fc6ab3
kusano fc6ab3
   There is some fair bit of overhead to starting inflation for the random
kusano fc6ab3
   access, mainly copying the 32K byte dictionary.  So if small pieces of the
kusano fc6ab3
   file are being accessed, it would make sense to implement a cache to hold
kusano fc6ab3
   some lookahead and avoid many calls to extract() for small lengths.
kusano fc6ab3
kusano fc6ab3
   Another way to build an index would be to use inflateCopy().  That would
kusano fc6ab3
   not be constrained to have access points at block boundaries, but requires
kusano fc6ab3
   more memory per access point, and also cannot be saved to file due to the
kusano fc6ab3
   use of pointers in the state.  The approach here allows for storage of the
kusano fc6ab3
   index in a file.
kusano fc6ab3
 */
kusano fc6ab3
kusano fc6ab3
#include <stdio.h></stdio.h>
kusano fc6ab3
#include <stdlib.h></stdlib.h>
kusano fc6ab3
#include <string.h></string.h>
kusano fc6ab3
#include "zlib.h"
kusano fc6ab3
kusano fc6ab3
#define local static
kusano fc6ab3
kusano fc6ab3
#define SPAN 1048576L       /* desired distance between access points */
kusano fc6ab3
#define WINSIZE 32768U      /* sliding window size */
kusano fc6ab3
#define CHUNK 16384         /* file input buffer size */
kusano fc6ab3
kusano fc6ab3
/* access point entry */
kusano fc6ab3
struct point {
kusano fc6ab3
    off_t out;          /* corresponding offset in uncompressed data */
kusano fc6ab3
    off_t in;           /* offset in input file of first full byte */
kusano fc6ab3
    int bits;           /* number of bits (1-7) from byte at in - 1, or 0 */
kusano fc6ab3
    unsigned char window[WINSIZE];  /* preceding 32K of uncompressed data */
kusano fc6ab3
};
kusano fc6ab3
kusano fc6ab3
/* access point list */
kusano fc6ab3
struct access {
kusano fc6ab3
    int have;           /* number of list entries filled in */
kusano fc6ab3
    int size;           /* number of list entries allocated */
kusano fc6ab3
    struct point *list; /* allocated list */
kusano fc6ab3
};
kusano fc6ab3
kusano fc6ab3
/* Deallocate an index built by build_index() */
kusano fc6ab3
local void free_index(struct access *index)
kusano fc6ab3
{
kusano fc6ab3
    if (index != NULL) {
kusano fc6ab3
        free(index->list);
kusano fc6ab3
        free(index);
kusano fc6ab3
    }
kusano fc6ab3
}
kusano fc6ab3
kusano fc6ab3
/* Add an entry to the access point list.  If out of memory, deallocate the
kusano fc6ab3
   existing list and return NULL. */
kusano fc6ab3
local struct access *addpoint(struct access *index, int bits,
kusano fc6ab3
    off_t in, off_t out, unsigned left, unsigned char *window)
kusano fc6ab3
{
kusano fc6ab3
    struct point *next;
kusano fc6ab3
kusano fc6ab3
    /* if list is empty, create it (start with eight points) */
kusano fc6ab3
    if (index == NULL) {
kusano fc6ab3
        index = malloc(sizeof(struct access));
kusano fc6ab3
        if (index == NULL) return NULL;
kusano fc6ab3
        index->list = malloc(sizeof(struct point) << 3);
kusano fc6ab3
        if (index->list == NULL) {
kusano fc6ab3
            free(index);
kusano fc6ab3
            return NULL;
kusano fc6ab3
        }
kusano fc6ab3
        index->size = 8;
kusano fc6ab3
        index->have = 0;
kusano fc6ab3
    }
kusano fc6ab3
kusano fc6ab3
    /* if list is full, make it bigger */
kusano fc6ab3
    else if (index->have == index->size) {
kusano fc6ab3
        index->size <<= 1;
kusano fc6ab3
        next = realloc(index->list, sizeof(struct point) * index->size);
kusano fc6ab3
        if (next == NULL) {
kusano fc6ab3
            free_index(index);
kusano fc6ab3
            return NULL;
kusano fc6ab3
        }
kusano fc6ab3
        index->list = next;
kusano fc6ab3
    }
kusano fc6ab3
kusano fc6ab3
    /* fill in entry and increment how many we have */
kusano fc6ab3
    next = index->list + index->have;
kusano fc6ab3
    next->bits = bits;
kusano fc6ab3
    next->in = in;
kusano fc6ab3
    next->out = out;
kusano fc6ab3
    if (left)
kusano fc6ab3
        memcpy(next->window, window + WINSIZE - left, left);
kusano fc6ab3
    if (left < WINSIZE)
kusano fc6ab3
        memcpy(next->window + left, window, WINSIZE - left);
kusano fc6ab3
    index->have++;
kusano fc6ab3
kusano fc6ab3
    /* return list, possibly reallocated */
kusano fc6ab3
    return index;
kusano fc6ab3
}
kusano fc6ab3
kusano fc6ab3
/* Make one entire pass through the compressed stream and build an index, with
kusano fc6ab3
   access points about every span bytes of uncompressed output -- span is
kusano fc6ab3
   chosen to balance the speed of random access against the memory requirements
kusano fc6ab3
   of the list, about 32K bytes per access point.  Note that data after the end
kusano fc6ab3
   of the first zlib or gzip stream in the file is ignored.  build_index()
kusano fc6ab3
   returns the number of access points on success (>= 1), Z_MEM_ERROR for out
kusano fc6ab3
   of memory, Z_DATA_ERROR for an error in the input file, or Z_ERRNO for a
kusano fc6ab3
   file read error.  On success, *built points to the resulting index. */
kusano fc6ab3
local int build_index(FILE *in, off_t span, struct access **built)
kusano fc6ab3
{
kusano fc6ab3
    int ret;
kusano fc6ab3
    off_t totin, totout;        /* our own total counters to avoid 4GB limit */
kusano fc6ab3
    off_t last;                 /* totout value of last access point */
kusano fc6ab3
    struct access *index;       /* access points being generated */
kusano fc6ab3
    z_stream strm;
kusano fc6ab3
    unsigned char input[CHUNK];
kusano fc6ab3
    unsigned char window[WINSIZE];
kusano fc6ab3
kusano fc6ab3
    /* initialize inflate */
kusano fc6ab3
    strm.zalloc = Z_NULL;
kusano fc6ab3
    strm.zfree = Z_NULL;
kusano fc6ab3
    strm.opaque = Z_NULL;
kusano fc6ab3
    strm.avail_in = 0;
kusano fc6ab3
    strm.next_in = Z_NULL;
kusano fc6ab3
    ret = inflateInit2(&strm, 47);      /* automatic zlib or gzip decoding */
kusano fc6ab3
    if (ret != Z_OK)
kusano fc6ab3
        return ret;
kusano fc6ab3
kusano fc6ab3
    /* inflate the input, maintain a sliding window, and build an index -- this
kusano fc6ab3
       also validates the integrity of the compressed data using the check
kusano fc6ab3
       information at the end of the gzip or zlib stream */
kusano fc6ab3
    totin = totout = last = 0;
kusano fc6ab3
    index = NULL;               /* will be allocated by first addpoint() */
kusano fc6ab3
    strm.avail_out = 0;
kusano fc6ab3
    do {
kusano fc6ab3
        /* get some compressed data from input file */
kusano fc6ab3
        strm.avail_in = fread(input, 1, CHUNK, in);
kusano fc6ab3
        if (ferror(in)) {
kusano fc6ab3
            ret = Z_ERRNO;
kusano fc6ab3
            goto build_index_error;
kusano fc6ab3
        }
kusano fc6ab3
        if (strm.avail_in == 0) {
kusano fc6ab3
            ret = Z_DATA_ERROR;
kusano fc6ab3
            goto build_index_error;
kusano fc6ab3
        }
kusano fc6ab3
        strm.next_in = input;
kusano fc6ab3
kusano fc6ab3
        /* process all of that, or until end of stream */
kusano fc6ab3
        do {
kusano fc6ab3
            /* reset sliding window if necessary */
kusano fc6ab3
            if (strm.avail_out == 0) {
kusano fc6ab3
                strm.avail_out = WINSIZE;
kusano fc6ab3
                strm.next_out = window;
kusano fc6ab3
            }
kusano fc6ab3
kusano fc6ab3
            /* inflate until out of input, output, or at end of block --
kusano fc6ab3
               update the total input and output counters */
kusano fc6ab3
            totin += strm.avail_in;
kusano fc6ab3
            totout += strm.avail_out;
kusano fc6ab3
            ret = inflate(&strm, Z_BLOCK);      /* return at end of block */
kusano fc6ab3
            totin -= strm.avail_in;
kusano fc6ab3
            totout -= strm.avail_out;
kusano fc6ab3
            if (ret == Z_NEED_DICT)
kusano fc6ab3
                ret = Z_DATA_ERROR;
kusano fc6ab3
            if (ret == Z_MEM_ERROR || ret == Z_DATA_ERROR)
kusano fc6ab3
                goto build_index_error;
kusano fc6ab3
            if (ret == Z_STREAM_END)
kusano fc6ab3
                break;
kusano fc6ab3
kusano fc6ab3
            /* if at end of block, consider adding an index entry (note that if
kusano fc6ab3
               data_type indicates an end-of-block, then all of the
kusano fc6ab3
               uncompressed data from that block has been delivered, and none
kusano fc6ab3
               of the compressed data after that block has been consumed,
kusano fc6ab3
               except for up to seven bits) -- the totout == 0 provides an
kusano fc6ab3
               entry point after the zlib or gzip header, and assures that the
kusano fc6ab3
               index always has at least one access point; we avoid creating an
kusano fc6ab3
               access point after the last block by checking bit 6 of data_type
kusano fc6ab3
             */
kusano fc6ab3
            if ((strm.data_type & 128) && !(strm.data_type & 64) &&
kusano fc6ab3
                (totout == 0 || totout - last > span)) {
kusano fc6ab3
                index = addpoint(index, strm.data_type & 7, totin,
kusano fc6ab3
                                 totout, strm.avail_out, window);
kusano fc6ab3
                if (index == NULL) {
kusano fc6ab3
                    ret = Z_MEM_ERROR;
kusano fc6ab3
                    goto build_index_error;
kusano fc6ab3
                }
kusano fc6ab3
                last = totout;
kusano fc6ab3
            }
kusano fc6ab3
        } while (strm.avail_in != 0);
kusano fc6ab3
    } while (ret != Z_STREAM_END);
kusano fc6ab3
kusano fc6ab3
    /* clean up and return index (release unused entries in list) */
kusano fc6ab3
    (void)inflateEnd(&strm);
kusano fc6ab3
    index->list = realloc(index->list, sizeof(struct point) * index->have);
kusano fc6ab3
    index->size = index->have;
kusano fc6ab3
    *built = index;
kusano fc6ab3
    return index->size;
kusano fc6ab3
kusano fc6ab3
    /* return error */
kusano fc6ab3
  build_index_error:
kusano fc6ab3
    (void)inflateEnd(&strm);
kusano fc6ab3
    if (index != NULL)
kusano fc6ab3
        free_index(index);
kusano fc6ab3
    return ret;
kusano fc6ab3
}
kusano fc6ab3
kusano fc6ab3
/* Use the index to read len bytes from offset into buf, return bytes read or
kusano fc6ab3
   negative for error (Z_DATA_ERROR or Z_MEM_ERROR).  If data is requested past
kusano fc6ab3
   the end of the uncompressed data, then extract() will return a value less
kusano fc6ab3
   than len, indicating how much as actually read into buf.  This function
kusano fc6ab3
   should not return a data error unless the file was modified since the index
kusano fc6ab3
   was generated.  extract() may also return Z_ERRNO if there is an error on
kusano fc6ab3
   reading or seeking the input file. */
kusano fc6ab3
local int extract(FILE *in, struct access *index, off_t offset,
kusano fc6ab3
                  unsigned char *buf, int len)
kusano fc6ab3
{
kusano fc6ab3
    int ret, skip;
kusano fc6ab3
    z_stream strm;
kusano fc6ab3
    struct point *here;
kusano fc6ab3
    unsigned char input[CHUNK];
kusano fc6ab3
    unsigned char discard[WINSIZE];
kusano fc6ab3
kusano fc6ab3
    /* proceed only if something reasonable to do */
kusano fc6ab3
    if (len < 0)
kusano fc6ab3
        return 0;
kusano fc6ab3
kusano fc6ab3
    /* find where in stream to start */
kusano fc6ab3
    here = index->list;
kusano fc6ab3
    ret = index->have;
kusano fc6ab3
    while (--ret && here[1].out <= offset)
kusano fc6ab3
        here++;
kusano fc6ab3
kusano fc6ab3
    /* initialize file and inflate state to start there */
kusano fc6ab3
    strm.zalloc = Z_NULL;
kusano fc6ab3
    strm.zfree = Z_NULL;
kusano fc6ab3
    strm.opaque = Z_NULL;
kusano fc6ab3
    strm.avail_in = 0;
kusano fc6ab3
    strm.next_in = Z_NULL;
kusano fc6ab3
    ret = inflateInit2(&strm, -15);         /* raw inflate */
kusano fc6ab3
    if (ret != Z_OK)
kusano fc6ab3
        return ret;
kusano fc6ab3
    ret = fseeko(in, here->in - (here->bits ? 1 : 0), SEEK_SET);
kusano fc6ab3
    if (ret == -1)
kusano fc6ab3
        goto extract_ret;
kusano fc6ab3
    if (here->bits) {
kusano fc6ab3
        ret = getc(in);
kusano fc6ab3
        if (ret == -1) {
kusano fc6ab3
            ret = ferror(in) ? Z_ERRNO : Z_DATA_ERROR;
kusano fc6ab3
            goto extract_ret;
kusano fc6ab3
        }
kusano fc6ab3
        (void)inflatePrime(&strm, here->bits, ret >> (8 - here->bits));
kusano fc6ab3
    }
kusano fc6ab3
    (void)inflateSetDictionary(&strm, here->window, WINSIZE);
kusano fc6ab3
kusano fc6ab3
    /* skip uncompressed bytes until offset reached, then satisfy request */
kusano fc6ab3
    offset -= here->out;
kusano fc6ab3
    strm.avail_in = 0;
kusano fc6ab3
    skip = 1;                               /* while skipping to offset */
kusano fc6ab3
    do {
kusano fc6ab3
        /* define where to put uncompressed data, and how much */
kusano fc6ab3
        if (offset == 0 && skip) {          /* at offset now */
kusano fc6ab3
            strm.avail_out = len;
kusano fc6ab3
            strm.next_out = buf;
kusano fc6ab3
            skip = 0;                       /* only do this once */
kusano fc6ab3
        }
kusano fc6ab3
        if (offset > WINSIZE) {             /* skip WINSIZE bytes */
kusano fc6ab3
            strm.avail_out = WINSIZE;
kusano fc6ab3
            strm.next_out = discard;
kusano fc6ab3
            offset -= WINSIZE;
kusano fc6ab3
        }
kusano fc6ab3
        else if (offset != 0) {             /* last skip */
kusano fc6ab3
            strm.avail_out = (unsigned)offset;
kusano fc6ab3
            strm.next_out = discard;
kusano fc6ab3
            offset = 0;
kusano fc6ab3
        }
kusano fc6ab3
kusano fc6ab3
        /* uncompress until avail_out filled, or end of stream */
kusano fc6ab3
        do {
kusano fc6ab3
            if (strm.avail_in == 0) {
kusano fc6ab3
                strm.avail_in = fread(input, 1, CHUNK, in);
kusano fc6ab3
                if (ferror(in)) {
kusano fc6ab3
                    ret = Z_ERRNO;
kusano fc6ab3
                    goto extract_ret;
kusano fc6ab3
                }
kusano fc6ab3
                if (strm.avail_in == 0) {
kusano fc6ab3
                    ret = Z_DATA_ERROR;
kusano fc6ab3
                    goto extract_ret;
kusano fc6ab3
                }
kusano fc6ab3
                strm.next_in = input;
kusano fc6ab3
            }
kusano fc6ab3
            ret = inflate(&strm, Z_NO_FLUSH);       /* normal inflate */
kusano fc6ab3
            if (ret == Z_NEED_DICT)
kusano fc6ab3
                ret = Z_DATA_ERROR;
kusano fc6ab3
            if (ret == Z_MEM_ERROR || ret == Z_DATA_ERROR)
kusano fc6ab3
                goto extract_ret;
kusano fc6ab3
            if (ret == Z_STREAM_END)
kusano fc6ab3
                break;
kusano fc6ab3
        } while (strm.avail_out != 0);
kusano fc6ab3
kusano fc6ab3
        /* if reach end of stream, then don't keep trying to get more */
kusano fc6ab3
        if (ret == Z_STREAM_END)
kusano fc6ab3
            break;
kusano fc6ab3
kusano fc6ab3
        /* do until offset reached and requested data read, or stream ends */
kusano fc6ab3
    } while (skip);
kusano fc6ab3
kusano fc6ab3
    /* compute number of uncompressed bytes read after offset */
kusano fc6ab3
    ret = skip ? 0 : len - strm.avail_out;
kusano fc6ab3
kusano fc6ab3
    /* clean up and return bytes read or error */
kusano fc6ab3
  extract_ret:
kusano fc6ab3
    (void)inflateEnd(&strm);
kusano fc6ab3
    return ret;
kusano fc6ab3
}
kusano fc6ab3
kusano fc6ab3
/* Demonstrate the use of build_index() and extract() by processing the file
kusano fc6ab3
   provided on the command line, and the extracting 16K from about 2/3rds of
kusano fc6ab3
   the way through the uncompressed output, and writing that to stdout. */
kusano fc6ab3
int main(int argc, char **argv)
kusano fc6ab3
{
kusano fc6ab3
    int len;
kusano fc6ab3
    off_t offset;
kusano fc6ab3
    FILE *in;
kusano fc6ab3
    struct access *index = NULL;
kusano fc6ab3
    unsigned char buf[CHUNK];
kusano fc6ab3
kusano fc6ab3
    /* open input file */
kusano fc6ab3
    if (argc != 2) {
kusano fc6ab3
        fprintf(stderr, "usage: zran file.gz\n");
kusano fc6ab3
        return 1;
kusano fc6ab3
    }
kusano fc6ab3
    in = fopen(argv[1], "rb");
kusano fc6ab3
    if (in == NULL) {
kusano fc6ab3
        fprintf(stderr, "zran: could not open %s for reading\n", argv[1]);
kusano fc6ab3
        return 1;
kusano fc6ab3
    }
kusano fc6ab3
kusano fc6ab3
    /* build index */
kusano fc6ab3
    len = build_index(in, SPAN, &index);
kusano fc6ab3
    if (len < 0) {
kusano fc6ab3
        fclose(in);
kusano fc6ab3
        switch (len) {
kusano fc6ab3
        case Z_MEM_ERROR:
kusano fc6ab3
            fprintf(stderr, "zran: out of memory\n");
kusano fc6ab3
            break;
kusano fc6ab3
        case Z_DATA_ERROR:
kusano fc6ab3
            fprintf(stderr, "zran: compressed data error in %s\n", argv[1]);
kusano fc6ab3
            break;
kusano fc6ab3
        case Z_ERRNO:
kusano fc6ab3
            fprintf(stderr, "zran: read error on %s\n", argv[1]);
kusano fc6ab3
            break;
kusano fc6ab3
        default:
kusano fc6ab3
            fprintf(stderr, "zran: error %d while building index\n", len);
kusano fc6ab3
        }
kusano fc6ab3
        return 1;
kusano fc6ab3
    }
kusano fc6ab3
    fprintf(stderr, "zran: built index with %d access points\n", len);
kusano fc6ab3
kusano fc6ab3
    /* use index by reading some bytes from an arbitrary offset */
kusano fc6ab3
    offset = (index->list[index->have - 1].out << 1) / 3;
kusano fc6ab3
    len = extract(in, index, offset, buf, CHUNK);
kusano fc6ab3
    if (len < 0)
kusano fc6ab3
        fprintf(stderr, "zran: extraction failed: %s error\n",
kusano fc6ab3
                len == Z_MEM_ERROR ? "out of memory" : "input corrupted");
kusano fc6ab3
    else {
kusano fc6ab3
        fwrite(buf, 1, len, stdout);
kusano fc6ab3
        fprintf(stderr, "zran: extracted %d bytes at %llu\n", len, offset);
kusano fc6ab3
    }
kusano fc6ab3
kusano fc6ab3
    /* clean up and exit */
kusano fc6ab3
    free_index(index);
kusano fc6ab3
    fclose(in);
kusano fc6ab3
    return 0;
kusano fc6ab3
}