kusano 7d535a
/* zran.c -- example of zlib/gzip stream indexing and random access
kusano 7d535a
 * Copyright (C) 2005 Mark Adler
kusano 7d535a
 * For conditions of distribution and use, see copyright notice in zlib.h
kusano 7d535a
   Version 1.0  29 May 2005  Mark Adler */
kusano 7d535a
kusano 7d535a
/* Illustrate the use of Z_BLOCK, inflatePrime(), and inflateSetDictionary()
kusano 7d535a
   for random access of a compressed file.  A file containing a zlib or gzip
kusano 7d535a
   stream is provided on the command line.  The compressed stream is decoded in
kusano 7d535a
   its entirety, and an index built with access points about every SPAN bytes
kusano 7d535a
   in the uncompressed output.  The compressed file is left open, and can then
kusano 7d535a
   be read randomly, having to decompress on the average SPAN/2 uncompressed
kusano 7d535a
   bytes before getting to the desired block of data.
kusano 7d535a
kusano 7d535a
   An access point can be created at the start of any deflate block, by saving
kusano 7d535a
   the starting file offset and bit of that block, and the 32K bytes of
kusano 7d535a
   uncompressed data that precede that block.  Also the uncompressed offset of
kusano 7d535a
   that block is saved to provide a referece for locating a desired starting
kusano 7d535a
   point in the uncompressed stream.  build_index() works by decompressing the
kusano 7d535a
   input zlib or gzip stream a block at a time, and at the end of each block
kusano 7d535a
   deciding if enough uncompressed data has gone by to justify the creation of
kusano 7d535a
   a new access point.  If so, that point is saved in a data structure that
kusano 7d535a
   grows as needed to accommodate the points.
kusano 7d535a
kusano 7d535a
   To use the index, an offset in the uncompressed data is provided, for which
kusano 7d535a
   the latest accees point at or preceding that offset is located in the index.
kusano 7d535a
   The input file is positioned to the specified location in the index, and if
kusano 7d535a
   necessary the first few bits of the compressed data is read from the file.
kusano 7d535a
   inflate is initialized with those bits and the 32K of uncompressed data, and
kusano 7d535a
   the decompression then proceeds until the desired offset in the file is
kusano 7d535a
   reached.  Then the decompression continues to read the desired uncompressed
kusano 7d535a
   data from the file.
kusano 7d535a
kusano 7d535a
   Another approach would be to generate the index on demand.  In that case,
kusano 7d535a
   requests for random access reads from the compressed data would try to use
kusano 7d535a
   the index, but if a read far enough past the end of the index is required,
kusano 7d535a
   then further index entries would be generated and added.
kusano 7d535a
kusano 7d535a
   There is some fair bit of overhead to starting inflation for the random
kusano 7d535a
   access, mainly copying the 32K byte dictionary.  So if small pieces of the
kusano 7d535a
   file are being accessed, it would make sense to implement a cache to hold
kusano 7d535a
   some lookahead and avoid many calls to extract() for small lengths.
kusano 7d535a
kusano 7d535a
   Another way to build an index would be to use inflateCopy().  That would
kusano 7d535a
   not be constrained to have access points at block boundaries, but requires
kusano 7d535a
   more memory per access point, and also cannot be saved to file due to the
kusano 7d535a
   use of pointers in the state.  The approach here allows for storage of the
kusano 7d535a
   index in a file.
kusano 7d535a
 */
kusano 7d535a
kusano 7d535a
#include <stdio.h></stdio.h>
kusano 7d535a
#include <stdlib.h></stdlib.h>
kusano 7d535a
#include <string.h></string.h>
kusano 7d535a
#include "zlib.h"
kusano 7d535a
kusano 7d535a
#define local static
kusano 7d535a
kusano 7d535a
#define SPAN 1048576L       /* desired distance between access points */
kusano 7d535a
#define WINSIZE 32768U      /* sliding window size */
kusano 7d535a
#define CHUNK 16384         /* file input buffer size */
kusano 7d535a
kusano 7d535a
/* access point entry */
kusano 7d535a
struct point {
kusano 7d535a
    off_t out;          /* corresponding offset in uncompressed data */
kusano 7d535a
    off_t in;           /* offset in input file of first full byte */
kusano 7d535a
    int bits;           /* number of bits (1-7) from byte at in - 1, or 0 */
kusano 7d535a
    unsigned char window[WINSIZE];  /* preceding 32K of uncompressed data */
kusano 7d535a
};
kusano 7d535a
kusano 7d535a
/* access point list */
kusano 7d535a
struct access {
kusano 7d535a
    int have;           /* number of list entries filled in */
kusano 7d535a
    int size;           /* number of list entries allocated */
kusano 7d535a
    struct point *list; /* allocated list */
kusano 7d535a
};
kusano 7d535a
kusano 7d535a
/* Deallocate an index built by build_index() */
kusano 7d535a
local void free_index(struct access *index)
kusano 7d535a
{
kusano 7d535a
    if (index != NULL) {
kusano 7d535a
        free(index->list);
kusano 7d535a
        free(index);
kusano 7d535a
    }
kusano 7d535a
}
kusano 7d535a
kusano 7d535a
/* Add an entry to the access point list.  If out of memory, deallocate the
kusano 7d535a
   existing list and return NULL. */
kusano 7d535a
local struct access *addpoint(struct access *index, int bits,
kusano 7d535a
    off_t in, off_t out, unsigned left, unsigned char *window)
kusano 7d535a
{
kusano 7d535a
    struct point *next;
kusano 7d535a
kusano 7d535a
    /* if list is empty, create it (start with eight points) */
kusano 7d535a
    if (index == NULL) {
kusano 7d535a
        index = malloc(sizeof(struct access));
kusano 7d535a
        if (index == NULL) return NULL;
kusano 7d535a
        index->list = malloc(sizeof(struct point) << 3);
kusano 7d535a
        if (index->list == NULL) {
kusano 7d535a
            free(index);
kusano 7d535a
            return NULL;
kusano 7d535a
        }
kusano 7d535a
        index->size = 8;
kusano 7d535a
        index->have = 0;
kusano 7d535a
    }
kusano 7d535a
kusano 7d535a
    /* if list is full, make it bigger */
kusano 7d535a
    else if (index->have == index->size) {
kusano 7d535a
        index->size <<= 1;
kusano 7d535a
        next = realloc(index->list, sizeof(struct point) * index->size);
kusano 7d535a
        if (next == NULL) {
kusano 7d535a
            free_index(index);
kusano 7d535a
            return NULL;
kusano 7d535a
        }
kusano 7d535a
        index->list = next;
kusano 7d535a
    }
kusano 7d535a
kusano 7d535a
    /* fill in entry and increment how many we have */
kusano 7d535a
    next = index->list + index->have;
kusano 7d535a
    next->bits = bits;
kusano 7d535a
    next->in = in;
kusano 7d535a
    next->out = out;
kusano 7d535a
    if (left)
kusano 7d535a
        memcpy(next->window, window + WINSIZE - left, left);
kusano 7d535a
    if (left < WINSIZE)
kusano 7d535a
        memcpy(next->window + left, window, WINSIZE - left);
kusano 7d535a
    index->have++;
kusano 7d535a
kusano 7d535a
    /* return list, possibly reallocated */
kusano 7d535a
    return index;
kusano 7d535a
}
kusano 7d535a
kusano 7d535a
/* Make one entire pass through the compressed stream and build an index, with
kusano 7d535a
   access points about every span bytes of uncompressed output -- span is
kusano 7d535a
   chosen to balance the speed of random access against the memory requirements
kusano 7d535a
   of the list, about 32K bytes per access point.  Note that data after the end
kusano 7d535a
   of the first zlib or gzip stream in the file is ignored.  build_index()
kusano 7d535a
   returns the number of access points on success (>= 1), Z_MEM_ERROR for out
kusano 7d535a
   of memory, Z_DATA_ERROR for an error in the input file, or Z_ERRNO for a
kusano 7d535a
   file read error.  On success, *built points to the resulting index. */
kusano 7d535a
local int build_index(FILE *in, off_t span, struct access **built)
kusano 7d535a
{
kusano 7d535a
    int ret;
kusano 7d535a
    off_t totin, totout;        /* our own total counters to avoid 4GB limit */
kusano 7d535a
    off_t last;                 /* totout value of last access point */
kusano 7d535a
    struct access *index;       /* access points being generated */
kusano 7d535a
    z_stream strm;
kusano 7d535a
    unsigned char input[CHUNK];
kusano 7d535a
    unsigned char window[WINSIZE];
kusano 7d535a
kusano 7d535a
    /* initialize inflate */
kusano 7d535a
    strm.zalloc = Z_NULL;
kusano 7d535a
    strm.zfree = Z_NULL;
kusano 7d535a
    strm.opaque = Z_NULL;
kusano 7d535a
    strm.avail_in = 0;
kusano 7d535a
    strm.next_in = Z_NULL;
kusano 7d535a
    ret = inflateInit2(&strm, 47);      /* automatic zlib or gzip decoding */
kusano 7d535a
    if (ret != Z_OK)
kusano 7d535a
        return ret;
kusano 7d535a
kusano 7d535a
    /* inflate the input, maintain a sliding window, and build an index -- this
kusano 7d535a
       also validates the integrity of the compressed data using the check
kusano 7d535a
       information at the end of the gzip or zlib stream */
kusano 7d535a
    totin = totout = last = 0;
kusano 7d535a
    index = NULL;               /* will be allocated by first addpoint() */
kusano 7d535a
    strm.avail_out = 0;
kusano 7d535a
    do {
kusano 7d535a
        /* get some compressed data from input file */
kusano 7d535a
        strm.avail_in = fread(input, 1, CHUNK, in);
kusano 7d535a
        if (ferror(in)) {
kusano 7d535a
            ret = Z_ERRNO;
kusano 7d535a
            goto build_index_error;
kusano 7d535a
        }
kusano 7d535a
        if (strm.avail_in == 0) {
kusano 7d535a
            ret = Z_DATA_ERROR;
kusano 7d535a
            goto build_index_error;
kusano 7d535a
        }
kusano 7d535a
        strm.next_in = input;
kusano 7d535a
kusano 7d535a
        /* process all of that, or until end of stream */
kusano 7d535a
        do {
kusano 7d535a
            /* reset sliding window if necessary */
kusano 7d535a
            if (strm.avail_out == 0) {
kusano 7d535a
                strm.avail_out = WINSIZE;
kusano 7d535a
                strm.next_out = window;
kusano 7d535a
            }
kusano 7d535a
kusano 7d535a
            /* inflate until out of input, output, or at end of block --
kusano 7d535a
               update the total input and output counters */
kusano 7d535a
            totin += strm.avail_in;
kusano 7d535a
            totout += strm.avail_out;
kusano 7d535a
            ret = inflate(&strm, Z_BLOCK);      /* return at end of block */
kusano 7d535a
            totin -= strm.avail_in;
kusano 7d535a
            totout -= strm.avail_out;
kusano 7d535a
            if (ret == Z_NEED_DICT)
kusano 7d535a
                ret = Z_DATA_ERROR;
kusano 7d535a
            if (ret == Z_MEM_ERROR || ret == Z_DATA_ERROR)
kusano 7d535a
                goto build_index_error;
kusano 7d535a
            if (ret == Z_STREAM_END)
kusano 7d535a
                break;
kusano 7d535a
kusano 7d535a
            /* if at end of block, consider adding an index entry (note that if
kusano 7d535a
               data_type indicates an end-of-block, then all of the
kusano 7d535a
               uncompressed data from that block has been delivered, and none
kusano 7d535a
               of the compressed data after that block has been consumed,
kusano 7d535a
               except for up to seven bits) -- the totout == 0 provides an
kusano 7d535a
               entry point after the zlib or gzip header, and assures that the
kusano 7d535a
               index always has at least one access point; we avoid creating an
kusano 7d535a
               access point after the last block by checking bit 6 of data_type
kusano 7d535a
             */
kusano 7d535a
            if ((strm.data_type & 128) && !(strm.data_type & 64) &&
kusano 7d535a
                (totout == 0 || totout - last > span)) {
kusano 7d535a
                index = addpoint(index, strm.data_type & 7, totin,
kusano 7d535a
                                 totout, strm.avail_out, window);
kusano 7d535a
                if (index == NULL) {
kusano 7d535a
                    ret = Z_MEM_ERROR;
kusano 7d535a
                    goto build_index_error;
kusano 7d535a
                }
kusano 7d535a
                last = totout;
kusano 7d535a
            }
kusano 7d535a
        } while (strm.avail_in != 0);
kusano 7d535a
    } while (ret != Z_STREAM_END);
kusano 7d535a
kusano 7d535a
    /* clean up and return index (release unused entries in list) */
kusano 7d535a
    (void)inflateEnd(&strm);
kusano 7d535a
    index = realloc(index, sizeof(struct point) * index->have);
kusano 7d535a
    index->size = index->have;
kusano 7d535a
    *built = index;
kusano 7d535a
    return index->size;
kusano 7d535a
kusano 7d535a
    /* return error */
kusano 7d535a
  build_index_error:
kusano 7d535a
    (void)inflateEnd(&strm);
kusano 7d535a
    if (index != NULL)
kusano 7d535a
        free_index(index);
kusano 7d535a
    return ret;
kusano 7d535a
}
kusano 7d535a
kusano 7d535a
/* Use the index to read len bytes from offset into buf, return bytes read or
kusano 7d535a
   negative for error (Z_DATA_ERROR or Z_MEM_ERROR).  If data is requested past
kusano 7d535a
   the end of the uncompressed data, then extract() will return a value less
kusano 7d535a
   than len, indicating how much as actually read into buf.  This function
kusano 7d535a
   should not return a data error unless the file was modified since the index
kusano 7d535a
   was generated.  extract() may also return Z_ERRNO if there is an error on
kusano 7d535a
   reading or seeking the input file. */
kusano 7d535a
local int extract(FILE *in, struct access *index, off_t offset,
kusano 7d535a
                  unsigned char *buf, int len)
kusano 7d535a
{
kusano 7d535a
    int ret, skip;
kusano 7d535a
    z_stream strm;
kusano 7d535a
    struct point *here;
kusano 7d535a
    unsigned char input[CHUNK];
kusano 7d535a
    unsigned char discard[WINSIZE];
kusano 7d535a
kusano 7d535a
    /* proceed only if something reasonable to do */
kusano 7d535a
    if (len < 0)
kusano 7d535a
        return 0;
kusano 7d535a
kusano 7d535a
    /* find where in stream to start */
kusano 7d535a
    here = index->list;
kusano 7d535a
    ret = index->have;
kusano 7d535a
    while (--ret && here[1].out <= offset)
kusano 7d535a
        here++;
kusano 7d535a
kusano 7d535a
    /* initialize file and inflate state to start there */
kusano 7d535a
    strm.zalloc = Z_NULL;
kusano 7d535a
    strm.zfree = Z_NULL;
kusano 7d535a
    strm.opaque = Z_NULL;
kusano 7d535a
    strm.avail_in = 0;
kusano 7d535a
    strm.next_in = Z_NULL;
kusano 7d535a
    ret = inflateInit2(&strm, -15);         /* raw inflate */
kusano 7d535a
    if (ret != Z_OK)
kusano 7d535a
        return ret;
kusano 7d535a
    ret = fseeko(in, here->in - (here->bits ? 1 : 0), SEEK_SET);
kusano 7d535a
    if (ret == -1)
kusano 7d535a
        goto extract_ret;
kusano 7d535a
    if (here->bits) {
kusano 7d535a
        ret = getc(in);
kusano 7d535a
        if (ret == -1) {
kusano 7d535a
            ret = ferror(in) ? Z_ERRNO : Z_DATA_ERROR;
kusano 7d535a
            goto extract_ret;
kusano 7d535a
        }
kusano 7d535a
        (void)inflatePrime(&strm, here->bits, ret >> (8 - here->bits));
kusano 7d535a
    }
kusano 7d535a
    (void)inflateSetDictionary(&strm, here->window, WINSIZE);
kusano 7d535a
kusano 7d535a
    /* skip uncompressed bytes until offset reached, then satisfy request */
kusano 7d535a
    offset -= here->out;
kusano 7d535a
    strm.avail_in = 0;
kusano 7d535a
    skip = 1;                               /* while skipping to offset */
kusano 7d535a
    do {
kusano 7d535a
        /* define where to put uncompressed data, and how much */
kusano 7d535a
        if (offset == 0 && skip) {          /* at offset now */
kusano 7d535a
            strm.avail_out = len;
kusano 7d535a
            strm.next_out = buf;
kusano 7d535a
            skip = 0;                       /* only do this once */
kusano 7d535a
        }
kusano 7d535a
        if (offset > WINSIZE) {             /* skip WINSIZE bytes */
kusano 7d535a
            strm.avail_out = WINSIZE;
kusano 7d535a
            strm.next_out = discard;
kusano 7d535a
            offset -= WINSIZE;
kusano 7d535a
        }
kusano 7d535a
        else if (offset != 0) {             /* last skip */
kusano 7d535a
            strm.avail_out = (unsigned)offset;
kusano 7d535a
            strm.next_out = discard;
kusano 7d535a
            offset = 0;
kusano 7d535a
        }
kusano 7d535a
kusano 7d535a
        /* uncompress until avail_out filled, or end of stream */
kusano 7d535a
        do {
kusano 7d535a
            if (strm.avail_in == 0) {
kusano 7d535a
                strm.avail_in = fread(input, 1, CHUNK, in);
kusano 7d535a
                if (ferror(in)) {
kusano 7d535a
                    ret = Z_ERRNO;
kusano 7d535a
                    goto extract_ret;
kusano 7d535a
                }
kusano 7d535a
                if (strm.avail_in == 0) {
kusano 7d535a
                    ret = Z_DATA_ERROR;
kusano 7d535a
                    goto extract_ret;
kusano 7d535a
                }
kusano 7d535a
                strm.next_in = input;
kusano 7d535a
            }
kusano 7d535a
            ret = inflate(&strm, Z_NO_FLUSH);       /* normal inflate */
kusano 7d535a
            if (ret == Z_NEED_DICT)
kusano 7d535a
                ret = Z_DATA_ERROR;
kusano 7d535a
            if (ret == Z_MEM_ERROR || ret == Z_DATA_ERROR)
kusano 7d535a
                goto extract_ret;
kusano 7d535a
            if (ret == Z_STREAM_END)
kusano 7d535a
                break;
kusano 7d535a
        } while (strm.avail_out != 0);
kusano 7d535a
kusano 7d535a
        /* if reach end of stream, then don't keep trying to get more */
kusano 7d535a
        if (ret == Z_STREAM_END)
kusano 7d535a
            break;
kusano 7d535a
kusano 7d535a
        /* do until offset reached and requested data read, or stream ends */
kusano 7d535a
    } while (skip);
kusano 7d535a
kusano 7d535a
    /* compute number of uncompressed bytes read after offset */
kusano 7d535a
    ret = skip ? 0 : len - strm.avail_out;
kusano 7d535a
kusano 7d535a
    /* clean up and return bytes read or error */
kusano 7d535a
  extract_ret:
kusano 7d535a
    (void)inflateEnd(&strm);
kusano 7d535a
    return ret;
kusano 7d535a
}
kusano 7d535a
kusano 7d535a
/* Demonstrate the use of build_index() and extract() by processing the file
kusano 7d535a
   provided on the command line, and the extracting 16K from about 2/3rds of
kusano 7d535a
   the way through the uncompressed output, and writing that to stdout. */
kusano 7d535a
int main(int argc, char **argv)
kusano 7d535a
{
kusano 7d535a
    int len;
kusano 7d535a
    off_t offset;
kusano 7d535a
    FILE *in;
kusano 7d535a
    struct access *index = NULL;
kusano 7d535a
    unsigned char buf[CHUNK];
kusano 7d535a
kusano 7d535a
    /* open input file */
kusano 7d535a
    if (argc != 2) {
kusano 7d535a
        fprintf(stderr, "usage: zran file.gz\n");
kusano 7d535a
        return 1;
kusano 7d535a
    }
kusano 7d535a
    in = fopen(argv[1], "rb");
kusano 7d535a
    if (in == NULL) {
kusano 7d535a
        fprintf(stderr, "zran: could not open %s for reading\n", argv[1]);
kusano 7d535a
        return 1;
kusano 7d535a
    }
kusano 7d535a
kusano 7d535a
    /* build index */
kusano 7d535a
    len = build_index(in, SPAN, &index);
kusano 7d535a
    if (len < 0) {
kusano 7d535a
        fclose(in);
kusano 7d535a
        switch (len) {
kusano 7d535a
        case Z_MEM_ERROR:
kusano 7d535a
            fprintf(stderr, "zran: out of memory\n");
kusano 7d535a
            break;
kusano 7d535a
        case Z_DATA_ERROR:
kusano 7d535a
            fprintf(stderr, "zran: compressed data error in %s\n", argv[1]);
kusano 7d535a
            break;
kusano 7d535a
        case Z_ERRNO:
kusano 7d535a
            fprintf(stderr, "zran: read error on %s\n", argv[1]);
kusano 7d535a
            break;
kusano 7d535a
        default:
kusano 7d535a
            fprintf(stderr, "zran: error %d while building index\n", len);
kusano 7d535a
        }
kusano 7d535a
        return 1;
kusano 7d535a
    }
kusano 7d535a
    fprintf(stderr, "zran: built index with %d access points\n", len);
kusano 7d535a
kusano 7d535a
    /* use index by reading some bytes from an arbitrary offset */
kusano 7d535a
    offset = (index->list[index->have - 1].out << 1) / 3;
kusano 7d535a
    len = extract(in, index, offset, buf, CHUNK);
kusano 7d535a
    if (len < 0)
kusano 7d535a
        fprintf(stderr, "zran: extraction failed: %s error\n",
kusano 7d535a
                len == Z_MEM_ERROR ? "out of memory" : "input corrupted");
kusano 7d535a
    else {
kusano 7d535a
        fwrite(buf, 1, len, stdout);
kusano 7d535a
        fprintf(stderr, "zran: extracted %d bytes at %llu\n", len, offset);
kusano 7d535a
    }
kusano 7d535a
kusano 7d535a
    /* clean up and exit */
kusano 7d535a
    free_index(index);
kusano 7d535a
    fclose(in);
kusano 7d535a
    return 0;
kusano 7d535a
}