Advertisement
Bisqwit

gunzip.hh

Nov 21st, 2015
2,116
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C++ 9.40 KB | None | 0 0
  1. /* My tiny gzip decompressor without using zlib. - Joel Yliluoma
  2.  * http://iki.fi/bisqwit/ , http://youtube.com/user/Bisqwit
  3.  * Inspired and influenced by a 13th IOCCC winner program by Ron McFarland */
  4. /* Fun fact: Contains zero new/delete, and no STL data structures */
  5. #include <assert.h>
  6.  
  7. // This is the public method declared (later) in this file.
  8. template<typename InputFunctor, typename OutputFunctor>
  9. void Deflate(InputFunctor&& input, OutputFunctor&& output);
  10.  
  11. // The rest of the file is just for the curious about implementation.
  12. namespace gunzip_ns
  13. {
  14.     template<typename N, typename U=unsigned short>
  15.     struct PoolType
  16.     {
  17.         N* const ptr; // Pointer to the beginning of storage for nodes
  18.         U used;       // How many nodes have been allocated
  19.  
  20.         void clear()      { used=0; }
  21.         N* get(U n) const { return n ? &ptr[n-1] : nullptr; }
  22.         N* allocate(U& n) { return n ? get(n) : get(n = ++used)->Reset(); }
  23.     };
  24.  
  25.     struct huffnode
  26.     {
  27.         unsigned short branch[2], value,_;
  28.  
  29.         huffnode* Reset() { branch[0]=0; branch[1]=0; return this; }
  30.  
  31.         // Create a huffman tree for num_values, with given lengths.
  32.         // The tree will be put in branch[which]; other branch not touched.
  33.         // Length = how many bits to use for encoding this value.
  34.         void Create(unsigned which, unsigned num_values,
  35.                     const unsigned char lengths[], PoolType<huffnode>& pool)
  36.         {
  37.             branch[which] = 0;
  38.             unsigned count[16] { 0 }, offs[16] { 0 };
  39.             for(unsigned a = 0; a < num_values; ++a) count[lengths[a]] += 1;
  40.             for(unsigned a = 0; a < 15; a++) offs[a + 1] = (offs[a] + count[a]) * 2u;
  41.             // Largest values seen in wild for offs[16]: 16777216
  42.             for(unsigned value = 0; value < num_values; ++value)
  43.                 if(unsigned length = lengths[value])
  44.                 {
  45.                     huffnode* node = pool.allocate(branch[which]);
  46.                     for(unsigned q = offs[lengths[value]]++; length--; )
  47.                         node = pool.allocate(node->branch[ (q >> length) & 1 ]);
  48.                     node->value = value;
  49.                 }
  50.         }
  51.     };
  52.  
  53.     // Length codes: base + number of bits (0-28)
  54.     // Distance codes: base + number of bits (0-29)
  55.     // Order of lengths for dynamic block decoding (0-18)
  56.     constexpr unsigned short
  57.         dbase[30] { 1,2,3,4,5,7,9,13,17,25,33,49,65,97,129,193,257,385,513,769,1025,1537,2049,3073,4097,6145,8193 };
  58.     constexpr unsigned char
  59.         lbase[29] { 0,1,2,3,4,5,6,7,8,10,12,14,16,20,24,28,32,40,48,56,64,80,96,112,128,160,192,224,255 }, // +3
  60.         lbits[29] { 0,0,0,0,0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,0 },
  61.         dbits[30] { 0,0,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13 },
  62.         order[19] { 16,17,18,0, 8,7,9,6, 10,5,11,4, 12,3,13,2, 14,1,15 },
  63.         fix[15] { 7,32,4, 8,35,2, 8,0,20, 9,18,14, 5,36,4 };
  64. }
  65.  
  66. template<typename InputFunctor, typename OutputFunctor>
  67. void Deflate(InputFunctor&& input, OutputFunctor&& output)
  68. {
  69.     using namespace gunzip_ns;
  70.     // Bit-by-bit input routine
  71.     struct { unsigned long Cache; unsigned Count,_; } Bits {0,0,0};
  72.     auto GetBits = [&Bits,&input](unsigned l)
  73.     {
  74.         for (; Bits.Count < l; Bits.Count += 8)
  75.             Bits.Cache |= (input() & 0xFFul) << Bits.Count;
  76.         unsigned w = Bits.Cache & ((1ul << l) - 1);
  77.         Bits.Cache >>= l;
  78.         Bits.Count -= l;
  79.         return w;
  80.     };
  81.  
  82.     // Read stream header
  83.     unsigned header = GetBits(16), winsize = 32768u;
  84.     // ^ Read deflate header: method[4] ; winsize[4] ; checksum[8]
  85.     if(header == 0x8B1F) // Is it actually a gzip header?
  86.     {
  87.         unsigned fmt = GetBits(8), o = GetBits(8); // Get format & flags
  88.         assert(fmt == 8);
  89.         GetBits(48); // MTIME (3 of 4); MTIME(1 of 4), XFL and OS
  90.         if(o&4) // Skip extra fields as indicated by FEXTRA
  91.             for(unsigned q = GetBits(16); q--; GetBits(8));
  92.         if(o&8)  while(GetBits(8)) {} // NAME: Skip filename if FNAME was present
  93.         if(o&16) while(GetBits(8)) {} // COMMENT: Skip comment if FCOMMENT was present
  94.         if(o&2)  GetBits(16);         // HCRC: Skip FCRC if was present
  95.     }
  96.     else // No. Deflate header?
  97.     {
  98.         winsize = 256 << ((header >> 4) & 0xF);
  99.         assert((header & 0x200F) == 8 && !((((header<<8)+(header>>8))&0xFFFF)%31) && winsize <= 32768u);
  100.     }
  101.  
  102.     // Output routine. Also updates the window for backward references.
  103.     unsigned char WindowData[32768u/*winsize*/], Lengths[288+32]; // Lengths are in 0..15 range
  104.     struct { unsigned Head,SizeMask; unsigned char* const Data; } Window {0,winsize-1, WindowData};
  105.     auto Put = [&Window,&output](unsigned char l) { output(Window.Data[Window.Head++ & Window.SizeMask] = l); };
  106.  
  107.     // Function for reading a huffman-encoded value from bitstream.
  108.     constexpr unsigned pool1_reserved=638, pool2_reserved=642;
  109.     huffnode tables_fixed={{0,0},0,0}, tables_dyn, pool[pool1_reserved+pool2_reserved], *cur_table;
  110.     PoolType<huffnode> pool1{&pool[pool2_reserved],0}, pool2{&pool[0],0}, *cur_pool;
  111.     auto Read = [&GetBits](const huffnode& root, unsigned which, const PoolType<huffnode>& pool)
  112.     {
  113.         const auto* node = pool.get(root.branch[which]);
  114.         while(node->branch[1]) node = pool.get(node->branch[GetBits(1)]);
  115.         return node->value;
  116.     };
  117.  
  118.     // Read compressed blocks
  119.     for(bool last=false; !last; )
  120.     {
  121.         switch(last=GetBits(1), header=GetBits(2))
  122.         {
  123.         case 0: // copy stored block data
  124.             GetBits(Bits.Count%8); // Go to byte boundary (discard a few bits)
  125.             {unsigned a = GetBits(16), b = GetBits(16);
  126.             assert(a == (b^0xFFFF));
  127.             // Note: It is valid for "a" to be 0 here.
  128.             // It is sometimes used for aligning the stream to byte boundary.
  129.             while(a--) Put(GetBits(8)); }
  130.             continue;
  131.         case 1: // fixed block
  132.             if(!tables_fixed.branch[0])
  133.             {
  134.                 // Create fixed tables. Technically, this table could be generated
  135.                 // at compile-time, but I don't see an easy way to do it.
  136.                 for(auto n=0u; n<sizeof fix; )
  137.                     for(unsigned what=fix[n++], p=8*fix[n++], end=p+8*fix[n++]; p<end; ++p)
  138.                          Lengths[p] = what;
  139.                 // Note: fix[] intentionally contains overlaps. It is designed
  140.                 // such that clang generates rather optimal SSE assembly code.
  141.                 tables_fixed.Create(0, 288, Lengths,     pool1); // 575 used here
  142.                 tables_fixed.Create(1, 32,  Lengths+288, pool1); // 63 used here
  143.                 assert(pool1.used == pool1_reserved);
  144.                 // ^pool1 has always 638 elements. If this assertion fails,
  145.                 // something is wrong. Maybe coincidentally same as (288+32-1)*2.
  146.             }
  147.             cur_table = &tables_fixed; cur_pool = &pool1;
  148.             break;
  149.         default: // dynamic block
  150.             assert(header==2);
  151.             unsigned nlen = GetBits(5) + 257; // 257..288
  152.             unsigned ndist = GetBits(5) + 1; // 1..32
  153.             unsigned ncode = GetBits(4) + 4; // 4..19
  154.             assert(nlen+ndist <= 288+32);
  155.             for(unsigned a=0; a<19; ++a) Lengths[order[a]] = a<ncode ? GetBits(3) : 0;
  156.             pool2.clear();
  157.             tables_dyn.Create(0, 19, Lengths+0, pool2); // length-lengths
  158.             assert(pool2.used <= pool2_reserved);
  159.             for(unsigned end,lencode,prev_lencode=0, code=0; code < nlen + ndist; )
  160.             {
  161.                 switch(lencode = Read(tables_dyn, 0, pool2))
  162.                 {
  163.                     case 16: end = code+3+GetBits(2); lencode = prev_lencode; break;
  164.                     case 17: end = code+3+GetBits(3); lencode = 0; break;
  165.                     case 18: end = code+11+GetBits(7); lencode = 0; break;
  166.                     default: end = code+1; prev_lencode = lencode; assert(lencode < 16);
  167.                 }
  168.                 assert(end <= nlen+ndist);
  169.                 while(code < end) Lengths[code++] = lencode;
  170.             }
  171.             pool2.clear();
  172.             tables_dyn.Create(0, nlen,  Lengths+0,    pool2);
  173.             tables_dyn.Create(1, ndist, Lengths+nlen, pool2);
  174.             assert(pool2.used <= pool2_reserved);
  175.             // ^If this assertion fails, simply increase pool2_reserved.
  176.             //  Try e.g. 2048-pool1_reserved.
  177.             // So far the largest value seen in the wild is 630.
  178.  
  179.             //fprintf(stderr, "pool2 size%5u\n", pool2.used);
  180.             cur_table = &tables_dyn; cur_pool = &pool2;
  181.         }
  182.         // Do actual deflating.
  183.         for(;;)
  184.         {
  185.             auto code = Read(*cur_table, 0, *cur_pool);
  186.             if(!(code & -256)) Put(code); // 0..255: literal byte
  187.             else if(!(code & 0xFF)) break; // 256=end
  188.             else // 257..285: backwards reference
  189.             {
  190.                 unsigned length = (lbase-257)[code] + GetBits((lbits-257)[code]) + 3;
  191.                 code = Read(*cur_table, 1, *cur_pool); // Read distance code
  192.                 unsigned distance = dbase[code] + GetBits(dbits[code]);
  193.                 unsigned offs = Window.Head - distance;
  194.                 do Put(Window.Data[offs++ & Window.SizeMask]); while(--length);
  195.             }
  196.         }
  197.     }
  198.     // Note: after this, may come a checksum, and a trailer. Ignoring them.
  199. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement