diff options
Diffstat (limited to 'kviewshell/plugins/djvu/libdjvu/BSByteStream.h')
-rw-r--r-- | kviewshell/plugins/djvu/libdjvu/BSByteStream.h | 275 |
1 files changed, 275 insertions, 0 deletions
diff --git a/kviewshell/plugins/djvu/libdjvu/BSByteStream.h b/kviewshell/plugins/djvu/libdjvu/BSByteStream.h new file mode 100644 index 00000000..6a985cdf --- /dev/null +++ b/kviewshell/plugins/djvu/libdjvu/BSByteStream.h @@ -0,0 +1,275 @@ +//C- -*- C++ -*- +//C- ------------------------------------------------------------------- +//C- DjVuLibre-3.5 +//C- Copyright (c) 2002 Leon Bottou and Yann Le Cun. +//C- Copyright (c) 2001 AT&T +//C- +//C- This software is subject to, and may be distributed under, the +//C- GNU General Public License, Version 2. The license should have +//C- accompanied the software or you may obtain a copy of the license +//C- from the Free Software Foundation at http://www.fsf.org . +//C- +//C- This program is distributed in the hope that it will be useful, +//C- but WITHOUT ANY WARRANTY; without even the implied warranty of +//C- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +//C- GNU General Public License for more details. +//C- +//C- DjVuLibre-3.5 is derived from the DjVu(r) Reference Library +//C- distributed by Lizardtech Software. On July 19th 2002, Lizardtech +//C- Software authorized us to replace the original DjVu(r) Reference +//C- Library notice by the following text (see doc/lizard2002.djvu): +//C- +//C- ------------------------------------------------------------------ +//C- | DjVu (r) Reference Library (v. 3.5) +//C- | Copyright (c) 1999-2001 LizardTech, Inc. All Rights Reserved. +//C- | The DjVu Reference Library is protected by U.S. Pat. No. +//C- | 6,058,214 and patents pending. +//C- | +//C- | This software is subject to, and may be distributed under, the +//C- | GNU General Public License, Version 2. The license should have +//C- | accompanied the software or you may obtain a copy of the license +//C- | from the Free Software Foundation at http://www.fsf.org . +//C- | +//C- | The computer code originally released by LizardTech under this +//C- | license and unmodified by other parties is deemed "the LIZARDTECH +//C- | ORIGINAL CODE." Subject to any third party intellectual property +//C- | claims, LizardTech grants recipient a worldwide, royalty-free, +//C- | non-exclusive license to make, use, sell, or otherwise dispose of +//C- | the LIZARDTECH ORIGINAL CODE or of programs derived from the +//C- | LIZARDTECH ORIGINAL CODE in compliance with the terms of the GNU +//C- | General Public License. This grant only confers the right to +//C- | infringe patent claims underlying the LIZARDTECH ORIGINAL CODE to +//C- | the extent such infringement is reasonably necessary to enable +//C- | recipient to make, have made, practice, sell, or otherwise dispose +//C- | of the LIZARDTECH ORIGINAL CODE (or portions thereof) and not to +//C- | any greater extent that may be necessary to utilize further +//C- | modifications or combinations. +//C- | +//C- | The LIZARDTECH ORIGINAL CODE is provided "AS IS" WITHOUT WARRANTY +//C- | OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +//C- | TO ANY WARRANTY OF NON-INFRINGEMENT, OR ANY IMPLIED WARRANTY OF +//C- | MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. +//C- +------------------------------------------------------------------ +// +// $Id: BSByteStream.h,v 1.8 2003/11/07 22:08:20 leonb Exp $ +// $Name: release_3_5_15 $ + +#ifndef _BSBYTESTREAM_H +#define _BSBYTESTREAM_H +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif +#if NEED_GNUG_PRAGMAS +# pragma interface +#endif + +/** @name BSByteStream.h + + Files #"BSByteStream.h"# and #"BSByteStream.cpp"# implement a very compact + general purpose compressor based on the Burrows-Wheeler transform. The + utility program \Ref{bzz} provides a front-end for this class. Although + this compression model is not currently used in DjVu files, it may be used + in the future for encoding textual data chunks. + + {\bf Algorithms} --- The Burrows-Wheeler transform (also named Block-Sorting) + is performed using a combination of the Karp-Miller-Rosenberg and the + Bentley-Sedgewick algorithms. This is comparable to (Sadakane, DCC 98) + with a slightly more flexible ranking scheme. Symbols are then ordered + according to a running estimate of their occurrence frequencies. The + symbol ranks are then coded using a simple fixed tree and the + \Ref{ZPCodec} binary adaptive coder. + + {\bf Performances} --- The basic algorithm is mostly similar to those + implemented in well known compressors like #bzip# or #bzip2# + (\URL{http://www.muraroa.demon.co.uk}). The adaptive binary coder however + generates small differences. The adaptation noise may cost up to 5\% in + file size, but this penalty is usually offset by the benefits of + adaptation. This is good when processing large and highly structured + files like spreadsheet files. Compression and decompression speed is + about twice slower than #bzip2# but the sorting algorithms is more + robust. Unlike #bzip2# (as of August 1998), this code can compress half a + megabyte of "abababab...." in bounded time. + + Here are some comparative results (in bits per character) obtained on the + Canterbury Corpus (\URL{http://corpus.canterbury.ac.nz}) as of August + 1998. The BSByteStream performance on the single spreadsheet file #Excl# + moves #bzz#'s weighted average ahead of much more sophisticated methods, + like Suzanne Bunton's #fsmxBest# system + \URL{http://corpus.canterbury.ac.nz/methodinfo/fsmx.html}. This result + will not last very long. + + {\footnotesize + \begin{tabular}{lccccccccccccc} + & text & fax & Csrc & Excl & SPRC & tech + & poem & html & lisp & man & play & Weighted & Average \\ + compress + & 3.27 & 0.97 & 3.56 & 2.41 & 4.21 & 3.06 + & 3.38 & 3.68 & 3.90 & 4.43 & 3.51 + & 2.55 & 3.31 \\ + gzip -9 + & 2.85 & 0.82 & 2.24 & 1.63 & 2.67 & 2.71 + & 3.23 & 2.59 & 2.65 & 3.31 & 3.12 + & 2.08 & 2.53 \\ + bzip2 -9 + & 2.27 & 0.78 & 2.18 & 1.01 & 2.70 & 2.02 + & 2.42 & 2.48 & 2.79 & 3.33 & 2.53 + & 1.54 & 2.23 \\ + ppmd + & 2.31 & 0.99 & 2.11 & 1.08 & 2.68 & 2.19 + & 2.48 & 2.38 & 2.43 & 3.00 & 2.53 + & 1.65 & 2.20 \\ + fsmx + & {\bf 2.10} & 0.79 & {\bf 1.89} & 1.48 & {\bf 2.52} & {\bf 1.84} + & {\bf 2.21} & {\bf 2.24} & {\bf 2.29} & {\bf 2.91} & {\bf 2.35} + & 1.63 & {\bf 2.06} \\ + {\bf bzz} + & 2.25 & {\bf 0.76} & 2.13 & {\bf 0.78} & 2.67 & 2.00 + & 2.40 & 2.52 & 2.60 & 3.19 & 2.52 + & {\bf 1.44} & 2.16 + \end{tabular} + } + + Note that the DjVu people have several entries in this table. Program + #compress# was written some time ago by Joe Orost + (\URL{http://www.research.att.com/info/orost}). The #ppmc# method, (a + precursor of #ppmd#) was created by Paul Howard + (\URL{http://www.research.att.com/info/pgh}). The #bzz# program is just + below your eyes. + + @author + L\'eon Bottou <leonb@research.att.com> -- Initial implementation\\ + Andrei Erofeev <eaf@geocities.com> -- Improved Block Sorting algorithm. + @memo + Simple Burrows-Wheeler general purpose compressor. + @version + #$Id: BSByteStream.h,v 1.8 2003/11/07 22:08:20 leonb Exp $# */ +//@{ + + +#include "ByteStream.h" +#include "GException.h" +#include "ZPCodec.h" + + +#ifdef HAVE_NAMESPACES +namespace DJVU { +# ifdef NOT_DEFINED // Just to fool emacs c++ mode +} +#endif +#endif + + +/** Performs bzz compression/decompression. + + Class #BSByteStream# defines a \Ref{ByteStream} which transparently + performs the BZZ compression/decompression. The constructor of class + \Ref{BSByteStream} takes another \Ref{ByteStream} as argument. Any data + written to the BSByteStream is compressed and written to this second + ByteStream. Any data read from the BSByteStream is internally generated by + decompressing data read from the second ByteStream. + + Program \Ref{bzz} demonstrates how to use this class. All the hard work + is achieved by a simple ByteStream to ByteStream copy, as shown below. + \begin{verbatim} + GP<ByteStream> in=ByteStream::create(infile,"rb"); + GP<ByteStream> out=ByteStream::create(outfile,"wb"); + if (encoding) { + BSByteStream bsb(out, blocksize); + bsb.copy(*in); + } else { + BSByteStream bsb(in); + out->copy(bsb); + } + \end{verbatim} + Due to the block oriented nature of the Burrows-Wheeler transform, there + is a very significant latency between the data input and the data output. + You can use function #flush# to force data output at the expense of + compression efficiency. + + You should never directly access a ByteStream object connected to a valid + BSByteStream object. The ByteStream object can be accessed again after the + destruction of the BSByteStream object. Note that the encoder always + flushes its internal buffers and writes a few final code bytes when the + BSByteStream object is destroyed. Note also that the decoder often reads + a few bytes beyond the last code byte written by the encoder. This lag + means that you must reposition the ByteStream after the destruction of the + BSByteStream object and before re-using the ByteStream object (see + \Ref{IFFByteStream}.) +*/ +class BSByteStream : public ByteStream +{ +public: +// Limits on block sizes + enum { MINBLOCK=10, MAXBLOCK=4096 }; + +// Sorting tresholds + enum { FREQMAX=4, CTXIDS=3 }; + + class Decode; + class Encode; +protected: + BSByteStream(GP<ByteStream> bs); + +public: + /** Creates a BSByteStream. + The BSByteStream will be used for decompressing data. + \begin{description} + \item[Decompression] + The BSByteStream is created and the decompressor initializes. Chunks of + data will be read from ByteStream #bs# and decompressed into an internal + buffer. Function #read# can be used to access the decompressed data. + \end{description} */ + static GP<ByteStream> create(GP<ByteStream> bs); + + /** Constructs a BSByteStream. + The BSByteStream will be used for compressing data. + \begin{description} + \item[Compression] + Set #blocksize# to a positive number smaller than 4096 to + initialize the compressor. Data written to the BSByteStream will be + accumulated into an internal buffer. The buffered data will be + compressed and written to ByteStream #bs# whenever the buffer sizes + reaches the maximum value specified by argument #blocksize# (in + kilobytes). Using a larger block size usually increases the compression + ratio at the expense of computation time. There is no need however to + specify a block size larger than the total number of bytes to compress. + Setting #blocksize# to #1024# is a good starting point. A minimal block + size of 10 is silently enforced. + \end{description} */ + static GP<ByteStream> create(GP<ByteStream> bs, const int blocksize); + + // ByteStream Interface + ~BSByteStream(); + virtual long tell(void) const; + virtual void flush(void) = 0; +protected: + // Data + long offset; + int bptr; + unsigned int blocksize; + int size; + ByteStream *bs; + GP<ByteStream> gbs; + unsigned char *data; + GPBuffer<unsigned char> gdata; + // Coder + GP<ZPCodec> gzp; + BitContext ctx[300]; +private: + // Cancel C++ default stuff + BSByteStream(const BSByteStream &); + BSByteStream & operator=(const BSByteStream &); + BSByteStream(ByteStream *); + BSByteStream(ByteStream *, int); +}; + +//@} + + +#ifdef HAVE_NAMESPACES +} +# ifndef NOT_USING_DJVU_NAMESPACE +using namespace DJVU; +# endif +#endif +#endif |