--- a/manual.texi 2011-12-04 13:55:53.589856334 +1100 +++ b/manual.texi 2011-12-04 18:16:28.000000000 +1100 @@ -0,0 +1,2880 @@ +\input texinfo +@setfilename untitled.info +@documentencoding us-ascii +@dircategory Development +@direntry +* Bzip2: (bzip2). A program and library for data compression. +@end direntry + +@node Top, Introduction, , (dir) +@top bzip2 and libbzip2, version 1.0.3 +@documentlanguage en + +@menu +* Introduction:: +* How to use bzip2:: +* Programming with libbzip2:: +* Miscellanea:: + +@detailmenu +--- The Detailed Node Listing --- + +How to use bzip2 + +* NAME:: +* SYNOPSIS:: +* DESCRIPTION:: +* OPTIONS:: +* MEMORY MANAGEMENT:: +* RECOVERING DATA FROM DAMAGED FILES:: +* PERFORMANCE NOTES:: +* CAVEATS:: +* AUTHOR:: + + Programming with libbzip2 + +* Top-level structure:: +* Error handling:: +* Low-level interface: >Low-level interface. +* High-level interface:: +* Utility functions:: +* zlib compatibility functions:: +* Using the library in a stdio-free environment:: +* Making a Windows DLL:: + +Miscellanea + +* Limitations of the compressed file format:: +* Portability issues:: +* Reporting bugs:: +* Did you get the right package?:: +* Further Reading:: + +@end detailmenu +@end menu + +@node Introduction, How to use bzip2, Top, Top +@chapter Introduction + +@samp{bzip2} compresses files +using the Burrows-Wheeler block-sorting text compression +algorithm, and Huffman coding. Compression is generally +considerably better than that achieved by more conventional +LZ77/LZ78-based compressors, and approaches the performance of +the PPM family of statistical compressors. + +@samp{bzip2} is built on top of +@samp{libbzip2}, a flexible library for +handling compressed data in the +@samp{bzip2} format. This manual +describes both how to use the program and how to work with the +library interface. Most of the manual is devoted to this +library, not the program, which is good news if your interest is +only in the program. 
+ +@itemize @bullet{} + +@item +@ref{How to use bzip2,,How to use bzip2}. describes how to use +@samp{bzip2}; this is the only part +you need to read if you just want to know how to operate the +program. + +@item +@ref{Programming with libbzip2,,Programming with libbzip2}. describes the +programming interfaces in detail, and + +@item +@ref{Miscellanea,,Miscellanea}. records some +miscellaneous notes which I thought ought to be recorded +somewhere. +@end itemize + +@node How to use bzip2, Programming with libbzip2, Introduction, Top +@chapter How to use bzip2 + +This chapter contains a copy of the +@samp{bzip2} man page, and nothing +else. + +@menu +* NAME:: +* SYNOPSIS:: +* DESCRIPTION:: +* OPTIONS:: +* MEMORY MANAGEMENT:: +* RECOVERING DATA FROM DAMAGED FILES:: +* PERFORMANCE NOTES:: +* CAVEATS:: +* AUTHOR:: +@end menu + +@node NAME, SYNOPSIS, , How to use bzip2 +@section NAME + +@itemize @bullet{} + +@item +@samp{bzip2}, +@samp{bunzip2} - a block-sorting file +compressor, v1.0.3 + +@item +@samp{bzcat} - +decompresses files to stdout + +@item +@samp{bzip2recover} - +recovers data from damaged bzip2 files +@end itemize + +@node SYNOPSIS, DESCRIPTION, NAME, How to use bzip2 +@section SYNOPSIS + +@itemize @bullet{} + +@item +@samp{bzip2} [ +-cdfkqstvzVL123456789 ] [ filenames ... ] + +@item +@samp{bunzip2} [ +-fkvsVL ] [ filenames ... ] + +@item +@samp{bzcat} [ -s ] [ +filenames ... ] + +@item +@samp{bzip2recover} +filename +@end itemize + +@node DESCRIPTION, OPTIONS, SYNOPSIS, How to use bzip2 +@section DESCRIPTION + +@samp{bzip2} compresses files +using the Burrows-Wheeler block sorting text compression +algorithm, and Huffman coding. Compression is generally +considerably better than that achieved by more conventional +LZ77/LZ78-based compressors, and approaches the performance of +the PPM family of statistical compressors. + +The command-line options are deliberately very similar to +those of GNU @samp{gzip}, but they are +not identical. 
+ +@samp{bzip2} expects a list of +file names to accompany the command-line flags. Each file is +replaced by a compressed version of itself, with the name +@samp{original_name.bz2}. Each +compressed file has the same modification date, permissions, and, +when possible, ownership as the corresponding original, so that +these properties can be correctly restored at decompression time. +File name handling is naive in the sense that there is no +mechanism for preserving original file names, permissions, +ownerships or dates in filesystems which lack these concepts, or +have serious file name length restrictions, such as +MS-DOS. + +@samp{bzip2} and +@samp{bunzip2} will by default not +overwrite existing files. If you want this to happen, specify +the @samp{-f} flag. + +If no file names are specified, +@samp{bzip2} compresses from standard +input to standard output. In this case, +@samp{bzip2} will decline to write +compressed output to a terminal, as this would be entirely +incomprehensible and therefore pointless. + +@samp{bunzip2} (or +@samp{bzip2 -d}) decompresses all +specified files. Files which were not created by +@samp{bzip2} will be detected and +ignored, and a warning issued. +@samp{bzip2} attempts to guess the +filename for the decompressed file from that of the compressed +file as follows: + +@itemize @bullet{} + +@item +@samp{filename.bz2 } +becomes +@samp{filename} + +@item +@samp{filename.bz } +becomes +@samp{filename} + +@item +@samp{filename.tbz2} +becomes +@samp{filename.tar} + +@item +@samp{filename.tbz } +becomes +@samp{filename.tar} + +@item +@samp{anyothername } +becomes +@samp{anyothername.out} +@end itemize + +If the file does not end in one of the recognised endings, +@samp{.bz2}, +@samp{.bz}, +@samp{.tbz2} or +@samp{.tbz}, +@samp{bzip2} complains that it cannot +guess the name of the original file, and uses the original name +with @samp{.out} appended. 
+ +As with compression, supplying no filenames causes +decompression from standard input to standard output. + +@samp{bunzip2} will correctly +decompress a file which is the concatenation of two or more +compressed files. The result is the concatenation of the +corresponding uncompressed files. Integrity testing +(@samp{-t}) of concatenated compressed +files is also supported. + +You can also compress or decompress files to the standard +output by giving the @samp{-c} flag. +Multiple files may be compressed and decompressed like this. The +resulting outputs are fed sequentially to stdout. Compression of +multiple files in this manner generates a stream containing +multiple compressed file representations. Such a stream can be +decompressed correctly only by +@samp{bzip2} version 0.9.0 or later. +Earlier versions of @samp{bzip2} will +stop after decompressing the first file in the stream. + +@samp{bzcat} (or +@samp{bzip2 -dc}) decompresses all +specified files to the standard output. + +@samp{bzip2} will read arguments +from the environment variables +@samp{BZIP2} and +@samp{BZIP}, in that order, and will +process them before any arguments read from the command line. +This gives a convenient way to supply default arguments. + +Compression is always performed, even if the compressed +file is slightly larger than the original. Files of less than +about one hundred bytes tend to get larger, since the compression +mechanism has a constant overhead in the region of 50 bytes. +Random data (including the output of most file compressors) is +coded at about 8.05 bits per byte, giving an expansion of around +0.5%. + +As a self-check for your protection, +@samp{bzip2} uses 32-bit CRCs to make +sure that the decompressed version of a file is identical to the +original. This guards against corruption of the compressed data, +and against undetected bugs in +@samp{bzip2} (hopefully very unlikely). 
+The chance of data corruption going undetected is microscopic, +about one chance in four billion for each file processed. Be +aware, though, that the check occurs upon decompression, so it +can only tell you that something is wrong. It can't help you +recover the original uncompressed data. You can use +@samp{bzip2recover} to try to recover +data from damaged files. + +Return values: 0 for a normal exit, 1 for environmental +problems (file not found, invalid flags, I/O errors, etc.), 2 +to indicate a corrupt compressed file, 3 for an internal +consistency error (eg, bug) which caused +@samp{bzip2} to panic. + +@node OPTIONS, MEMORY MANAGEMENT, DESCRIPTION, How to use bzip2 +@section OPTIONS + +@table @asis + +@item @samp{-c --stdout} +Compress or decompress to standard +output. + +@item @samp{-d --decompress} +Force decompression. +@samp{bzip2}, +@samp{bunzip2} and +@samp{bzcat} are really the same +program, and the decision about what actions to take is done on +the basis of which name is used. This flag overrides that +mechanism, and forces bzip2 to decompress. + +@item @samp{-z --compress} +The complement to +@samp{-d}: forces compression, +regardless of the invocation name. + +@item @samp{-t --test} +Check integrity of the specified file(s), but +don't decompress them. This really performs a trial +decompression and throws away the result. + +@item @samp{-f --force} +Force overwrite of output files. Normally, +@samp{bzip2} will not overwrite +existing output files. Also forces +@samp{bzip2} to break hard links to +files, which it otherwise wouldn't do. + +@samp{bzip2} normally declines +to decompress files which don't have the correct magic header +bytes. If forced (@samp{-f}), +however, it will pass such files through unmodified. This is +how GNU @samp{gzip} behaves. + +@item @samp{-k --keep} +Keep (don't delete) input files during +compression or decompression. + +@item @samp{-s --small} +Reduce memory usage, for compression, +decompression and testing.
Files are decompressed and tested +using a modified algorithm which only requires 2.5 bytes per +block byte. This means any file can be decompressed in 2300k +of memory, albeit at about half the normal speed. + +During compression, @samp{-s} +selects a block size of 200k, which limits memory use to around +the same figure, at the expense of your compression ratio. In +short, if your machine is low on memory (8 megabytes or less), +use @samp{-s} for everything. See +@ref{MEMORY MANAGEMENT,,MEMORY MANAGEMENT}. below. + +@item @samp{-q --quiet} +Suppress non-essential warning messages. +Messages pertaining to I/O errors and other critical events +will not be suppressed. + +@item @samp{-v --verbose} +Verbose mode -- show the compression ratio for +each file processed. Further +@samp{-v}'s increase the verbosity +level, spewing out lots of information which is primarily of +interest for diagnostic purposes. + +@item @samp{-L --license -V --version} +Display the software version, license terms and +conditions. + +@item @samp{-1} (or @samp{--fast}) to @samp{-9} (or @samp{--best}) +Set the block size to 100 k, 200 k ... 900 k +when compressing. Has no effect when decompressing. See @ref{MEMORY MANAGEMENT,,MEMORY MANAGEMENT}. below. The +@samp{--fast} and +@samp{--best} aliases are primarily +for GNU @samp{gzip} compatibility. +In particular, @samp{--fast} doesn't +make things significantly faster. And +@samp{--best} merely selects the +default behaviour. + +@item @samp{--} +Treats all subsequent arguments as file names, +even if they start with a dash. This is so you can handle +files with names beginning with a dash, for example: +@samp{bzip2 -- +-myfilename}. + +@item @samp{--repetitive-fast} +@itemx @samp{--repetitive-best} +These flags are redundant in versions 0.9.5 and +above. They provided some coarse control over the behaviour of +the sorting algorithm in earlier versions, which was sometimes +useful.
0.9.5 and above have an improved algorithm which +renders these flags irrelevant. +@end table + +@node MEMORY MANAGEMENT, RECOVERING DATA FROM DAMAGED FILES, OPTIONS, How to use bzip2 +@section MEMORY MANAGEMENT + +@samp{bzip2} compresses large +files in blocks. The block size affects both the compression +ratio achieved, and the amount of memory needed for compression +and decompression. The flags @samp{-1} +through @samp{-9} specify the block +size to be 100,000 bytes through 900,000 bytes (the default) +respectively. At decompression time, the block size used for +compression is read from the header of the compressed file, and +@samp{bunzip2} then allocates itself +just enough memory to decompress the file. Since block sizes are +stored in compressed files, it follows that the flags +@samp{-1} to +@samp{-9} are irrelevant to and so +ignored during decompression. + +Compression and decompression requirements, in bytes, can be +estimated as: + +@example + +Compression: 400k + ( 8 x block size ) + +Decompression: 100k + ( 4 x block size ), or + 100k + ( 2.5 x block size ) +@end example + +Larger block sizes give rapidly diminishing marginal +returns. Most of the compression comes from the first two or +three hundred k of block size, a fact worth bearing in mind when +using @samp{bzip2} on small machines. +It is also important to appreciate that the decompression memory +requirement is set at compression time by the choice of block +size. + +For files compressed with the default 900k block size, +@samp{bunzip2} will require about 3700 +kbytes to decompress. To support decompression of any file on a +4 megabyte machine, @samp{bunzip2} has +an option to decompress using approximately half this amount of +memory, about 2300 kbytes. Decompression speed is also halved, +so you should use this option only where necessary. The relevant +flag is @samp{-s}. 
+ +In general, try and use the largest block size memory +constraints allow, since that maximises the compression achieved. +Compression and decompression speed are virtually unaffected by +block size. + +Another significant point applies to files which fit in a +single block -- that means most files you'd encounter using a +large block size. The amount of real memory touched is +proportional to the size of the file, since the file is smaller +than a block. For example, compressing a file 20,000 bytes long +with the flag @samp{-9} will cause the +compressor to allocate around 7600k of memory, but only touch +400k + 20000 * 8 = 560 kbytes of it. Similarly, the decompressor +will allocate 3700k but only touch 100k + 20000 * 4 = 180 +kbytes. + +Here is a table which summarises the maximum memory usage +for different block sizes. Also recorded is the total compressed +size for 14 files of the Calgary Text Compression Corpus +totalling 3,141,622 bytes. This column gives some feel for how +compression varies with block size. These figures tend to +understate the advantage of larger block sizes for larger files, +since the Corpus is dominated by smaller files. + +@example + + Compress Decompress Decompress Corpus +Flag usage usage -s usage Size + + -1 1200k 500k 350k 914704 + -2 2000k 900k 600k 877703 + -3 2800k 1300k 850k 860338 + -4 3600k 1700k 1100k 846899 + -5 4400k 2100k 1350k 845160 + -6 5200k 2500k 1600k 838626 + -7 6100k 2900k 1850k 834096 + -8 6800k 3300k 2100k 828642 + -9 7600k 3700k 2350k 828642 +@end example + +@node RECOVERING DATA FROM DAMAGED FILES, PERFORMANCE NOTES, MEMORY MANAGEMENT, How to use bzip2 +@section RECOVERING DATA FROM DAMAGED FILES + +@samp{bzip2} compresses files in +blocks, usually 900kbytes long. Each block is handled +independently. If a media or transmission error causes a +multi-block @samp{.bz2} file to become +damaged, it may be possible to recover data from the undamaged +blocks in the file. 
+ +The compressed representation of each block is delimited by +a 48-bit pattern, which makes it possible to find the block +boundaries with reasonable certainty. Each block also carries +its own 32-bit CRC, so damaged blocks can be distinguished from +undamaged ones. + +@samp{bzip2recover} is a simple +program whose purpose is to search for blocks in +@samp{.bz2} files, and write each block +out into its own @samp{.bz2} file. You +can then use @samp{bzip2 -t} to test +the integrity of the resulting files, and decompress those which +are undamaged. + +@samp{bzip2recover} takes a +single argument, the name of the damaged file, and writes a +number of files @samp{rec0001file.bz2}, +@samp{rec0002file.bz2}, etc, containing +the extracted blocks. The output filenames are designed so that +the use of wildcards in subsequent processing -- for example, +@samp{bzip2 -dc rec*file.bz2 > +recovered_data} -- lists the files in the correct +order. + +@samp{bzip2recover} should be of +most use dealing with large @samp{.bz2} +files, as these will contain many blocks. It is clearly futile +to use it on damaged single-block files, since a damaged block +cannot be recovered. If you wish to minimise any potential data +loss through media or transmission errors, you might consider +compressing with a smaller block size. + +@node PERFORMANCE NOTES, CAVEATS, RECOVERING DATA FROM DAMAGED FILES, How to use bzip2 +@section PERFORMANCE NOTES + +The sorting phase of compression gathers together similar +strings in the file. Because of this, files containing very long +runs of repeated symbols, like "aabaabaabaab ..." (repeated +several hundred times) may compress more slowly than normal. +Versions 0.9.5 and above fare much better than previous versions +in this respect. The ratio between worst-case and average-case +compression time is in the region of 10:1. For previous +versions, this figure was more like 100:1. 
You can use the +@samp{-vvvv} option to monitor progress +in great detail, if you want. + +Decompression speed is unaffected by these +phenomena. + +@samp{bzip2} usually allocates +several megabytes of memory to operate in, and then charges all +over it in a fairly random fashion. This means that performance, +both for compressing and decompressing, is largely determined by +the speed at which your machine can service cache misses. +Because of this, small changes to the code to reduce the miss +rate have been observed to give disproportionately large +performance improvements. I imagine +@samp{bzip2} will perform best on +machines with very large caches. + +@node CAVEATS, AUTHOR, PERFORMANCE NOTES, How to use bzip2 +@section CAVEATS + +I/O error messages are not as helpful as they could be. +@samp{bzip2} tries hard to detect I/O +errors and exit cleanly, but the details of what the problem is +sometimes seem rather misleading. + +This manual page pertains to version 1.0.3 of +@samp{bzip2}. Compressed data created +by this version is entirely forwards and backwards compatible +with the previous public releases, versions 0.1pl2, 0.9.0 and +0.9.5, 1.0.0, 1.0.1 and 1.0.2, but with the following exception: 0.9.0 +and above can correctly decompress multiple concatenated +compressed files. 0.1pl2 cannot do this; it will stop after +decompressing just the first file in the stream. + +@samp{bzip2recover} versions +prior to 1.0.2 used 32-bit integers to represent bit positions in +compressed files, so it could not handle compressed files more +than 512 megabytes long. Versions 1.0.2 and above use 64-bit ints +on some platforms which support them (GNU supported targets, and +Windows). To establish whether or not +@samp{bzip2recover} was built with such +a limitation, run it without arguments. In any event you can +build yourself an unlimited version if you can recompile it with +@samp{MaybeUInt64} set to be an +unsigned 64-bit integer. 
+ +@node AUTHOR, , CAVEATS, How to use bzip2 +@section AUTHOR + +Julian Seward, +@samp{jseward@@bzip.org} + +The ideas embodied in +@samp{bzip2} are due to (at least) the +following people: Michael Burrows and David Wheeler (for the +block sorting transformation), David Wheeler (again, for the +Huffman coder), Peter Fenwick (for the structured coding model in +the original @samp{bzip}, and many +refinements), and Alistair Moffat, Radford Neal and Ian Witten +(for the arithmetic coder in the original +@samp{bzip}). I am much indebted for +their help, support and advice. See the manual in the source +distribution for pointers to sources of documentation. Christian +von Roques encouraged me to look for faster sorting algorithms, +so as to speed up compression. Bela Lubkin encouraged me to +improve the worst-case compression performance. +Donna Robinson XMLised the documentation. +Many people sent +patches, helped with portability problems, lent machines, gave +advice and were generally helpful. + +@node Programming with libbzip2, Miscellanea, How to use bzip2, Top +@chapter Programming with libbzip2 + +This chapter describes the programming interface to +@samp{libbzip2}. + +For general background information, particularly about +memory use and performance aspects, you'd be well advised to read +@ref{How to use bzip2,,How to use bzip2}. as well. + +@menu +* Top-level structure:: +* Error handling:: +* Low-level interface: >Low-level interface. +* High-level interface:: +* Utility functions:: +* zlib compatibility functions:: +* Using the library in a stdio-free environment:: +* Making a Windows DLL:: +@end menu + +@node Top-level structure, Error handling, , Programming with libbzip2 +@section Top-level structure + +@samp{libbzip2} is a flexible +library for compressing and decompressing data in the +@samp{bzip2} data format. 
Although +packaged as a single entity, it helps to regard the library as +three separate parts: the low level interface, and the high level +interface, and some utility functions. + +The structure of +@samp{libbzip2}'s interfaces is similar +to that of Jean-loup Gailly's and Mark Adler's excellent +@samp{zlib} library. + +All externally visible symbols have names beginning +@samp{BZ2_}. This is new in version +1.0. The intention is to minimise pollution of the namespaces of +library clients. + +To use any part of the library, you need to +@samp{#include <bzlib.h>} +into your sources. + +@menu +* Low-level summary:: +* High-level summary:: +* Utility functions summary:: +@end menu + +@node Low-level summary, High-level summary, , Top-level structure +@subsection Low-level summary + +This interface provides services for compressing and +decompressing data in memory. There's no provision for dealing +with files, streams or any other I/O mechanisms, just straight +memory-to-memory work. In fact, this part of the library can be +compiled without inclusion of +@samp{stdio.h}, which may be helpful +for embedded applications. + +The low-level part of the library has no global variables +and is therefore thread-safe. + +Six routines make up the low level interface: +@samp{BZ2_bzCompressInit}, +@samp{BZ2_bzCompress}, and +@samp{BZ2_bzCompressEnd} for +compression, and a corresponding trio +@samp{BZ2_bzDecompressInit}, +@samp{BZ2_bzDecompress} and +@samp{BZ2_bzDecompressEnd} for +decompression. The @samp{*Init} +functions allocate memory for compression/decompression and do +other initialisations, whilst the +@samp{*End} functions close down +operations and release memory. + +The real work is done by +@samp{BZ2_bzCompress} and +@samp{BZ2_bzDecompress}. These +compress and decompress data from a user-supplied input buffer to +a user-supplied output buffer. These buffers can be any size; +arbitrary quantities of data are handled by making repeated calls +to these functions. 
This is a flexible mechanism allowing a +consumer-pull style of activity, or producer-push, or a mixture +of both. + +@node High-level summary, Utility functions summary, Low-level summary, Top-level structure +@subsection High-level summary + +This interface provides some handy wrappers around the +low-level interface to facilitate reading and writing +@samp{bzip2} format files +(@samp{.bz2} files). The routines +provide hooks to facilitate reading files in which the +@samp{bzip2} data stream is embedded +within some larger-scale file structure, or where there are +multiple @samp{bzip2} data streams +concatenated end-to-end. + +For reading files, +@samp{BZ2_bzReadOpen}, +@samp{BZ2_bzRead}, +@samp{BZ2_bzReadClose} and +@samp{BZ2_bzReadGetUnused} are +supplied. For writing files, +@samp{BZ2_bzWriteOpen}, +@samp{BZ2_bzWrite} and +@samp{BZ2_bzWriteFinish} are +available. + +As with the low-level library, no global variables are used +so the library is per se thread-safe. However, if I/O errors +occur whilst reading or writing the underlying compressed files, +you may have to consult @samp{errno} to +determine the cause of the error. In that case, you'd need a C +library which correctly supports +@samp{errno} in a multithreaded +environment. + +To make the library a little simpler and more portable, +@samp{BZ2_bzReadOpen} and +@samp{BZ2_bzWriteOpen} require you to +pass them file handles (@samp{FILE*}s) +which have previously been opened for reading or writing +respectively. That avoids portability problems associated with +file operations and file attributes, whilst not being much of an +imposition on the programmer. + +@node Utility functions summary, , High-level summary, Top-level structure +@subsection Utility functions summary + +For very simple needs, +@samp{BZ2_bzBuffToBuffCompress} and +@samp{BZ2_bzBuffToBuffDecompress} are +provided. These compress data in memory from one buffer to +another buffer in a single function call. 
You should assess +whether these functions fulfill your memory-to-memory +compression/decompression requirements before investing effort in +understanding the more general but more complex low-level +interface. + +Yoshioka Tsuneo +(@samp{QWF00133@@niftyserve.or.jp} / +@samp{tsuneo-y@@is.aist-nara.ac.jp}) has +contributed some functions to give better +@samp{zlib} compatibility. These +functions are @samp{BZ2_bzopen}, +@samp{BZ2_bzread}, +@samp{BZ2_bzwrite}, +@samp{BZ2_bzflush}, +@samp{BZ2_bzclose}, +@samp{BZ2_bzerror} and +@samp{BZ2_bzlibVersion}. You may find +these functions more convenient for simple file reading and +writing, than those in the high-level interface. These functions +are not (yet) officially part of the library, and are minimally +documented here. If they break, you get to keep all the pieces. +I hope to document them properly when time permits. + +Yoshioka also contributed modifications to allow the +library to be built as a Windows DLL. + +@node Error handling, >Low-level interface, Top-level structure, Programming with libbzip2 +@section Error handling + +The library is designed to recover cleanly in all +situations, including the worst-case situation of decompressing +random data. I'm not 100% sure that it can always do this, so +you might want to add a signal handler to catch segmentation +violations during decompression if you are feeling especially +paranoid. I would be interested in hearing more about the +robustness of the library to corrupted compressed data. + +Version 1.0.3 is more robust in this respect than any +previous version. Investigations with Valgrind (a tool for detecting +problems with memory management) indicate +that, at least for the few files I tested, all single-bit errors +in the decompressed data are caught properly, with no +segmentation faults, no uses of uninitialised data, no out of +range reads or writes, and no infinite looping in the decompressor.
+So it's certainly pretty robust, although +I wouldn't claim it to be totally bombproof. + +The file @samp{bzlib.h} contains +all definitions needed to use the library. In particular, you +should definitely not include +@samp{bzlib_private.h}. + +In @samp{bzlib.h}, the various +return values are defined. The following list is not intended as +an exhaustive description of the circumstances in which a given +value may be returned -- those descriptions are given later. +Rather, it is intended to convey the rough meaning of each return +value. The first five actions are normal and not intended to +denote an error situation. + +@table @asis + +@item @samp{BZ_OK} +The requested action was completed +successfully. + +@item @samp{BZ_RUN_OK, BZ_FLUSH_OK, BZ_FINISH_OK} +In +@samp{BZ2_bzCompress}, the requested +flush/finish/nothing-special action was completed +successfully. + +@item @samp{BZ_STREAM_END} +Compression of data was completed, or the +logical stream end was detected during +decompression. +@end table + +The following return values indicate an error of some +kind. + +@table @asis + +@item @samp{BZ_CONFIG_ERROR} +Indicates that the library has been improperly +compiled on your platform -- a major configuration error. +Specifically, it means that +@samp{sizeof(char)}, +@samp{sizeof(short)} and +@samp{sizeof(int)} are not 1, 2 and +4 respectively, as they should be. Note that the library +should still work properly on 64-bit platforms which follow +the LP64 programming model -- that is, where +@samp{sizeof(long)} and +@samp{sizeof(void*)} are 8. Under +LP64, @samp{sizeof(int)} is still 4, +so @samp{libbzip2}, which doesn't +use the @samp{long} type, is +OK. + +@item @samp{BZ_SEQUENCE_ERROR} +When using the library, it is important to call +the functions in the correct sequence and with data structures +(buffers etc) in the correct states. +@samp{libbzip2} checks as much as it +can to ensure this is happening, and returns +@samp{BZ_SEQUENCE_ERROR} if not. 
+Code which complies precisely with the function semantics, as +detailed below, should never receive this value; such an event +denotes buggy code which you should +investigate. + +@item @samp{BZ_PARAM_ERROR} +Returned when a parameter to a function call is +out of range or otherwise manifestly incorrect. As with +@samp{BZ_SEQUENCE_ERROR}, this +denotes a bug in the client code. The distinction between +@samp{BZ_PARAM_ERROR} and +@samp{BZ_SEQUENCE_ERROR} is a bit +hazy, but still worth making. + +@item @samp{BZ_MEM_ERROR} +Returned when a request to allocate memory +failed. Note that the quantity of memory needed to decompress +a stream cannot be determined until the stream's header has +been read. So +@samp{BZ2_bzDecompress} and +@samp{BZ2_bzRead} may return +@samp{BZ_MEM_ERROR} even though some +of the compressed data has been read. The same is not true +for compression; once +@samp{BZ2_bzCompressInit} or +@samp{BZ2_bzWriteOpen} have +successfully completed, +@samp{BZ_MEM_ERROR} cannot +occur. + +@item @samp{BZ_DATA_ERROR} +Returned when a data integrity error is +detected during decompression. Most importantly, this means +when stored and computed CRCs for the data do not match. This +value is also returned upon detection of any other anomaly in +the compressed data. + +@item @samp{BZ_DATA_ERROR_MAGIC} +As a special case of +@samp{BZ_DATA_ERROR}, it is +sometimes useful to know when the compressed stream does not +start with the correct magic bytes (@samp{'B' 'Z' +'h'}). + +@item @samp{BZ_IO_ERROR} +Returned by +@samp{BZ2_bzRead} and +@samp{BZ2_bzWrite} when there is an +error reading or writing in the compressed file, and by +@samp{BZ2_bzReadOpen} and +@samp{BZ2_bzWriteOpen} for attempts +to use a file for which the error indicator (viz, +@samp{ferror(f)}) is set. On +receipt of @samp{BZ_IO_ERROR}, the +caller should consult @samp{errno} +and/or @samp{perror} to acquire +operating-system specific information about the +problem. 
+ +@item @samp{BZ_UNEXPECTED_EOF} +Returned by +@samp{BZ2_bzRead} when the +compressed file finishes before the logical end of stream is +detected. + +@item @samp{BZ_OUTBUFF_FULL} +Returned by +@samp{BZ2_bzBuffToBuffCompress} and +@samp{BZ2_bzBuffToBuffDecompress} to +indicate that the output data will not fit into the output +buffer provided. +@end table + +@node >Low-level interface, High-level interface, Error handling, Programming with libbzip2 +@section Low-level interface + +@menu +* BZ2_bzCompressInit:: +* BZ2_bzCompress:: +* BZ2_bzCompressEnd:: +* BZ2_bzDecompressInit:: +* BZ2_bzDecompress:: +* BZ2_bzDecompressEnd:: +@end menu + +@node BZ2_bzCompressInit, BZ2_bzCompress, , >Low-level interface +@subsection BZ2_bzCompressInit + +@example + +typedef struct @{ + char *next_in; + unsigned int avail_in; + unsigned int total_in_lo32; + unsigned int total_in_hi32; + + char *next_out; + unsigned int avail_out; + unsigned int total_out_lo32; + unsigned int total_out_hi32; + + void *state; + + void *(*bzalloc)(void *,int,int); + void (*bzfree)(void *,void *); + void *opaque; +@} bz_stream; + +int BZ2_bzCompressInit ( bz_stream *strm, + int blockSize100k, + int verbosity, + int workFactor ); +@end example + +Prepares for compression. The +@samp{bz_stream} structure holds all +data pertaining to the compression activity. A +@samp{bz_stream} structure should be +allocated and initialised prior to the call. The fields of +@samp{bz_stream} comprise the entirety +of the user-visible data. @samp{state} +is a pointer to the private data structures required for +compression. + +Custom memory allocators are supported, via fields +@samp{bzalloc}, +@samp{bzfree}, and +@samp{opaque}. The value +@samp{opaque} is passed as the first +argument to all calls to @samp{bzalloc} +and @samp{bzfree}, but is otherwise +ignored by the library.
The call @samp{bzalloc ( +opaque, n, m )} is expected to return a pointer +@samp{p} to @samp{n * +m} bytes of memory, and @samp{bzfree ( +opaque, p )} should free that memory. + +If you don't want to use a custom memory allocator, set +@samp{bzalloc}, +@samp{bzfree} and +@samp{opaque} to +@samp{NULL}, and the library will then +use the standard @samp{malloc} / +@samp{free} routines. + +Before calling +@samp{BZ2_bzCompressInit}, fields +@samp{bzalloc}, +@samp{bzfree} and +@samp{opaque} should be filled +appropriately, as just described. Upon return, the internal +state will have been allocated and initialised, and +@samp{total_in_lo32}, +@samp{total_in_hi32}, +@samp{total_out_lo32} and +@samp{total_out_hi32} will have been +set to zero. These four fields are used by the library to inform +the caller of the total amount of data passed into and out of the +library, respectively. You should not try to change them. As of +version 1.0, 64-bit counts are maintained, even on 32-bit +platforms, using the @samp{_hi32} +fields to store the upper 32 bits of the count. So, for example, +the total amount of data in is @samp{(total_in_hi32 +<< 32) + total_in_lo32}. + +Parameter @samp{blockSize100k} +specifies the block size to be used for compression. It should +be a value between 1 and 9 inclusive, and the actual block size +used is 100000 x this figure. 9 gives the best compression but +takes most memory. + +Parameter @samp{verbosity} should +be set to a number between 0 and 4 inclusive. 0 is silent, and +greater numbers give increasingly verbose monitoring/debugging +output. If the library has been compiled with +@samp{-DBZ_NO_STDIO}, no such output +will appear for any verbosity setting. + +Parameter @samp{workFactor} +controls how the compression phase behaves when presented with +worst case, highly repetitive, input data. If compression runs +into difficulties caused by repetitive data, the library switches +from the standard sorting algorithm to a fallback algorithm. 
The +fallback is slower than the standard algorithm by perhaps a +factor of three, but always behaves reasonably, no matter how bad +the input. + +Lower values of @samp{workFactor} +reduce the amount of effort the standard algorithm will expend +before resorting to the fallback. You should set this parameter +carefully; too low, and many inputs will be handled by the +fallback algorithm and so compress rather slowly, too high, and +your average-to-worst case compression times can become very +large. The default value of 30 gives reasonable behaviour over a +wide range of circumstances. + +Allowable values range from 0 to 250 inclusive. 0 is a +special case, equivalent to using the default value of 30. + +Note that the compressed output generated is the same +regardless of whether or not the fallback algorithm is +used. + +Be aware also that this parameter may disappear entirely in +future versions of the library. In principle it should be +possible to devise a good way to automatically choose which +algorithm to use. Such a mechanism would render the parameter +obsolete. + +Possible return values: + +@example + +BZ_CONFIG_ERROR + if the library has been mis-compiled +BZ_PARAM_ERROR + if strm is NULL + or blockSize < 1 or blockSize > 9 + or verbosity < 0 or verbosity > 4 + or workFactor < 0 or workFactor > 250 +BZ_MEM_ERROR + if not enough memory is available +BZ_OK + otherwise +@end example + +Allowable next actions: + +@example + +BZ2_bzCompress + if BZ_OK is returned + no specific action needed in case of error +@end example + +@node BZ2_bzCompress, BZ2_bzCompressEnd, BZ2_bzCompressInit, >Low-level interface +@subsection BZ2_bzCompress + +@example + +int BZ2_bzCompress ( bz_stream *strm, int action ); +@end example + +Provides more input and/or output buffer space for the +library. The caller maintains input and output buffers, and +calls @samp{BZ2_bzCompress} to transfer +data between them. 
+ +Before each call to +@samp{BZ2_bzCompress}, +@samp{next_in} should point at the data +to be compressed, and @samp{avail_in} +should indicate how many bytes the library may read. +@samp{BZ2_bzCompress} updates +@samp{next_in}, +@samp{avail_in} and +@samp{total_in} to reflect the number +of bytes it has read. + +Similarly, @samp{next_out} should +point to a buffer in which the compressed data is to be placed, +with @samp{avail_out} indicating how +much output space is available. +@samp{BZ2_bzCompress} updates +@samp{next_out}, +@samp{avail_out} and +@samp{total_out} to reflect the number +of bytes output. + +You may provide and remove as little or as much data as you +like on each call of +@samp{BZ2_bzCompress}. In the limit, +it is acceptable to supply and remove data one byte at a time, +although this would be terribly inefficient. You should always +ensure that at least one byte of output space is available at +each call. + +A second purpose of +@samp{BZ2_bzCompress} is to request a +change of mode of the compressed stream. + +Conceptually, a compressed stream can be in one of four +states: IDLE, RUNNING, FLUSHING and FINISHING. Before +initialisation +(@samp{BZ2_bzCompressInit}) and after +termination (@samp{BZ2_bzCompressEnd}), +a stream is regarded as IDLE. + +Upon initialisation +(@samp{BZ2_bzCompressInit}), the stream +is placed in the RUNNING state. Subsequent calls to +@samp{BZ2_bzCompress} should pass +@samp{BZ_RUN} as the requested action; +other actions are illegal and will result in +@samp{BZ_SEQUENCE_ERROR}. + +At some point, the calling program will have provided all +the input data it wants to. It will then want to finish up -- in +effect, asking the library to process any data it might have +buffered internally. In this state, +@samp{BZ2_bzCompress} will no longer +attempt to read data from +@samp{next_in}, but it will want to +write data to @samp{next_out}. 
Because +the output buffer supplied by the user can be arbitrarily small, +the finishing-up operation cannot necessarily be done with a +single call of +@samp{BZ2_bzCompress}. + +Instead, the calling program passes +@samp{BZ_FINISH} as an action to +@samp{BZ2_bzCompress}. This changes +the stream's state to FINISHING. Any remaining input (ie, +@samp{next_in[0 .. avail_in-1]}) is +compressed and transferred to the output buffer. To do this, +@samp{BZ2_bzCompress} must be called +repeatedly until all the output has been consumed. At that +point, @samp{BZ2_bzCompress} returns +@samp{BZ_STREAM_END}, and the stream's +state is set back to IDLE. +@samp{BZ2_bzCompressEnd} should then be +called. + +Just to make sure the calling program does not cheat, the +library makes a note of @samp{avail_in} +at the time of the first call to +@samp{BZ2_bzCompress} which has +@samp{BZ_FINISH} as an action (ie, at +the time the program has announced its intention to not supply +any more input). By comparing this value with that of +@samp{avail_in} over subsequent calls +to @samp{BZ2_bzCompress}, the library +can detect any attempts to slip in more data to compress. Any +calls for which this is detected will return +@samp{BZ_SEQUENCE_ERROR}. This +indicates a programming mistake which should be corrected. + +Instead of asking to finish, the calling program may ask +@samp{BZ2_bzCompress} to take all the +remaining input, compress it and terminate the current +(Burrows-Wheeler) compression block. This could be useful for +error control purposes. The mechanism is analogous to that for +finishing: call @samp{BZ2_bzCompress} +with an action of @samp{BZ_FLUSH}, +remove output data, and persist with the +@samp{BZ_FLUSH} action until the value +@samp{BZ_RUN_OK} is returned. As with +finishing, @samp{BZ2_bzCompress} +detects any attempt to provide more input data once the flush has +begun. + +Once the flush is complete, the stream returns to the +normal RUNNING state.
+ +This all sounds pretty complex, but isn't really. Here's a +table which shows which actions are allowable in each state, what +action will be taken, what the next state is, and what the +non-error return values are. Note that you can't explicitly ask +what state the stream is in, but nor do you need to -- it can be +inferred from the values returned by +@samp{BZ2_bzCompress}. + +@example + +IDLE/any + Illegal. IDLE state only exists after BZ2_bzCompressEnd or + before BZ2_bzCompressInit. + Return value = BZ_SEQUENCE_ERROR + +RUNNING/BZ_RUN + Compress from next_in to next_out as much as possible. + Next state = RUNNING + Return value = BZ_RUN_OK + +RUNNING/BZ_FLUSH + Remember current value of next_in. Compress from next_in + to next_out as much as possible, but do not accept any more input. + Next state = FLUSHING + Return value = BZ_FLUSH_OK + +RUNNING/BZ_FINISH + Remember current value of next_in. Compress from next_in + to next_out as much as possible, but do not accept any more input. + Next state = FINISHING + Return value = BZ_FINISH_OK + +FLUSHING/BZ_FLUSH + Compress from next_in to next_out as much as possible, + but do not accept any more input. + If all the existing input has been used up and all compressed + output has been removed + Next state = RUNNING; Return value = BZ_RUN_OK + else + Next state = FLUSHING; Return value = BZ_FLUSH_OK + +FLUSHING/other + Illegal. + Return value = BZ_SEQUENCE_ERROR + +FINISHING/BZ_FINISH + Compress from next_in to next_out as much as possible, + but do not accept any more input. + If all the existing input has been used up and all compressed + output has been removed + Next state = IDLE; Return value = BZ_STREAM_END + else + Next state = FINISHING; Return value = BZ_FINISH_OK + +FINISHING/other + Illegal. + Return value = BZ_SEQUENCE_ERROR +@end example + +That still looks complicated? Well, fair enough.
The +usual sequence of calls for compressing a load of data is: + +@enumerate + +@item +Get started with +@samp{BZ2_bzCompressInit}. + +@item +Shovel data in and shlurp out its compressed form +using zero or more calls of +@samp{BZ2_bzCompress} with action = +@samp{BZ_RUN}. + +@item +Finish up. Repeatedly call +@samp{BZ2_bzCompress} with action = +@samp{BZ_FINISH}, copying out the +compressed output, until +@samp{BZ_STREAM_END} is +returned. + +@item +Close up and go home. Call +@samp{BZ2_bzCompressEnd}. +@end enumerate + +If the data you want to compress fits into your input +buffer all at once, you can skip the calls of +@samp{BZ2_bzCompress ( ..., BZ_RUN )} +and just do the @samp{BZ2_bzCompress ( ..., BZ_FINISH +)} calls. + +All required memory is allocated by +@samp{BZ2_bzCompressInit}. The +compression library can accept any data at all (obviously). So +you shouldn't get any error return values from the +@samp{BZ2_bzCompress} calls. If you +do, they will be +@samp{BZ_SEQUENCE_ERROR}, and indicate +a bug in your programming. + +Trivial other possible return values: + +@example + +BZ_PARAM_ERROR + if strm is NULL, or strm->s is NULL +@end example + +@node BZ2_bzCompressEnd, BZ2_bzDecompressInit, BZ2_bzCompress, >Low-level interface +@subsection BZ2_bzCompressEnd + +@example + +int BZ2_bzCompressEnd ( bz_stream *strm ); +@end example + +Releases all memory associated with a compression +stream. + +Possible return values: + +@example + +BZ_PARAM_ERROR if strm is NULL or strm->s is NULL +BZ_OK otherwise +@end example + +@node BZ2_bzDecompressInit, BZ2_bzDecompress, BZ2_bzCompressEnd, >Low-level interface +@subsection BZ2_bzDecompressInit + +@example + +int BZ2_bzDecompressInit ( bz_stream *strm, int verbosity, int small ); +@end example + +Prepares for decompression. As with +@samp{BZ2_bzCompressInit}, a +@samp{bz_stream} record should be +allocated and initialised before the call. 
Fields +@samp{bzalloc}, +@samp{bzfree} and +@samp{opaque} should be set if a custom +memory allocator is required, or made +@samp{NULL} for the normal +@samp{malloc} / +@samp{free} routines. Upon return, the +internal state will have been initialised, and +@samp{total_in} and +@samp{total_out} will be zero. + +For the meaning of parameter +@samp{verbosity}, see +@samp{BZ2_bzCompressInit}. + +If @samp{small} is nonzero, the +library will use an alternative decompression algorithm which +uses less memory but at the cost of decompressing more slowly +(roughly speaking, half the speed, but the maximum memory +requirement drops to around 2300k). See @ref{How to use bzip2,,How to use bzip2}. +for more information on memory management. + +Note that the amount of memory needed to decompress a +stream cannot be determined until the stream's header has been +read, so even if +@samp{BZ2_bzDecompressInit} succeeds, a +subsequent @samp{BZ2_bzDecompress} +could fail with +@samp{BZ_MEM_ERROR}. + +Possible return values: + +@example + +BZ_CONFIG_ERROR + if the library has been mis-compiled +BZ_PARAM_ERROR + if ( small != 0 && small != 1 ) + or (verbosity < 0 || verbosity > 4) +BZ_MEM_ERROR + if insufficient memory is available +@end example + +Allowable next actions: + +@example + +BZ2_bzDecompress + if BZ_OK was returned + no specific action required in case of error +@end example + +@node BZ2_bzDecompress, BZ2_bzDecompressEnd, BZ2_bzDecompressInit, >Low-level interface +@subsection BZ2_bzDecompress + +@example + +int BZ2_bzDecompress ( bz_stream *strm ); +@end example + +Provides more input and/or output buffer space for the +library. The caller maintains input and output buffers, and uses +@samp{BZ2_bzDecompress} to transfer +data between them. + +Before each call to +@samp{BZ2_bzDecompress}, +@samp{next_in} should point at the +compressed data, and @samp{avail_in} +should indicate how many bytes the library may read.
+@samp{BZ2_bzDecompress} updates +@samp{next_in}, +@samp{avail_in} and +@samp{total_in} to reflect the number +of bytes it has read. + +Similarly, @samp{next_out} should +point to a buffer in which the uncompressed output is to be +placed, with @samp{avail_out} +indicating how much output space is available. +@samp{BZ2_bzDecompress} updates +@samp{next_out}, +@samp{avail_out} and +@samp{total_out} to reflect the number +of bytes output. + +You may provide and remove as little or as much data as you +like on each call of +@samp{BZ2_bzDecompress}. In the limit, +it is acceptable to supply and remove data one byte at a time, +although this would be terribly inefficient. You should always +ensure that at least one byte of output space is available at +each call. + +Use of @samp{BZ2_bzDecompress} is +simpler than +@samp{BZ2_bzCompress}. + +You should provide input and remove output as described +above, and repeatedly call +@samp{BZ2_bzDecompress} until +@samp{BZ_STREAM_END} is returned. +Appearance of @samp{BZ_STREAM_END} +denotes that @samp{BZ2_bzDecompress} +has detected the logical end of the compressed stream. +@samp{BZ2_bzDecompress} will not +produce @samp{BZ_STREAM_END} until all +output data has been placed into the output buffer, so once +@samp{BZ_STREAM_END} appears, you are +guaranteed to have available all the decompressed output, and +@samp{BZ2_bzDecompressEnd} can safely +be called. + +In case of an error return value, you should call +@samp{BZ2_bzDecompressEnd} to clean up +and release memory.
+ +Possible return values: + +@example + +BZ_PARAM_ERROR + if strm is NULL or strm->s is NULL + or strm->avail_out < 1 +BZ_DATA_ERROR + if a data integrity error is detected in the compressed stream +BZ_DATA_ERROR_MAGIC + if the compressed stream doesn't begin with the right magic bytes +BZ_MEM_ERROR + if there wasn't enough memory available +BZ_STREAM_END + if the logical end of the data stream was detected and all + output has been consumed, eg strm->avail_out > 0 +BZ_OK + otherwise +@end example + +Allowable next actions: + +@example + +BZ2_bzDecompress + if BZ_OK was returned +BZ2_bzDecompressEnd + otherwise +@end example + +@node BZ2_bzDecompressEnd, , BZ2_bzDecompress, >Low-level interface +@subsection BZ2_bzDecompressEnd + +@example + +int BZ2_bzDecompressEnd ( bz_stream *strm ); +@end example + +Releases all memory associated with a decompression +stream. + +Possible return values: + +@example + +BZ_PARAM_ERROR + if strm is NULL or strm->s is NULL +BZ_OK + otherwise +@end example + +Allowable next actions: + +@example + + None. +@end example + +@node High-level interface, Utility functions, >Low-level interface, Programming with libbzip2 +@section High-level interface + +This interface provides functions for reading and writing +@samp{bzip2} format files. First, some +general points. + +@itemize @bullet{} + +@item +All of the functions take an +@samp{int*} first argument, +@samp{bzerror}. After each call, +@samp{bzerror} should be consulted +first to determine the outcome of the call. If +@samp{bzerror} is +@samp{BZ_OK}, the call completed +successfully, and only then should the return value of the +function (if any) be consulted. If +@samp{bzerror} is +@samp{BZ_IO_ERROR}, there was an +error reading/writing the underlying compressed file, and you +should then consult @samp{errno} / +@samp{perror} to determine the cause +of the difficulty. @samp{bzerror} +may also be set to various other values; precise details are +given on a per-function basis below.
+ +@item +If @samp{bzerror} indicates +an error (ie, anything except +@samp{BZ_OK} and +@samp{BZ_STREAM_END}), you should +immediately call +@samp{BZ2_bzReadClose} (or +@samp{BZ2_bzWriteClose}, depending on +whether you are attempting to read or to write) to free up all +resources associated with the stream. Once an error has been +indicated, behaviour of all calls except +@samp{BZ2_bzReadClose} +(@samp{BZ2_bzWriteClose}) is +undefined. The implication is that (1) +@samp{bzerror} should be checked +after each call, and (2) if +@samp{bzerror} indicates an error, +@samp{BZ2_bzReadClose} +(@samp{BZ2_bzWriteClose}) should then +be called to clean up. + +@item +The @samp{FILE*} arguments +passed to @samp{BZ2_bzReadOpen} / +@samp{BZ2_bzWriteOpen} should be set +to binary mode. Most Unix systems will do this by default, but +other platforms, including Windows and Mac, will not. If you +omit this, you may encounter problems when moving code to new +platforms. + +@item +Memory allocation requests are handled by +@samp{malloc} / +@samp{free}. At present there is no +facility for user-defined memory allocators in the file I/O +functions (could easily be added, though). +@end itemize + +@menu +* BZ2_bzReadOpen:: +* BZ2_bzRead:: +* BZ2_bzReadGetUnused:: +* BZ2_bzReadClose:: +* BZ2_bzWriteOpen:: +* BZ2_bzWrite:: +* BZ2_bzWriteClose:: +* Handling embedded compressed data streams:: +* Standard file-reading/writing code:: +@end menu + +@node BZ2_bzReadOpen, BZ2_bzRead, , High-level interface +@subsection BZ2_bzReadOpen + +@example + +typedef void BZFILE; + +BZFILE *BZ2_bzReadOpen( int *bzerror, FILE *f, + int verbosity, int small, + void *unused, int nUnused ); +@end example + +Prepare to read compressed data from file handle +@samp{f}. +@samp{f} should refer to a file which +has been opened for reading, and for which the error indicator +(@samp{ferror(f)})is not set. If +@samp{small} is 1, the library will try +to decompress using less memory, at the expense of speed. 
+ +For reasons explained below, +@samp{BZ2_bzRead} will decompress the +@samp{nUnused} bytes starting at +@samp{unused}, before starting to read +from the file @samp{f}. At most +@samp{BZ_MAX_UNUSED} bytes may be +supplied like this. If this facility is not required, you should +pass @samp{NULL} and +@samp{0} for +@samp{unused} and +@samp{nUnused} respectively. + +For the meaning of parameters +@samp{small} and +@samp{verbosity}, see +@samp{BZ2_bzDecompressInit}. + +The amount of memory needed to decompress a file cannot be +determined until the file's header has been read. So it is +possible that @samp{BZ2_bzReadOpen} +returns @samp{BZ_OK} but a subsequent +call of @samp{BZ2_bzRead} will return +@samp{BZ_MEM_ERROR}. + +Possible assignments to +@samp{bzerror}: + +@example + +BZ_CONFIG_ERROR + if the library has been mis-compiled +BZ_PARAM_ERROR + if f is NULL + or small is neither 0 nor 1 + or ( unused == NULL && nUnused != 0 ) + or ( unused != NULL && !(0 <= nUnused <= BZ_MAX_UNUSED) ) +BZ_IO_ERROR + if ferror(f) is nonzero +BZ_MEM_ERROR + if insufficient memory is available +BZ_OK + otherwise. +@end example + +Possible return values: + +@example + +Pointer to an abstract BZFILE + if bzerror is BZ_OK +NULL + otherwise +@end example + +Allowable next actions: + +@example + +BZ2_bzRead + if bzerror is BZ_OK +BZ2_bzReadClose + otherwise +@end example + +@node BZ2_bzRead, BZ2_bzReadGetUnused, BZ2_bzReadOpen, High-level interface +@subsection BZ2_bzRead + +@example + +int BZ2_bzRead ( int *bzerror, BZFILE *b, void *buf, int len ); +@end example + +Reads up to @samp{len} +(uncompressed) bytes from the compressed file +@samp{b} into the buffer +@samp{buf}. If the read was +successful, @samp{bzerror} is set to +@samp{BZ_OK} and the number of bytes +read is returned. If the logical end-of-stream was detected, +@samp{bzerror} will be set to +@samp{BZ_STREAM_END}, and the number of +bytes read is returned. All other +@samp{bzerror} values denote an +error.
+ +@samp{BZ2_bzRead} will supply +@samp{len} bytes, unless the logical +stream end is detected or an error occurs. Because of this, it +is possible to detect the stream end by observing when the number +of bytes returned is less than the number requested. +Nevertheless, this is regarded as inadvisable; you should instead +check @samp{bzerror} after every call +and watch out for +@samp{BZ_STREAM_END}. + +Internally, @samp{BZ2_bzRead} +copies data from the compressed file in chunks of size +@samp{BZ_MAX_UNUSED} bytes before +decompressing it. If the file contains more bytes than strictly +needed to reach the logical end-of-stream, +@samp{BZ2_bzRead} will almost certainly +read some of the trailing data before signalling +@samp{BZ_STREAM_END}. To collect the +read but unused data once +@samp{BZ_STREAM_END} has appeared, +call @samp{BZ2_bzReadGetUnused} +immediately before +@samp{BZ2_bzReadClose}. + +Possible assignments to +@samp{bzerror}: + +@example + +BZ_PARAM_ERROR + if b is NULL or buf is NULL or len < 0 +BZ_SEQUENCE_ERROR + if b was opened with BZ2_bzWriteOpen +BZ_IO_ERROR + if there is an error reading from the compressed file +BZ_UNEXPECTED_EOF + if the compressed file ended before + the logical end-of-stream was detected +BZ_DATA_ERROR + if a data integrity error was detected in the compressed stream +BZ_DATA_ERROR_MAGIC + if the stream does not begin with the requisite header bytes + (ie, is not a bzip2 data file). This is really + a special case of BZ_DATA_ERROR. +BZ_MEM_ERROR + if insufficient memory was available +BZ_STREAM_END + if the logical end of stream was detected. +BZ_OK + otherwise.
+@end example + +Possible return values: + +@example + +number of bytes read + if bzerror is BZ_OK or BZ_STREAM_END +undefined + otherwise +@end example + +Allowable next actions: + +@example + +collect data from buf, then BZ2_bzRead or BZ2_bzReadClose + if bzerror is BZ_OK +collect data from buf, then BZ2_bzReadClose or BZ2_bzReadGetUnused + if bzerror is BZ_STREAM_END +BZ2_bzReadClose + otherwise +@end example + +@node BZ2_bzReadGetUnused, BZ2_bzReadClose, BZ2_bzRead, High-level interface +@subsection BZ2_bzReadGetUnused + +@example + +void BZ2_bzReadGetUnused( int* bzerror, BZFILE *b, + void** unused, int* nUnused ); +@end example + +Returns data which was read from the compressed file but +was not needed to get to the logical end-of-stream. +@samp{*unused} is set to the address of +the data, and @samp{*nUnused} to the +number of bytes. @samp{*nUnused} will +be set to a value between @samp{0} and +@samp{BZ_MAX_UNUSED} inclusive. + +This function may only be called once +@samp{BZ2_bzRead} has signalled +@samp{BZ_STREAM_END} but before +@samp{BZ2_bzReadClose}. + +Possible assignments to +@samp{bzerror}: + +@example + +BZ_PARAM_ERROR + if b is NULL + or unused is NULL or nUnused is NULL +BZ_SEQUENCE_ERROR + if BZ_STREAM_END has not been signalled + or if b was opened with BZ2_bzWriteOpen +BZ_OK + otherwise +@end example + +Allowable next actions: + +@example + +BZ2_bzReadClose +@end example + +@node BZ2_bzReadClose, BZ2_bzWriteOpen, BZ2_bzReadGetUnused, High-level interface +@subsection BZ2_bzReadClose + +@example + +void BZ2_bzReadClose ( int *bzerror, BZFILE *b ); +@end example + +Releases all memory pertaining to the compressed file +@samp{b}. +@samp{BZ2_bzReadClose} does not call +@samp{fclose} on the underlying file +handle, so you should do that yourself if appropriate. +@samp{BZ2_bzReadClose} should be called +to clean up after all error situations.
+ +Possible assignments to +@samp{bzerror}: + +@example + +BZ_SEQUENCE_ERROR + if b was opened with BZ2_bzWriteOpen +BZ_OK + otherwise +@end example + +Allowable next actions: + +@example + +none +@end example + +@node BZ2_bzWriteOpen, BZ2_bzWrite, BZ2_bzReadClose, High-level interface +@subsection BZ2_bzWriteOpen + +@example + +BZFILE *BZ2_bzWriteOpen( int *bzerror, FILE *f, + int blockSize100k, int verbosity, + int workFactor ); +@end example + +Prepare to write compressed data to file handle +@samp{f}. +@samp{f} should refer to a file which +has been opened for writing, and for which the error indicator +(@samp{ferror(f)}) is not set. + +For the meaning of parameters +@samp{blockSize100k}, +@samp{verbosity} and +@samp{workFactor}, see +@samp{BZ2_bzCompressInit}. + +All required memory is allocated at this stage, so if the +call completes successfully, +@samp{BZ_MEM_ERROR} cannot be signalled +by a subsequent call to +@samp{BZ2_bzWrite}. + +Possible assignments to +@samp{bzerror}: + +@example + +BZ_CONFIG_ERROR + if the library has been mis-compiled +BZ_PARAM_ERROR + if f is NULL + or blockSize100k < 1 or blockSize100k > 9 +BZ_IO_ERROR + if ferror(f) is nonzero +BZ_MEM_ERROR + if insufficient memory is available +BZ_OK + otherwise +@end example + +Possible return values: + +@example + +Pointer to an abstract BZFILE + if bzerror is BZ_OK +NULL + otherwise +@end example + +Allowable next actions: + +@example + +BZ2_bzWrite + if bzerror is BZ_OK + (you could go directly to BZ2_bzWriteClose, but this would be pretty pointless) +BZ2_bzWriteClose + otherwise +@end example + +@node BZ2_bzWrite, BZ2_bzWriteClose, BZ2_bzWriteOpen, High-level interface +@subsection BZ2_bzWrite + +@example + +void BZ2_bzWrite ( int *bzerror, BZFILE *b, void *buf, int len ); +@end example + +Absorbs @samp{len} bytes from the +buffer @samp{buf}, eventually to be +compressed and written to the file.
+ +Possible assignments to +@samp{bzerror}: + +@example + +BZ_PARAM_ERROR + if b is NULL or buf is NULL or len < 0 +BZ_SEQUENCE_ERROR + if b was opened with BZ2_bzReadOpen +BZ_IO_ERROR + if there is an error writing the compressed file. +BZ_OK + otherwise +@end example + +@node BZ2_bzWriteClose, Handling embedded compressed data streams, BZ2_bzWrite, High-level interface +@subsection BZ2_bzWriteClose + +@example + +void BZ2_bzWriteClose( int *bzerror, BZFILE* f, + int abandon, + unsigned int* nbytes_in, + unsigned int* nbytes_out ); + +void BZ2_bzWriteClose64( int *bzerror, BZFILE* f, + int abandon, + unsigned int* nbytes_in_lo32, + unsigned int* nbytes_in_hi32, + unsigned int* nbytes_out_lo32, + unsigned int* nbytes_out_hi32 ); +@end example + +Compresses and flushes to the compressed file all data so +far supplied by @samp{BZ2_bzWrite}. +The logical end-of-stream markers are also written, so subsequent +calls to @samp{BZ2_bzWrite} are +illegal. All memory associated with the compressed file +@samp{b} is released. +@samp{fflush} is called on the +compressed file, but it is not +@samp{fclose}'d. + +If @samp{BZ2_bzWriteClose} is +called to clean up after an error, the only action is to release +the memory. The library records the error codes issued by +previous calls, so this situation will be detected automatically. +There is no attempt to complete the compression operation, nor to +@samp{fflush} the compressed file. You +can force this behaviour to happen even in the case of no error, +by passing a nonzero value to +@samp{abandon}. + +If @samp{nbytes_in} is non-null, +@samp{*nbytes_in} will be set to be the +total volume of uncompressed data handled. Similarly, +@samp{nbytes_out} will be set to the +total volume of compressed data written. For compatibility with +older versions of the library, +@samp{BZ2_bzWriteClose} only yields the +lower 32 bits of these counts. Use +@samp{BZ2_bzWriteClose64} if you want +the full 64 bit counts. 
These two functions are otherwise +absolutely identical. + +Possible assignments to +@samp{bzerror}: + +@example + +BZ_SEQUENCE_ERROR + if b was opened with BZ2_bzReadOpen +BZ_IO_ERROR + if there is an error writing the compressed file +BZ_OK + otherwise +@end example + +@node Handling embedded compressed data streams, Standard file-reading/writing code, BZ2_bzWriteClose, High-level interface +@subsection Handling embedded compressed data streams + +The high-level library facilitates use of +@samp{bzip2} data streams which form +some part of a surrounding, larger data stream. + +@itemize @bullet{} + +@item +For writing, the library takes an open file handle, +writes compressed data to it, +@samp{fflush}es it but does not +@samp{fclose} it. The calling +application can write its own data before and after the +compressed data stream, using that same file handle. + +@item +Reading is more complex, and the facilities are not as +general as they could be since generality is hard to reconcile +with efficiency. @samp{BZ2_bzRead} +reads from the compressed file in blocks of size +@samp{BZ_MAX_UNUSED} bytes, and in +doing so probably will overshoot the logical end of compressed +stream. To recover this data once decompression has ended, +call @samp{BZ2_bzReadGetUnused} after +the last call of @samp{BZ2_bzRead} +(the one returning +@samp{BZ_STREAM_END}) but before +calling +@samp{BZ2_bzReadClose}. +@end itemize + +This mechanism makes it easy to decompress multiple +@samp{bzip2} streams placed end-to-end. +At the end of one stream, when +@samp{BZ2_bzRead} returns +@samp{BZ_STREAM_END}, call +@samp{BZ2_bzReadGetUnused} to collect +the unused data (copy it into your own buffer somewhere). That +data forms the start of the next compressed stream. To start +uncompressing that next stream, call +@samp{BZ2_bzReadOpen} again, feeding in +the unused data via the @samp{unused} / +@samp{nUnused} parameters.
Keep doing +this until @samp{BZ_STREAM_END} return +coincides with the physical end of file +(@samp{feof(f)}). In this situation +@samp{BZ2_bzReadGetUnused} will of +course return no data. + +This should give some feel for how the high-level interface +can be used. If you require extra flexibility, you'll have to +bite the bullet and get to grips with the low-level +interface. + +@node Standard file-reading/writing code, , Handling embedded compressed data streams, High-level interface +@subsection Standard file-reading/writing code + +Here's how you'd write data to a compressed file: + +@example + +FILE* f; +BZFILE* b; +int nBuf; +char buf[ /* whatever size you like */ ]; +int bzerror; +int nWritten; + +f = fopen ( "myfile.bz2", "w" ); +if ( !f ) @{ + /* handle error */ +@} +b = BZ2_bzWriteOpen( &bzerror, f, 9 ); +if (bzerror != BZ_OK) @{ + BZ2_bzWriteClose ( b ); + /* handle error */ +@} + +while ( /* condition */ ) @{ + /* get data to write into buf, and set nBuf appropriately */ + nWritten = BZ2_bzWrite ( &bzerror, b, buf, nBuf ); + if (bzerror == BZ_IO_ERROR) @{ + BZ2_bzWriteClose ( &bzerror, b ); + /* handle error */ + @} +@} + +BZ2_bzWriteClose( &bzerror, b ); +if (bzerror == BZ_IO_ERROR) @{ + /* handle error */ +@} +@end example + +And to read from a compressed file: + +@example + +FILE* f; +BZFILE* b; +int nBuf; +char buf[ /* whatever size you like */ ]; +int bzerror; +int nWritten; + +f = fopen ( "myfile.bz2", "r" ); +if ( !f ) @{ + /* handle error */ +@} +b = BZ2_bzReadOpen ( &bzerror, f, 0, NULL, 0 ); +if ( bzerror != BZ_OK ) @{ + BZ2_bzReadClose ( &bzerror, b ); + /* handle error */ +@} + +bzerror = BZ_OK; +while ( bzerror == BZ_OK && /* arbitrary other conditions */) @{ + nBuf = BZ2_bzRead ( &bzerror, b, buf, /* size of buf */ ); + if ( bzerror == BZ_OK ) @{ + /* do something with buf[0 .. 
nBuf-1] */ + @} +@} +if ( bzerror != BZ_STREAM_END ) @{ + BZ2_bzReadClose ( &bzerror, b ); + /* handle error */ +@} else @{ + BZ2_bzReadClose ( &bzerror, b ); +@} +@end example + +@node Utility functions, zlib compatibility functions, High-level interface, Programming with libbzip2 +@section Utility functions + +@menu +* BZ2_bzBuffToBuffCompress:: +* BZ2_bzBuffToBuffDecompress:: +@end menu + +@node BZ2_bzBuffToBuffCompress, BZ2_bzBuffToBuffDecompress, , Utility functions +@subsection BZ2_bzBuffToBuffCompress + +@example + +int BZ2_bzBuffToBuffCompress( char* dest, + unsigned int* destLen, + char* source, + unsigned int sourceLen, + int blockSize100k, + int verbosity, + int workFactor ); +@end example + +Attempts to compress the data in @samp{source[0 +.. sourceLen-1]} into the destination buffer, +@samp{dest[0 .. *destLen-1]}. If the +destination buffer is big enough, +@samp{*destLen} is set to the size of +the compressed data, and @samp{BZ_OK} +is returned. If the compressed data won't fit, +@samp{*destLen} is unchanged, and +@samp{BZ_OUTBUFF_FULL} is +returned. + +Compression in this manner is a one-shot event, done with a +single call to this function. The resulting compressed data is a +complete @samp{bzip2} format data +stream. There is no mechanism for making additional calls to +provide extra input data. If you want that kind of mechanism, +use the low-level interface. + +For the meaning of parameters +@samp{blockSize100k}, +@samp{verbosity} and +@samp{workFactor}, see +@samp{BZ2_bzCompressInit}. + +To guarantee that the compressed data will fit in its +buffer, allocate an output buffer of size 1% larger than the +uncompressed data, plus six hundred extra bytes. + +@samp{BZ2_bzBuffToBuffCompress} +will not write data at or beyond +@samp{dest[*destLen]}, even in case of +buffer overflow. 
+ +Possible return values: + +@example + +BZ_CONFIG_ERROR + if the library has been mis-compiled +BZ_PARAM_ERROR + if dest is NULL or destLen is NULL + or blockSize100k < 1 or blockSize100k > 9 + or verbosity < 0 or verbosity > 4 + or workFactor < 0 or workFactor > 250 +BZ_MEM_ERROR + if insufficient memory is available +BZ_OUTBUFF_FULL + if the size of the compressed data exceeds *destLen +BZ_OK + otherwise +@end example + +@node BZ2_bzBuffToBuffDecompress, , BZ2_bzBuffToBuffCompress, Utility functions +@subsection BZ2_bzBuffToBuffDecompress + +@example + +int BZ2_bzBuffToBuffDecompress( char* dest, + unsigned int* destLen, + char* source, + unsigned int sourceLen, + int small, + int verbosity ); +@end example + +Attempts to decompress the data in @samp{source[0 +.. sourceLen-1]} into the destination buffer, +@samp{dest[0 .. *destLen-1]}. If the +destination buffer is big enough, +@samp{*destLen} is set to the size of +the uncompressed data, and @samp{BZ_OK} +is returned. If the uncompressed data won't fit, +@samp{*destLen} is unchanged, and +@samp{BZ_OUTBUFF_FULL} is +returned. + +@samp{source} is assumed to hold +a complete @samp{bzip2} format data +stream. +@samp{BZ2_bzBuffToBuffDecompress} tries +to decompress the entirety of the stream into the output +buffer. + +For the meaning of parameters +@samp{small} and +@samp{verbosity}, see +@samp{BZ2_bzDecompressInit}. + +Because the compression ratio of the compressed data cannot +be known in advance, there is no easy way to guarantee that the +output buffer will be big enough. You may of course make +arrangements in your code to record the size of the uncompressed +data, but such a mechanism is beyond the scope of this +library. + +@samp{BZ2_bzBuffToBuffDecompress} +will not write data at or beyond +@samp{dest[*destLen]}, even in case of +buffer overflow. 
+ +Possible return values: + +@example + +BZ_CONFIG_ERROR + if the library has been mis-compiled +BZ_PARAM_ERROR + if dest is NULL or destLen is NULL + or small != 0 && small != 1 + or verbosity < 0 or verbosity > 4 +BZ_MEM_ERROR + if insufficient memory is available +BZ_OUTBUFF_FULL + if the size of the uncompressed data exceeds *destLen +BZ_DATA_ERROR + if a data integrity error was detected in the compressed data +BZ_DATA_ERROR_MAGIC + if the compressed data doesn't begin with the right magic bytes +BZ_UNEXPECTED_EOF + if the compressed data ends unexpectedly +BZ_OK + otherwise +@end example + +@node zlib compatibility functions, Using the library in a stdio-free environment, Utility functions, Programming with libbzip2 +@section zlib compatibility functions + +Yoshioka Tsuneo has contributed some functions to give +better @samp{zlib} compatibility. +These functions are @samp{BZ2_bzopen}, +@samp{BZ2_bzread}, +@samp{BZ2_bzwrite}, +@samp{BZ2_bzflush}, +@samp{BZ2_bzclose}, +@samp{BZ2_bzerror} and +@samp{BZ2_bzlibVersion}. These +functions are not (yet) officially part of the library. If they +break, you get to keep all the pieces. Nevertheless, I think +they work ok. + +@example + +typedef void BZFILE; + +const char * BZ2_bzlibVersion ( void ); +@end example + +Returns a string indicating the library version. + +@example + +BZFILE * BZ2_bzopen ( const char *path, const char *mode ); +BZFILE * BZ2_bzdopen ( int fd, const char *mode ); +@end example + +Opens a @samp{.bz2} file for +reading or writing, using either its name or a pre-existing file +descriptor. Analogous to @samp{fopen} +and @samp{fdopen}. + +@example + +int BZ2_bzread ( BZFILE* b, void* buf, int len ); +int BZ2_bzwrite ( BZFILE* b, void* buf, int len ); +@end example + +Reads/writes data from/to a previously opened +@samp{BZFILE}. Analogous to +@samp{fread} and +@samp{fwrite}. + +@example + +int BZ2_bzflush ( BZFILE* b ); +void BZ2_bzclose ( BZFILE* b ); +@end example + +Flushes/closes a @samp{BZFILE}. 
+@samp{BZ2_bzflush} doesn't actually do +anything. Analogous to @samp{fflush} +and @samp{fclose}. + +@example + +const char * BZ2_bzerror ( BZFILE *b, int *errnum ); +@end example + +Returns a string describing the most recent error status of +@samp{b}, and also sets +@samp{*errnum} to its numerical +value. + +@node Using the library in a stdio-free environment, Making a Windows DLL, zlib compatibility functions, Programming with libbzip2 +@section Using the library in a stdio-free environment + +@menu +* Getting rid of stdio:: +* Critical error handling:: +@end menu + +@node Getting rid of stdio, Critical error handling, , Using the library in a stdio-free environment +@subsection Getting rid of stdio + +In a deeply embedded application, you might want to use +just the memory-to-memory functions. You can do this +conveniently by compiling the library with preprocessor symbol +@samp{BZ_NO_STDIO} defined. Doing this +gives you a library containing only the following eight +functions: + +@samp{BZ2_bzCompressInit}, +@samp{BZ2_bzCompress}, +@samp{BZ2_bzCompressEnd}, +@samp{BZ2_bzDecompressInit}, +@samp{BZ2_bzDecompress}, +@samp{BZ2_bzDecompressEnd}, +@samp{BZ2_bzBuffToBuffCompress}, +@samp{BZ2_bzBuffToBuffDecompress} + +When compiled like this, all functions will ignore +@samp{verbosity} settings. + +@node Critical error handling, , Getting rid of stdio, Using the library in a stdio-free environment +@subsection Critical error handling + +@samp{libbzip2} contains a number +of internal assertion checks which should, needless to say, never +be activated. Nevertheless, if an assertion should fail, +behaviour depends on whether or not the library was compiled with +@samp{BZ_NO_STDIO} set. + +For a normal compile, an assertion failure yields the +message: + +@quotation + +bzip2/libbzip2: internal error number N. + +This is a bug in bzip2/libbzip2, 1.0.3 of 15 February 2005. +Please report it to me at: jseward@@bzip.org. 
If this happened +when you were using some program which uses libbzip2 as a +component, you should also report this bug to the author(s) +of that program. Please make an effort to report this bug; +timely and accurate bug reports eventually lead to higher +quality software. Thanks. Julian Seward, 15 February 2005. +@end quotation + +where @samp{N} is some error code +number. If @samp{N == 1007}, it also +prints some extra text advising the reader that unreliable memory +is often associated with internal error 1007. (This is a +frequently-observed-phenomenon with versions 1.0.0/1.0.1). + +@samp{exit(3)} is then +called. + +For a @samp{stdio}-free library, +assertion failures result in a call to a function declared +as: + +@example + +extern void bz_internal_error ( int errcode ); +@end example + +The relevant code is passed as a parameter. You should +supply such a function. + +In either case, once an assertion failure has occurred, any +@samp{bz_stream} records involved can +be regarded as invalid. You should not attempt to resume normal +operation with them. + +You may, of course, change critical error handling to suit +your needs. As I said above, critical errors indicate bugs in +the library and should not occur. All "normal" error situations +are indicated via error return codes from functions, and can be +recovered from. + +@node Making a Windows DLL, , Using the library in a stdio-free environment, Programming with libbzip2 +@section Making a Windows DLL + +Everything related to Windows has been contributed by +Yoshioka Tsuneo +(@samp{QWF00133@@niftyserve.or.jp} / +@samp{tsuneo-y@@is.aist-nara.ac.jp}), so +you should send your queries to him (but perhaps Cc: me, +@samp{jseward@@bzip.org}). + +My vague understanding of what to do is: using Visual C++ +5.0, open the project file +@samp{libbz2.dsp}, and build. That's +all. 
+ +If you can't open the project file for some reason, make a +new one, naming these files: +@samp{blocksort.c}, +@samp{bzlib.c}, +@samp{compress.c}, +@samp{crctable.c}, +@samp{decompress.c}, +@samp{huffman.c}, +@samp{randtable.c} and +@samp{libbz2.def}. You will also need +to name the header files @samp{bzlib.h} +and @samp{bzlib_private.h}. + +If you don't use VC++, you may need to define the +preprocessor symbol +@samp{_WIN32}. + +Finally, @samp{dlltest.c} is a +sample program using the DLL. It has a project file, +@samp{dlltest.dsp}. + +If you just want a makefile for Visual C, have a look at +@samp{makefile.msc}. + +Be aware that if you compile +@samp{bzip2} itself on Win32, you must +set @samp{BZ_UNIX} to 0 and +@samp{BZ_LCCWIN32} to 1, in the file +@samp{bzip2.c}, before compiling. +Otherwise the resulting binary won't work correctly. + +I haven't tried any of this stuff myself, but it all looks +plausible. + +@node Miscellanea, , Programming with libbzip2, Top +@chapter Miscellanea + +These are just some random thoughts of mine. Your mileage +may vary. + +@menu +* Limitations of the compressed file format:: +* Portability issues:: +* Reporting bugs:: +* Did you get the right package?:: +* Further Reading:: +@end menu + +@node Limitations of the compressed file format, Portability issues, , Miscellanea +@section Limitations of the compressed file format + +@samp{bzip2-1.0.X}, +@samp{0.9.5} and +@samp{0.9.0} use exactly the same file +format as the original version, +@samp{bzip2-0.1}. This decision was +made in the interests of stability. Creating yet another +incompatible compressed file format would create further +confusion and disruption for users. + +Nevertheless, this is not a painless decision. Development +work since the release of +@samp{bzip2-0.1} in August 1997 has +shown complexities in the file format which slow down +decompression and, in retrospect, are unnecessary. 
These +are: + +@itemize @bullet{} + +@item +The run-length encoder, which is the first of the +compression transformations, is entirely irrelevant. The +original purpose was to protect the sorting algorithm from the +very worst case input: a string of repeated symbols. But +algorithm steps Q6a and Q6b in the original Burrows-Wheeler +technical report (SRC-124) show how repeats can be handled +without difficulty in block sorting. + +@item +The randomisation mechanism doesn't really need to be +there. Udi Manber and Gene Myers published a suffix array +construction algorithm a few years back, which can be employed +to sort any block, no matter how repetitive, in O(N log N) +time. Subsequent work by Kunihiko Sadakane has produced a +derivative O(N (log N)^2) algorithm which usually outperforms +the Manber-Myers algorithm. + +I could have changed to Sadakane's algorithm, but I find +it to be slower than @samp{bzip2}'s +existing algorithm for most inputs, and the randomisation +mechanism protects adequately against bad cases. I didn't +think it was a good tradeoff to make. Partly this is due to +the fact that I was not flooded with email complaints about +@samp{bzip2-0.1}'s performance on +repetitive data, so perhaps it isn't a problem for real +inputs. + +Probably the best long-term solution, and the one I have +incorporated into 0.9.5 and above, is to use the existing +sorting algorithm initially, and fall back to a O(N (log N)^2) +algorithm if the standard algorithm gets into +difficulties. + +@item +The compressed file format was never designed to be +handled by a library, and I have had to jump through some hoops +to produce an efficient implementation of decompression. It's +a bit hairy. Try passing +@samp{decompress.c} through the C +preprocessor and you'll see what I mean. Much of this +complexity could have been avoided if the compressed size of +each block of data was recorded in the data stream. 
+ +@item +An Adler-32 checksum, rather than a CRC32 checksum, +would be faster to compute. +@end itemize + +It would be fair to say that the +@samp{bzip2} format was frozen before I +properly and fully understood the performance consequences of +doing so. + +Improvements which I was able to incorporate into 0.9.0, +despite using the same file format, are: + +@itemize @bullet{} + +@item +Single array implementation of the inverse BWT. This +significantly speeds up decompression, presumably because it +reduces the number of cache misses. + +@item +Faster inverse MTF transform for large MTF values. +The new implementation is based on the notion of sliding blocks +of values. + +@item +@samp{bzip2-0.9.0} now reads +and writes files with @samp{fread} +and @samp{fwrite}; version 0.1 used +@samp{putc} and +@samp{getc}. Duh! Well, you live +and learn. +@end itemize + +Further ahead, it would be nice to be able to do random +access into files. This will require some careful design of +compressed file formats. + +@node Portability issues, Reporting bugs, Limitations of the compressed file format, Miscellanea +@section Portability issues + +After some consideration, I have decided not to use GNU +@samp{autoconf} to configure 0.9.5 or +1.0. + +@samp{autoconf}, admirable and +wonderful though it is, mainly assists with portability problems +between Unix-like platforms. But +@samp{bzip2} doesn't have much in the +way of portability problems on Unix; most of the difficulties +appear when porting to the Mac, or to Microsoft's operating +systems. @samp{autoconf} doesn't help +in those cases, and brings in a whole load of new +complexity. + +Most people should be able to compile the library and +program under Unix straight out-of-the-box, so to speak, +especially if you have a version of GNU C available. + +There are a couple of +@samp{__inline__} directives in the +code. GNU C (@samp{gcc}) should be +able to handle them. 
If you're not using GNU C, your C compiler +shouldn't see them at all. If your compiler does, for some +reason, see them and doesn't like them, just +@samp{#define} +@samp{__inline__} to be +@samp{/* */}. One easy way to do this +is to compile with the flag +@samp{-D__inline__=}, which should be +understood by most Unix compilers. + +If you still have difficulties, try compiling with the +macro @samp{BZ_STRICT_ANSI} defined. +This should enable you to build the library in a strictly ANSI +compliant environment. Building the program itself like this is +dangerous and not supported, since you remove +@samp{bzip2}'s checks against +compressing directories, symbolic links, devices, and other +not-really-a-file entities. This could cause filesystem +corruption! + +One other thing: if you create a +@samp{bzip2} binary for public distribution, +please consider linking it statically (@samp{gcc +-static}). This avoids all sorts of library-version +issues that others may encounter later on. + +If you build @samp{bzip2} on +Win32, you must set @samp{BZ_UNIX} to 0 +and @samp{BZ_LCCWIN32} to 1, in the +file @samp{bzip2.c}, before compiling. +Otherwise the resulting binary won't work correctly. + +@node Reporting bugs, Did you get the right package?, Portability issues, Miscellanea +@section Reporting bugs + +I tried pretty hard to make sure +@samp{bzip2} is bug free, both by +design and by testing. Hopefully you'll never need to read this +section for real. + +Nevertheless, if @samp{bzip2} dies +with a segmentation fault, a bus error or an internal assertion +failure, it will ask you to email me a bug report. Experience from +years of feedback of bzip2 users indicates that almost all these +problems can be traced to either compiler bugs or hardware +problems. + +@itemize @bullet{} + +@item +Recompile the program with no optimisation, and +see if it works. And/or try a different compiler. 
I heard all +sorts of stories about various flavours of GNU C (and other +compilers) generating bad code for +@samp{bzip2}, and I've run across two +such examples myself. + +2.7.X versions of GNU C are known to generate bad code +from time to time, at high optimisation levels. If you get +problems, try using the flags +@samp{-O2} +@samp{-fomit-frame-pointer} +@samp{-fno-strength-reduce}. You +should specifically @i{not} use +@samp{-funroll-loops}. + +You may notice that the Makefile runs six tests as part +of the build process. If the program passes all of these, it's +a pretty good (but not 100%) indication that the compiler has +done its job correctly. + +@item +If @samp{bzip2} +crashes randomly, and the crashes are not repeatable, you may +have a flaky memory subsystem. +@samp{bzip2} really hammers your +memory hierarchy, and if it's a bit marginal, you may get these +problems. Ditto if your disk or I/O subsystem is slowly +failing. Yup, this really does happen. + +Try using a different machine of the same type, and see +if you can repeat the problem. + +@item +This isn't really a bug, but ... If +@samp{bzip2} tells you your file is +corrupted on decompression, and you obtained the file via FTP, +there is a possibility that you forgot to tell FTP to do a +binary mode transfer. That absolutely will cause the file to +be non-decompressible. You'll have to transfer it +again. +@end itemize + +If you've incorporated +@samp{libbzip2} into your own program +and are getting problems, please, please, please, check that the +parameters you are passing in calls to the library, are correct, +and in accordance with what the documentation says is allowable. +I have tried to make the library robust against such problems, +but I'm sure I haven't succeeded. + +Finally, if the above comments don't help, you'll have to +send me a bug report. 
Now, it's just amazing how many people +will send me a bug report saying something like: + +@example + +bzip2 crashed with segmentation fault on my machine +@end example + +and absolutely nothing else. Needless to say, such a +report is @i{totally, utterly, completely and +comprehensively 100% useless; a waste of your time, my time, and +net bandwidth}. With no details at all, there's no way +I can possibly begin to figure out what the problem is. + +The rules of the game are: facts, facts, facts. Don't omit +them because "oh, they won't be relevant". At the bare +minimum: + +@example + +Machine type. Operating system version. +Exact version of bzip2 (do bzip2 -V). +Exact version of the compiler used. +Flags passed to the compiler. +@end example + +However, the most important single thing that will help me +is the file that you were trying to compress or decompress at the +time the problem happened. Without that, my ability to do +anything more than speculate about the cause, is limited. + +@node Did you get the right package?, Further Reading, Reporting bugs, Miscellanea +@section Did you get the right package? + +@samp{bzip2} is a resource hog. +It soaks up large amounts of CPU cycles and memory. Also, it +gives very large latencies. In the worst case, you can feed many +megabytes of uncompressed data into the library before getting +any compressed output, so this probably rules out applications +requiring interactive behaviour. + +These aren't faults of my implementation, I hope, but more +an intrinsic property of the Burrows-Wheeler transform +(unfortunately). Maybe this isn't what you want. + +If you want a compressor and/or library which is faster, +uses less memory but gets pretty good compression, and has +minimal latency, consider Jean-loup Gailly's and Mark Adler's +work, @samp{zlib-1.2.1} and +@samp{gzip-1.2.4}. Look for them at +@uref{http://www.zlib.org,http://www.zlib.org} and +@uref{http://www.gzip.org,http://www.gzip.org} +respectively. 
+ +For something faster and lighter still, you might try Markus F +X J Oberhumer's @samp{LZO} real-time +compression/decompression library, at +@uref{http://www.oberhumer.com/opensource,http://www.oberhumer.com/opensource}. + +@node Further Reading, , Did you get the right package?, Miscellanea +@section Further Reading + +@samp{bzip2} is not research +work, in the sense that it doesn't present any new ideas. +Rather, it's an engineering exercise based on existing +ideas. + +Four documents describe essentially all the ideas behind +@samp{bzip2}: + +@display +Michael Burrows and D. J. Wheeler: + "A block-sorting lossless data compression algorithm" + 10th May 1994. + Digital SRC Research Report 124. + ftp://ftp.digital.com/pub/DEC/SRC/research-reports/SRC-124.ps.gz + If you have trouble finding it, try searching at the + New Zealand Digital Library, http://www.nzdl.org. + +Daniel S. Hirschberg and Debra A. LeLewer + "Efficient Decoding of Prefix Codes" + Communications of the ACM, April 1990, Vol 33, Number 4. + You might be able to get an electronic copy of this + from the ACM Digital Library. + +David J. Wheeler + Program bred3.c and accompanying document bred3.ps. + This contains the idea behind the multi-table Huffman coding scheme. + ftp://ftp.cl.cam.ac.uk/users/djw3/ + +Jon L. Bentley and Robert Sedgewick + "Fast Algorithms for Sorting and Searching Strings" + Available from Sedgewick's web page, + www.cs.princeton.edu/~rs +@end display + +The following paper gives valuable additional insights into +the algorithm, but is not immediately the basis of any code used +in bzip2. + +@display +Peter Fenwick: + Block Sorting Text Compression + Proceedings of the 19th Australasian Computer Science Conference, + Melbourne, Australia. Jan 31 - Feb 2, 1996. 
+ ftp://ftp.cs.auckland.ac.nz/pub/peter-f/ACSC96paper.ps +@end display + +Kunihiko Sadakane's sorting algorithm, mentioned above, is +available from: + +@display +http://naomi.is.s.u-tokyo.ac.jp/~sada/papers/Sada98b.ps.gz +@end display + +The Manber-Myers suffix array construction algorithm is +described in a paper available from: + +@display +http://www.cs.arizona.edu/people/gene/PAPERS/suffix.ps +@end display + +Finally, the following papers document some +investigations I made into the performance of sorting +and decompression algorithms: + +@display +Julian Seward + On the Performance of BWT Sorting Algorithms + Proceedings of the IEEE Data Compression Conference 2000 + Snowbird, Utah. 28-30 March 2000. + +Julian Seward + Space-time Tradeoffs in the Inverse B-W Transform + Proceedings of the IEEE Data Compression Conference 2001 + Snowbird, Utah. 27-29 March 2001. +@end display + +@bye --- a/bzexe.1 2011-12-04 13:55:53.589856334 +1100 +++ b/bzexe.1 2011-12-04 18:16:28.000000000 +1100 @@ -0,0 +1,43 @@ +.TH BZEXE 1 +.SH NAME +bzexe \- compress executable files in place +.SH SYNOPSIS +.B bzexe +[ name ... ] +.SH DESCRIPTION +The +.I bzexe +utility allows you to compress executables in place and have them +automatically uncompress and execute when you run them (at a penalty +in performance). For example if you execute ``bzexe /bin/cat'' it +will create the following two files: +.nf +.br + -r-xr-xr-x 1 root bin 9644 Feb 11 11:16 /bin/cat + -r-xr-xr-x 1 bin bin 24576 Nov 23 13:21 /bin/cat~ +.fi +/bin/cat~ is the original file and /bin/cat is the self-uncompressing +executable file. You can remove /bin/cat~ once you are sure that +/bin/cat works properly. +.PP +This utility is most useful on systems with very small disks. +.SH OPTIONS +.TP +.B \-d +Decompress the given executables instead of compressing them. +.SH "SEE ALSO" +bzip2(1), znew(1), zmore(1), zcmp(1), zforce(1) +.SH CAVEATS +The compressed executable is a shell script. This may create some +security holes. 
In particular, the compressed executable relies +on the PATH environment variable to find +.I bzip2 +and some other utilities +.I (tail, chmod, ln, sleep). +.SH "BUGS" +.I bzexe +attempts to retain the original file attributes on the compressed executable, +but you may have to fix them manually in some cases, using +.I chmod +or +.I chown. --- a/manual.info 2011-12-04 13:55:53.589856334 +1100 +++ b/manual.info 2011-12-04 18:16:28.000000000 +1100 @@ -0,0 +1,2338 @@ +This is manual.info, produced by makeinfo version 4.8 from manual.texi. + +START-INFO-DIR-ENTRY +* Bzip2: (bzip2). A program and library for data compression. +END-INFO-DIR-ENTRY + + +File: manual.info, Node: Top, Next: Introduction, Up: (dir) + +bzip2 and libbzip2, version 1.0.3 +********************************* + +* Menu: + +* Introduction:: +* How to use bzip2:: +* Programming with libbzip2:: +* Miscellanea:: + +--- The Detailed Node Listing --- + +How to use bzip2 + +* NAME:: +* SYNOPSIS:: +* DESCRIPTION:: +* OPTIONS:: +* MEMORY MANAGEMENT:: +* RECOVERING DATA FROM DAMAGED FILES:: +* PERFORMANCE NOTES:: +* CAVEATS:: +* AUTHOR:: + + Programming with libbzip2 + +* Top-level structure:: +* Error handling:: +* Low-level interface: >Low-level interface. +* High-level interface:: +* Utility functions:: +* zlib compatibility functions:: +* Using the library in a stdio-free environment:: +* Making a Windows DLL:: + +Miscellanea + +* Limitations of the compressed file format:: +* Portability issues:: +* Reporting bugs:: +* Did you get the right package?:: +* Further Reading:: + + +File: manual.info, Node: Introduction, Next: How to use bzip2, Prev: Top, Up: Top + +1 Introduction +************** + +`bzip2' compresses files using the Burrows-Wheeler block-sorting text +compression algorithm, and Huffman coding. Compression is generally +considerably better than that achieved by more conventional +LZ77/LZ78-based compressors, and approaches the performance of the PPM +family of statistical compressors. 
+ + `bzip2' is built on top of `libbzip2', a flexible library for +handling compressed data in the `bzip2' format. This manual describes +both how to use the program and how to work with the library interface. +Most of the manual is devoted to this library, not the program, which +is good news if your interest is only in the program. + + * *Note How to use bzip2: How to use bzip2. describes how to use + `bzip2'; this is the only part you need to read if you just want + to know how to operate the program. + + * *Note Programming with libbzip2: Programming with libbzip2. + describes the programming interfaces in detail, and + + * *Note Miscellanea: Miscellanea. records some miscellaneous notes + which I thought ought to be recorded somewhere. + + +File: manual.info, Node: How to use bzip2, Next: Programming with libbzip2, Prev: Introduction, Up: Top + +2 How to use bzip2 +****************** + +This chapter contains a copy of the `bzip2' man page, and nothing else. + +* Menu: + +* NAME:: +* SYNOPSIS:: +* DESCRIPTION:: +* OPTIONS:: +* MEMORY MANAGEMENT:: +* RECOVERING DATA FROM DAMAGED FILES:: +* PERFORMANCE NOTES:: +* CAVEATS:: +* AUTHOR:: + + +File: manual.info, Node: NAME, Next: SYNOPSIS, Up: How to use bzip2 + +2.1 NAME +======== + + * `bzip2', `bunzip2' - a block-sorting file compressor, v1.0.3 + + * `bzcat' - decompresses files to stdout + + * `bzip2recover' - recovers data from damaged bzip2 files + + +File: manual.info, Node: SYNOPSIS, Next: DESCRIPTION, Prev: NAME, Up: How to use bzip2 + +2.2 SYNOPSIS +============ + + * `bzip2' [ -cdfkqstvzVL123456789 ] [ filenames ... ] + + * `bunzip2' [ -fkvsVL ] [ filenames ... ] + + * `bzcat' [ -s ] [ filenames ... ] + + * `bzip2recover' filename + + +File: manual.info, Node: DESCRIPTION, Next: OPTIONS, Prev: SYNOPSIS, Up: How to use bzip2 + +2.3 DESCRIPTION +=============== + +`bzip2' compresses files using the Burrows-Wheeler block sorting text +compression algorithm, and Huffman coding. 
Compression is generally +considerably better than that achieved by more conventional +LZ77/LZ78-based compressors, and approaches the performance of the PPM +family of statistical compressors. + + The command-line options are deliberately very similar to those of +GNU `gzip', but they are not identical. + + `bzip2' expects a list of file names to accompany the command-line +flags. Each file is replaced by a compressed version of itself, with +the name `original_name.bz2'. Each compressed file has the same +modification date, permissions, and, when possible, ownership as the +corresponding original, so that these properties can be correctly +restored at decompression time. File name handling is naive in the +sense that there is no mechanism for preserving original file names, +permissions, ownerships or dates in filesystems which lack these +concepts, or have serious file name length restrictions, such as MS-DOS. + + `bzip2' and `bunzip2' will by default not overwrite existing files. +If you want this to happen, specify the `-f' flag. + + If no file names are specified, `bzip2' compresses from standard +input to standard output. In this case, `bzip2' will decline to write +compressed output to a terminal, as this would be entirely +incomprehensible and therefore pointless. + + `bunzip2' (or `bzip2 -d') decompresses all specified files. Files +which were not created by `bzip2' will be detected and ignored, and a +warning issued. 
`bzip2' attempts to guess the filename for the +decompressed file from that of the compressed file as follows: + + * `filename.bz2 ' becomes `filename' + + * `filename.bz ' becomes `filename' + + * `filename.tbz2' becomes `filename.tar' + + * `filename.tbz ' becomes `filename.tar' + + * `anyothername ' becomes `anyothername.out' + + If the file does not end in one of the recognised endings, `.bz2', +`.bz', `.tbz2' or `.tbz', `bzip2' complains that it cannot guess the +name of the original file, and uses the original name with `.out' +appended. + + As with compression, supplying no filenames causes decompression +from standard input to standard output. + + `bunzip2' will correctly decompress a file which is the +concatenation of two or more compressed files. The result is the +concatenation of the corresponding uncompressed files. Integrity testing +(`-t') of concatenated compressed files is also supported. + + You can also compress or decompress files to the standard output by +giving the `-c' flag. Multiple files may be compressed and +decompressed like this. The resulting outputs are fed sequentially to +stdout. Compression of multiple files in this manner generates a stream +containing multiple compressed file representations. Such a stream can +be decompressed correctly only by `bzip2' version 0.9.0 or later. +Earlier versions of `bzip2' will stop after decompressing the first +file in the stream. + + `bzcat' (or `bzip2 -dc') decompresses all specified files to the +standard output. + + `bzip2' will read arguments from the environment variables `BZIP2' +and `BZIP', in that order, and will process them before any arguments +read from the command line. This gives a convenient way to supply +default arguments. + + Compression is always performed, even if the compressed file is +slightly larger than the original. Files of less than about one hundred +bytes tend to get larger, since the compression mechanism has a +constant overhead in the region of 50 bytes. 
Random data (including +the output of most file compressors) is coded at about 8.05 bits per +byte, giving an expansion of around 0.5%. + + As a self-check for your protection, `bzip2' uses 32-bit CRCs to make +sure that the decompressed version of a file is identical to the +original. This guards against corruption of the compressed data, and +against undetected bugs in `bzip2' (hopefully very unlikely). The +chance of data corruption going undetected is microscopic, about one +chance in four billion for each file processed. Be aware, though, that +the check occurs upon decompression, so it can only tell you that +something is wrong. It can't help you recover the original uncompressed +data. You can use `bzip2recover' to try to recover data from damaged +files. + + Return values: 0 for a normal exit, 1 for environmental problems +(file not found, invalid flags, I/O errors, etc.), 2 to indicate a +corrupt compressed file, 3 for an internal consistency error (eg, bug) +which caused `bzip2' to panic. + + +File: manual.info, Node: OPTIONS, Next: MEMORY MANAGEMENT, Prev: DESCRIPTION, Up: How to use bzip2 + +2.4 OPTIONS +=========== + +`-c --stdout' + Compress or decompress to standard output. + +`-d --decompress' + Force decompression. `bzip2', `bunzip2' and `bzcat' are really + the same program, and the decision about what actions to take is + done on the basis of which name is used. This flag overrides that + mechanism, and forces bzip2 to decompress. + +`-z --compress' + The complement to `-d': forces compression, regardless of the + invocation name. + +`-t --test' + Check integrity of the specified file(s), but don't decompress + them. This really performs a trial decompression and throws away + the result. + +`-f --force' + Force overwrite of output files. Normally, `bzip2' will not + overwrite existing output files. Also forces `bzip2' to break hard + links to files, which it otherwise wouldn't do.
+ + `bzip2' normally declines to decompress files which don't have the + correct magic header bytes. If forced (`-f'), however, it will + pass such files through unmodified. This is how GNU `gzip' behaves. + +`-k --keep' + Keep (don't delete) input files during compression or + decompression. + +`-s --small' + Reduce memory usage, for compression, decompression and testing. + Files are decompressed and tested using a modified algorithm which + only requires 2.5 bytes per block byte. This means any file can be + decompressed in 2300k of memory, albeit at about half the normal + speed. + + During compression, `-s' selects a block size of 200k, which + limits memory use to around the same figure, at the expense of + your compression ratio. In short, if your machine is low on memory + (8 megabytes or less), use `-s' for everything. See *Note MEMORY + MANAGEMENT: MEMORY MANAGEMENT. below. + +`-q --quiet' + Suppress non-essential warning messages. Messages pertaining to + I/O errors and other critical events will not be suppressed. + +`-v --verbose' + Verbose mode - show the compression ratio for each file processed. + Further `-v''s increase the verbosity level, spewing out lots of + information which is primarily of interest for diagnostic purposes. + +`-L --license -V --version' + Display the software version, license terms and conditions. + +`-1' (or `--fast') to `-9' (or `--best') + Set the block size to 100 k, 200 k ... 900 k when compressing. Has + no effect when decompressing. See *Note MEMORY MANAGEMENT: MEMORY + MANAGEMENT. below. The `--fast' and `--best' aliases are primarily + for GNU `gzip' compatibility. In particular, `--fast' doesn't + make things significantly faster. And `--best' merely selects the + default behaviour. + +`--' + Treats all subsequent arguments as file names, even if they start + with a dash. This is so you can handle files with names beginning + with a dash, for example: `bzip2 -- -myfilename'.
+ +`--repetitive-fast' +`--repetitive-best' + These flags are redundant in versions 0.9.5 and above. They + provided some coarse control over the behaviour of the sorting + algorithm in earlier versions, which was sometimes useful. 0.9.5 + and above have an improved algorithm which renders these flags + irrelevant. + + +File: manual.info, Node: MEMORY MANAGEMENT, Next: RECOVERING DATA FROM DAMAGED FILES, Prev: OPTIONS, Up: How to use bzip2 + +2.5 MEMORY MANAGEMENT +===================== + +`bzip2' compresses large files in blocks. The block size affects both +the compression ratio achieved, and the amount of memory needed for +compression and decompression. The flags `-1' through `-9' specify the +block size to be 100,000 bytes through 900,000 bytes (the default) +respectively. At decompression time, the block size used for +compression is read from the header of the compressed file, and +`bunzip2' then allocates itself just enough memory to decompress the +file. Since block sizes are stored in compressed files, it follows that +the flags `-1' to `-9' are irrelevant to and so ignored during +decompression. + + Compression and decompression requirements, in bytes, can be +estimated as: + + + Compression: 400k + ( 8 x block size ) + + Decompression: 100k + ( 4 x block size ), or + 100k + ( 2.5 x block size ) + + Larger block sizes give rapidly diminishing marginal returns. Most +of the compression comes from the first two or three hundred k of block +size, a fact worth bearing in mind when using `bzip2' on small machines. +It is also important to appreciate that the decompression memory +requirement is set at compression time by the choice of block size. + + For files compressed with the default 900k block size, `bunzip2' +will require about 3700 kbytes to decompress. To support decompression +of any file on a 4 megabyte machine, `bunzip2' has an option to +decompress using approximately half this amount of memory, about 2300 +kbytes. 
Decompression speed is also halved, so you should use this +option only where necessary. The relevant flag is `-s'. + + In general, try and use the largest block size memory constraints +allow, since that maximises the compression achieved. Compression and +decompression speed are virtually unaffected by block size. + + Another significant point applies to files which fit in a single +block - that means most files you'd encounter using a large block size. +The amount of real memory touched is proportional to the size of the +file, since the file is smaller than a block. For example, compressing +a file 20,000 bytes long with the flag `-9' will cause the compressor +to allocate around 7600k of memory, but only touch 400k + 20000 * 8 = +560 kbytes of it. Similarly, the decompressor will allocate 3700k but +only touch 100k + 20000 * 4 = 180 kbytes. + + Here is a table which summarises the maximum memory usage for +different block sizes. Also recorded is the total compressed size for +14 files of the Calgary Text Compression Corpus totalling 3,141,622 +bytes. This column gives some feel for how compression varies with +block size. These figures tend to understate the advantage of larger +block sizes for larger files, since the Corpus is dominated by smaller +files. + + + Compress Decompress Decompress Corpus + Flag usage usage -s usage Size + + -1 1200k 500k 350k 914704 + -2 2000k 900k 600k 877703 + -3 2800k 1300k 850k 860338 + -4 3600k 1700k 1100k 846899 + -5 4400k 2100k 1350k 845160 + -6 5200k 2500k 1600k 838626 + -7 6100k 2900k 1850k 834096 + -8 6800k 3300k 2100k 828642 + -9 7600k 3700k 2350k 828642 + + +File: manual.info, Node: RECOVERING DATA FROM DAMAGED FILES, Next: PERFORMANCE NOTES, Prev: MEMORY MANAGEMENT, Up: How to use bzip2 + +2.6 RECOVERING DATA FROM DAMAGED FILES +====================================== + +`bzip2' compresses files in blocks, usually 900kbytes long. Each block +is handled independently. 
If a media or transmission error causes a +multi-block `.bz2' file to become damaged, it may be possible to +recover data from the undamaged blocks in the file. + + The compressed representation of each block is delimited by a 48-bit +pattern, which makes it possible to find the block boundaries with +reasonable certainty. Each block also carries its own 32-bit CRC, so +damaged blocks can be distinguished from undamaged ones. + + `bzip2recover' is a simple program whose purpose is to search for +blocks in `.bz2' files, and write each block out into its own `.bz2' +file. You can then use `bzip2 -t' to test the integrity of the +resulting files, and decompress those which are undamaged. + + `bzip2recover' takes a single argument, the name of the damaged +file, and writes a number of files `rec0001file.bz2', +`rec0002file.bz2', etc, containing the extracted blocks. The output +filenames are designed so that the use of wildcards in subsequent +processing - for example, `bzip2 -dc rec*file.bz2 > recovered_data' - +lists the files in the correct order. + + `bzip2recover' should be of most use dealing with large `.bz2' +files, as these will contain many blocks. It is clearly futile to use +it on damaged single-block files, since a damaged block cannot be +recovered. If you wish to minimise any potential data loss through +media or transmission errors, you might consider compressing with a +smaller block size. + + +File: manual.info, Node: PERFORMANCE NOTES, Next: CAVEATS, Prev: RECOVERING DATA FROM DAMAGED FILES, Up: How to use bzip2 + +2.7 PERFORMANCE NOTES +===================== + +The sorting phase of compression gathers together similar strings in +the file. Because of this, files containing very long runs of repeated +symbols, like "aabaabaabaab ..." (repeated several hundred times) may +compress more slowly than normal. Versions 0.9.5 and above fare much +better than previous versions in this respect. 
The ratio between +worst-case and average-case compression time is in the region of 10:1. +For previous versions, this figure was more like 100:1. You can use the +`-vvvv' option to monitor progress in great detail, if you want. + + Decompression speed is unaffected by these phenomena. + + `bzip2' usually allocates several megabytes of memory to operate in, +and then charges all over it in a fairly random fashion. This means +that performance, both for compressing and decompressing, is largely +determined by the speed at which your machine can service cache misses. +Because of this, small changes to the code to reduce the miss rate have +been observed to give disproportionately large performance +improvements. I imagine `bzip2' will perform best on machines with very +large caches. + + +File: manual.info, Node: CAVEATS, Next: AUTHOR, Prev: PERFORMANCE NOTES, Up: How to use bzip2 + +2.8 CAVEATS +=========== + +I/O error messages are not as helpful as they could be. `bzip2' tries +hard to detect I/O errors and exit cleanly, but the details of what the +problem is sometimes seem rather misleading. + + This manual page pertains to version 1.0.3 of `bzip2'. Compressed +data created by this version is entirely forwards and backwards +compatible with the previous public releases, versions 0.1pl2, 0.9.0 and +0.9.5, 1.0.0, 1.0.1 and 1.0.2, but with the following exception: 0.9.0 +and above can correctly decompress multiple concatenated compressed +files. 0.1pl2 cannot do this; it will stop after decompressing just the +first file in the stream. + + `bzip2recover' versions prior to 1.0.2 used 32-bit integers to +represent bit positions in compressed files, so it could not handle +compressed files more than 512 megabytes long. Versions 1.0.2 and above +use 64-bit ints on some platforms which support them (GNU supported +targets, and Windows). To establish whether or not `bzip2recover' was +built with such a limitation, run it without arguments. 
In any event +you can build yourself an unlimited version if you can recompile it with +`MaybeUInt64' set to be an unsigned 64-bit integer. + + +File: manual.info, Node: AUTHOR, Prev: CAVEATS, Up: How to use bzip2 + +2.9 AUTHOR +========== + +Julian Seward, `jseward@bzip.org' + + The ideas embodied in `bzip2' are due to (at least) the following +people: Michael Burrows and David Wheeler (for the block sorting +transformation), David Wheeler (again, for the Huffman coder), Peter +Fenwick (for the structured coding model in the original `bzip', and +many refinements), and Alistair Moffat, Radford Neal and Ian Witten +(for the arithmetic coder in the original `bzip'). I am much indebted +for their help, support and advice. See the manual in the source +distribution for pointers to sources of documentation. Christian von +Roques encouraged me to look for faster sorting algorithms, so as to +speed up compression. Bela Lubkin encouraged me to improve the +worst-case compression performance. Donna Robinson XMLised the +documentation. Many people sent patches, helped with portability +problems, lent machines, gave advice and were generally helpful. + + +File: manual.info, Node: Programming with libbzip2, Next: Miscellanea, Prev: How to use bzip2, Up: Top + +3 Programming with libbzip2 +*************************** + +This chapter describes the programming interface to `libbzip2'. + + For general background information, particularly about memory use +and performance aspects, you'd be well advised to read *Note How to use +bzip2: How to use bzip2. as well. + +* Menu: + +* Top-level structure:: +* Error handling:: +* Low-level interface: >Low-level interface. 
+* High-level interface:: +* Utility functions:: +* zlib compatibility functions:: +* Using the library in a stdio-free environment:: +* Making a Windows DLL:: + + +File: manual.info, Node: Top-level structure, Next: Error handling, Up: Programming with libbzip2 + +3.1 Top-level structure +======================= + +`libbzip2' is a flexible library for compressing and decompressing data +in the `bzip2' data format. Although packaged as a single entity, it +helps to regard the library as three separate parts: the low level +interface, and the high level interface, and some utility functions. + + The structure of `libbzip2''s interfaces is similar to that of +Jean-loup Gailly's and Mark Adler's excellent `zlib' library. + + All externally visible symbols have names beginning `BZ2_'. This is +new in version 1.0. The intention is to minimise pollution of the +namespaces of library clients. + + To use any part of the library, you need to `#include <bzlib.h>' +into your sources. + +* Menu: + +* Low-level summary:: +* High-level summary:: +* Utility functions summary:: + + +File: manual.info, Node: Low-level summary, Next: High-level summary, Up: Top-level structure + +3.1.1 Low-level summary +----------------------- + +This interface provides services for compressing and decompressing data +in memory. There's no provision for dealing with files, streams or any +other I/O mechanisms, just straight memory-to-memory work. In fact, +this part of the library can be compiled without inclusion of +`stdio.h', which may be helpful for embedded applications. + + The low-level part of the library has no global variables and is +therefore thread-safe. + + Six routines make up the low level interface: `BZ2_bzCompressInit', +`BZ2_bzCompress', and `BZ2_bzCompressEnd' for compression, and a +corresponding trio `BZ2_bzDecompressInit', `BZ2_bzDecompress' and +`BZ2_bzDecompressEnd' for decompression. 
The `*Init' functions allocate +memory for compression/decompression and do other initialisations, +whilst the `*End' functions close down operations and release memory. + + The real work is done by `BZ2_bzCompress' and `BZ2_bzDecompress'. +These compress and decompress data from a user-supplied input buffer to +a user-supplied output buffer. These buffers can be any size; arbitrary +quantities of data are handled by making repeated calls to these +functions. This is a flexible mechanism allowing a consumer-pull style +of activity, or producer-push, or a mixture of both. + + +File: manual.info, Node: High-level summary, Next: Utility functions summary, Prev: Low-level summary, Up: Top-level structure + +3.1.2 High-level summary +------------------------ + +This interface provides some handy wrappers around the low-level +interface to facilitate reading and writing `bzip2' format files +(`.bz2' files). The routines provide hooks to facilitate reading files +in which the `bzip2' data stream is embedded within some larger-scale +file structure, or where there are multiple `bzip2' data streams +concatenated end-to-end. + + For reading files, `BZ2_bzReadOpen', `BZ2_bzRead', `BZ2_bzReadClose' +and `BZ2_bzReadGetUnused' are supplied. For writing files, +`BZ2_bzWriteOpen', `BZ2_bzWrite' and `BZ2_bzWriteFinish' are available. + + As with the low-level library, no global variables are used so the +library is per se thread-safe. However, if I/O errors occur whilst +reading or writing the underlying compressed files, you may have to +consult `errno' to determine the cause of the error. In that case, +you'd need a C library which correctly supports `errno' in a +multithreaded environment. + + To make the library a little simpler and more portable, +`BZ2_bzReadOpen' and `BZ2_bzWriteOpen' require you to pass them file +handles (`FILE*'s) which have previously been opened for reading or +writing respectively. 
That avoids portability problems associated with +file operations and file attributes, whilst not being much of an +imposition on the programmer. + + +File: manual.info, Node: Utility functions summary, Prev: High-level summary, Up: Top-level structure + +3.1.3 Utility functions summary +------------------------------- + +For very simple needs, `BZ2_bzBuffToBuffCompress' and +`BZ2_bzBuffToBuffDecompress' are provided. These compress data in +memory from one buffer to another buffer in a single function call. You +should assess whether these functions fulfill your memory-to-memory +compression/decompression requirements before investing effort in +understanding the more general but more complex low-level interface. + + Yoshioka Tsuneo (`QWF00133@niftyserve.or.jp' / +`tsuneo-y@is.aist-nara.ac.jp') has contributed some functions to give +better `zlib' compatibility. These functions are `BZ2_bzopen', +`BZ2_bzread', `BZ2_bzwrite', `BZ2_bzflush', `BZ2_bzclose', +`BZ2_bzerror' and `BZ2_bzlibVersion'. You may find these functions more +convenient for simple file reading and writing, than those in the +high-level interface. These functions are not (yet) officially part of +the library, and are minimally documented here. If they break, you get +to keep all the pieces. I hope to document them properly when time +permits. + + Yoshioka also contributed modifications to allow the library to be +built as a Windows DLL. + + +File: manual.info, Node: Error handling, Next: >Low-level interface, Prev: Top-level structure, Up: Programming with libbzip2 + +3.2 Error handling +================== + +The library is designed to recover cleanly in all situations, including +the worst-case situation of decompressing random data. I'm not 100% +sure that it can always do this, so you might want to add a signal +handler to catch segmentation violations during decompression if you +are feeling especially paranoid. 
I would be interested in hearing more +about the robustness of the library to corrupted compressed data. + + Version 1.0.3 is more robust in this respect than any previous version. +Investigations with Valgrind (a tool for detecting problems with memory +management) indicate that, at least for the few files I tested, all +single-bit errors in the decompressed data are caught properly, with no +segmentation faults, no uses of uninitialised data, no out of range +reads or writes, and no infinite looping in the decompressor. So it's +certainly pretty robust, although I wouldn't claim it to be totally +bombproof. + + The file `bzlib.h' contains all definitions needed to use the +library. In particular, you should definitely not include +`bzlib_private.h'. + + In `bzlib.h', the various return values are defined. The following +list is not intended as an exhaustive description of the circumstances +in which a given value may be returned - those descriptions are given +later. Rather, it is intended to convey the rough meaning of each +return value. The first five actions are normal and not intended to +denote an error situation. + +`BZ_OK' + The requested action was completed successfully. + +`BZ_RUN_OK, BZ_FLUSH_OK, BZ_FINISH_OK' + In `BZ2_bzCompress', the requested flush/finish/nothing-special + action was completed successfully. + +`BZ_STREAM_END' + Compression of data was completed, or the logical stream end was + detected during decompression. + + The following return values indicate an error of some kind. + +`BZ_CONFIG_ERROR' + Indicates that the library has been improperly compiled on your + platform - a major configuration error. Specifically, it means + that `sizeof(char)', `sizeof(short)' and `sizeof(int)' are not 1, + 2 and 4 respectively, as they should be. Note that the library + should still work properly on 64-bit platforms which follow the + LP64 programming model - that is, where `sizeof(long)' and + `sizeof(void*)' are 8.
Under LP64, `sizeof(int)' is still 4, so + `libbzip2', which doesn't use the `long' type, is OK. + +`BZ_SEQUENCE_ERROR' + When using the library, it is important to call the functions in + the correct sequence and with data structures (buffers etc) in the + correct states. `libbzip2' checks as much as it can to ensure + this is happening, and returns `BZ_SEQUENCE_ERROR' if not. Code + which complies precisely with the function semantics, as detailed + below, should never receive this value; such an event denotes + buggy code which you should investigate. + +`BZ_PARAM_ERROR' + Returned when a parameter to a function call is out of range or + otherwise manifestly incorrect. As with `BZ_SEQUENCE_ERROR', this + denotes a bug in the client code. The distinction between + `BZ_PARAM_ERROR' and `BZ_SEQUENCE_ERROR' is a bit hazy, but still + worth making. + +`BZ_MEM_ERROR' + Returned when a request to allocate memory failed. Note that the + quantity of memory needed to decompress a stream cannot be + determined until the stream's header has been read. So + `BZ2_bzDecompress' and `BZ2_bzRead' may return `BZ_MEM_ERROR' even + though some of the compressed data has been read. The same is not + true for compression; once `BZ2_bzCompressInit' or + `BZ2_bzWriteOpen' have successfully completed, `BZ_MEM_ERROR' + cannot occur. + +`BZ_DATA_ERROR' + Returned when a data integrity error is detected during + decompression. Most importantly, this means when stored and + computed CRCs for the data do not match. This value is also + returned upon detection of any other anomaly in the compressed + data. + +`BZ_DATA_ERROR_MAGIC' + As a special case of `BZ_DATA_ERROR', it is sometimes useful to + know when the compressed stream does not start with the correct + magic bytes (`'B' 'Z' 'h''). 
+ +`BZ_IO_ERROR' + Returned by `BZ2_bzRead' and `BZ2_bzWrite' when there is an error + reading or writing in the compressed file, and by `BZ2_bzReadOpen' + and `BZ2_bzWriteOpen' for attempts to use a file for which the + error indicator (viz, `ferror(f)') is set. On receipt of + `BZ_IO_ERROR', the caller should consult `errno' and/or `perror' + to acquire operating-system specific information about the problem. + +`BZ_UNEXPECTED_EOF' + Returned by `BZ2_bzRead' when the compressed file finishes before + the logical end of stream is detected. + +`BZ_OUTBUFF_FULL' + Returned by `BZ2_bzBuffToBuffCompress' and + `BZ2_bzBuffToBuffDecompress' to indicate that the output data will + not fit into the output buffer provided. + + +File: manual.info, Node: >Low-level interface, Next: High-level interface, Prev: Error handling, Up: Programming with libbzip2 + +3.3 Low-level interface +======================= + +* Menu: + +* BZ2_bzCompressInit:: +* BZ2_bzCompress:: +* BZ2_bzCompressEnd:: +* BZ2_bzDecompressInit:: +* BZ2_bzDecompress:: +* BZ2_bzDecompressEnd:: + + +File: manual.info, Node: BZ2_bzCompressInit, Next: BZ2_bzCompress, Up: >Low-level interface + +3.3.1 BZ2_bzCompressInit +------------------------ + + + typedef struct { + char *next_in; + unsigned int avail_in; + unsigned int total_in_lo32; + unsigned int total_in_hi32; + + char *next_out; + unsigned int avail_out; + unsigned int total_out_lo32; + unsigned int total_out_hi32; + + void *state; + + void *(*bzalloc)(void *,int,int); + void (*bzfree)(void *,void *); + void *opaque; + } bz_stream; + + int BZ2_bzCompressInit ( bz_stream *strm, + int blockSize100k, + int verbosity, + int workFactor ); + + Prepares for compression. The `bz_stream' structure holds all data +pertaining to the compression activity. A `bz_stream' structure should +be allocated and initialised prior to the call. The fields of +`bz_stream' comprise the entirety of the user-visible data. 
`state' is +a pointer to the private data structures required for compression. + + Custom memory allocators are supported, via fields `bzalloc', +`bzfree', and `opaque'. The value `opaque' is passed as the first +argument to all calls to `bzalloc' and `bzfree', but is otherwise +ignored by the library. The call `bzalloc ( opaque, n, m )' is expected +to return a pointer `p' to `n * m' bytes of memory, and `bzfree ( +opaque, p )' should free that memory. + + If you don't want to use a custom memory allocator, set `bzalloc', +`bzfree' and `opaque' to `NULL', and the library will then use the +standard `malloc' / `free' routines. + + Before calling `BZ2_bzCompressInit', fields `bzalloc', `bzfree' and +`opaque' should be filled appropriately, as just described. Upon +return, the internal state will have been allocated and initialised, and +`total_in_lo32', `total_in_hi32', `total_out_lo32' and `total_out_hi32' +will have been set to zero. These four fields are used by the library +to inform the caller of the total amount of data passed into and out of +the library, respectively. You should not try to change them. As of +version 1.0, 64-bit counts are maintained, even on 32-bit platforms, +using the `_hi32' fields to store the upper 32 bits of the count. So, +for example, the total amount of data in is `(total_in_hi32 << 32) + +total_in_lo32'. + + Parameter `blockSize100k' specifies the block size to be used for +compression. It should be a value between 1 and 9 inclusive, and the +actual block size used is 100000 x this figure. 9 gives the best +compression but takes most memory. + + Parameter `verbosity' should be set to a number between 0 and 4 +inclusive. 0 is silent, and greater numbers give increasingly verbose +monitoring/debugging output. If the library has been compiled with +`-DBZ_NO_STDIO', no such output will appear for any verbosity setting.
+ + Parameter `workFactor' controls how the compression phase behaves +when presented with worst case, highly repetitive, input data. If +compression runs into difficulties caused by repetitive data, the +library switches from the standard sorting algorithm to a fallback +algorithm. The fallback is slower than the standard algorithm by +perhaps a factor of three, but always behaves reasonably, no matter how +bad the input. + + Lower values of `workFactor' reduce the amount of effort the +standard algorithm will expend before resorting to the fallback. You +should set this parameter carefully; too low, and many inputs will be +handled by the fallback algorithm and so compress rather slowly, too +high, and your average-to-worst case compression times can become very +large. The default value of 30 gives reasonable behaviour over a wide +range of circumstances. + + Allowable values range from 0 to 250 inclusive. 0 is a special case, +equivalent to using the default value of 30. + + Note that the compressed output generated is the same regardless of +whether or not the fallback algorithm is used. + + Be aware also that this parameter may disappear entirely in future +versions of the library. In principle it should be possible to devise a +good way to automatically choose which algorithm to use. Such a +mechanism would render the parameter obsolete. 
+ + Possible return values: + + + BZ_CONFIG_ERROR + if the library has been mis-compiled + BZ_PARAM_ERROR + if strm is NULL + or blockSize < 1 or blockSize > 9 + or verbosity < 0 or verbosity > 4 + or workFactor < 0 or workFactor > 250 + BZ_MEM_ERROR + if not enough memory is available + BZ_OK + otherwise + + Allowable next actions: + + + BZ2_bzCompress + if BZ_OK is returned + no specific action needed in case of error + + +File: manual.info, Node: BZ2_bzCompress, Next: BZ2_bzCompressEnd, Prev: BZ2_bzCompressInit, Up: >Low-level interface + +3.3.2 BZ2_bzCompress +-------------------- + + + int BZ2_bzCompress ( bz_stream *strm, int action ); + + Provides more input and/or output buffer space for the library. The +caller maintains input and output buffers, and calls `BZ2_bzCompress' +to transfer data between them. + + Before each call to `BZ2_bzCompress', `next_in' should point at the +data to be compressed, and `avail_in' should indicate how many bytes +the library may read. `BZ2_bzCompress' updates `next_in', `avail_in' +and `total_in' to reflect the number of bytes it has read. + + Similarly, `next_out' should point to a buffer in which the +compressed data is to be placed, with `avail_out' indicating how much +output space is available. `BZ2_bzCompress' updates `next_out', +`avail_out' and `total_out' to reflect the number of bytes output. + + You may provide and remove as little or as much data as you like on +each call of `BZ2_bzCompress'. In the limit, it is acceptable to supply +and remove data one byte at a time, although this would be terribly +inefficient. You should always ensure that at least one byte of output +space is available at each call. + + A second purpose of `BZ2_bzCompress' is to request a change of mode +of the compressed stream. + + Conceptually, a compressed stream can be in one of four states: +IDLE, RUNNING, FLUSHING and FINISHING. 
Before initialisation +(`BZ2_bzCompressInit') and after termination (`BZ2_bzCompressEnd'), a +stream is regarded as IDLE. + + Upon initialisation (`BZ2_bzCompressInit'), the stream is placed in +the RUNNING state. Subsequent calls to `BZ2_bzCompress' should pass +`BZ_RUN' as the requested action; other actions are illegal and will +result in `BZ_SEQUENCE_ERROR'. + + At some point, the calling program will have provided all the input +data it wants to. It will then want to finish up - in effect, asking +the library to process any data it might have buffered internally. In +this state, `BZ2_bzCompress' will no longer attempt to read data from +`next_in', but it will want to write data to `next_out'. Because the +output buffer supplied by the user can be arbitrarily small, the +finishing-up operation cannot necessarily be done with a single call of +`BZ2_bzCompress'. + + Instead, the calling program passes `BZ_FINISH' as an action to +`BZ2_bzCompress'. This changes the stream's state to FINISHING. Any +remaining input (ie, `next_in[0 .. avail_in-1]') is compressed and +transferred to the output buffer. To do this, `BZ2_bzCompress' must be +called repeatedly until all the output has been consumed. At that +point, `BZ2_bzCompress' returns `BZ_STREAM_END', and the stream's state +is set back to IDLE. `BZ2_bzCompressEnd' should then be called. + + Just to make sure the calling program does not cheat, the library +makes a note of `avail_in' at the time of the first call to +`BZ2_bzCompress' which has `BZ_FINISH' as an action (ie, at the time +the program has announced its intention to not supply any more input). +By comparing this value with that of `avail_in' over subsequent calls +to `BZ2_bzCompress', the library can detect any attempts to slip in +more data to compress. Any calls for which this is detected will return +`BZ_SEQUENCE_ERROR'. This indicates a programming mistake which should +be corrected. 
+ + Instead of asking to finish, the calling program may ask +`BZ2_bzCompress' to take all the remaining input, compress it and +terminate the current (Burrows-Wheeler) compression block. This could +be useful for error control purposes. The mechanism is analogous to +that for finishing: call `BZ2_bzCompress' with an action of `BZ_FLUSH', +remove output data, and persist with the `BZ_FLUSH' action until the +value `BZ_RUN_OK' is returned. As with finishing, `BZ2_bzCompress' detects +any attempt to provide more input data once the flush has begun. + + Once the flush is complete, the stream returns to the normal RUNNING +state. + + This all sounds pretty complex, but isn't really. Here's a table +which shows which actions are allowable in each state, what action will +be taken, what the next state is, and what the non-error return values +are. Note that you can't explicitly ask what state the stream is in, +but nor do you need to - it can be inferred from the values returned by +`BZ2_bzCompress'. + + + IDLE/any + Illegal. IDLE state only exists after BZ2_bzCompressEnd or + before BZ2_bzCompressInit. + Return value = BZ_SEQUENCE_ERROR + + RUNNING/BZ_RUN + Compress from next_in to next_out as much as possible. + Next state = RUNNING + Return value = BZ_RUN_OK + + RUNNING/BZ_FLUSH + Remember current value of next_in. Compress from next_in + to next_out as much as possible, but do not accept any more input. + Next state = FLUSHING + Return value = BZ_FLUSH_OK + + RUNNING/BZ_FINISH + Remember current value of next_in. Compress from next_in + to next_out as much as possible, but do not accept any more input. + Next state = FINISHING + Return value = BZ_FINISH_OK + + FLUSHING/BZ_FLUSH + Compress from next_in to next_out as much as possible, + but do not accept any more input.
+ If all the existing input has been used up and all compressed + output has been removed + Next state = RUNNING; Return value = BZ_RUN_OK + else + Next state = FLUSHING; Return value = BZ_FLUSH_OK + + FLUSHING/other + Illegal. + Return value = BZ_SEQUENCE_ERROR + + FINISHING/BZ_FINISH + Compress from next_in to next_out as much as possible, + but do not accept any more input. + If all the existing input has been used up and all compressed + output has been removed + Next state = IDLE; Return value = BZ_STREAM_END + else + Next state = FINISHING; Return value = BZ_FINISH_OK + + FINISHING/other + Illegal. + Return value = BZ_SEQUENCE_ERROR + + That still looks complicated? Well, fair enough. The usual sequence +of calls for compressing a load of data is: + + 1. Get started with `BZ2_bzCompressInit'. + + 2. Shovel data in and shlurp out its compressed form using zero or + more calls of `BZ2_bzCompress' with action = `BZ_RUN'. + + 3. Finish up. Repeatedly call `BZ2_bzCompress' with action = + `BZ_FINISH', copying out the compressed output, until + `BZ_STREAM_END' is returned. + + 4. Close up and go home. Call `BZ2_bzCompressEnd'. + + If the data you want to compress fits into your input buffer all at +once, you can skip the calls of `BZ2_bzCompress ( ..., BZ_RUN )' and +just do the `BZ2_bzCompress ( ..., BZ_FINISH )' calls. + + All required memory is allocated by `BZ2_bzCompressInit'. The +compression library can accept any data at all (obviously). So you +shouldn't get any error return values from the `BZ2_bzCompress' calls. +If you do, they will be `BZ_SEQUENCE_ERROR', and indicate a bug in your +programming.
+ + Trivial other possible return values: + + + BZ_PARAM_ERROR + if strm is NULL, or strm->s is NULL + + +File: manual.info, Node: BZ2_bzCompressEnd, Next: BZ2_bzDecompressInit, Prev: BZ2_bzCompress, Up: >Low-level interface + +3.3.3 BZ2_bzCompressEnd +----------------------- + + + int BZ2_bzCompressEnd ( bz_stream *strm ); + + Releases all memory associated with a compression stream. + + Possible return values: + + + BZ_PARAM_ERROR if strm is NULL or strm->s is NULL + BZ_OK otherwise + + +File: manual.info, Node: BZ2_bzDecompressInit, Next: BZ2_bzDecompress, Prev: BZ2_bzCompressEnd, Up: >Low-level interface + +3.3.4 BZ2_bzDecompressInit +-------------------------- + + + int BZ2_bzDecompressInit ( bz_stream *strm, int verbosity, int small ); + + Prepares for decompression. As with `BZ2_bzCompressInit', a +`bz_stream' record should be allocated and initialised before the call. +Fields `bzalloc', `bzfree' and `opaque' should be set if a custom +memory allocator is required, or made `NULL' for the normal `malloc' / +`free' routines. Upon return, the internal state will have been +initialised, and `total_in' and `total_out' will be zero. + + For the meaning of parameter `verbosity', see `BZ2_bzCompressInit'. + + If `small' is nonzero, the library will use an alternative +decompression algorithm which uses less memory but at the cost of +decompressing more slowly (roughly speaking, half the speed, but the +maximum memory requirement drops to around 2300k). See *Note How to use +bzip2: How to use bzip2. for more information on memory management. + + Note that the amount of memory needed to decompress a stream cannot +be determined until the stream's header has been read, so even if +`BZ2_bzDecompressInit' succeeds, a subsequent `BZ2_bzDecompress' could +fail with `BZ_MEM_ERROR'. 
+ + Possible return values: + + + BZ_CONFIG_ERROR + if the library has been mis-compiled + BZ_PARAM_ERROR + if ( small != 0 && small != 1 ) + or (verbosity < 0 || verbosity > 4) + BZ_MEM_ERROR + if insufficient memory is available + + Allowable next actions: + + + BZ2_bzDecompress + if BZ_OK was returned + no specific action required in case of error + + +File: manual.info, Node: BZ2_bzDecompress, Next: BZ2_bzDecompressEnd, Prev: BZ2_bzDecompressInit, Up: >Low-level interface + +3.3.5 BZ2_bzDecompress +---------------------- + + + int BZ2_bzDecompress ( bz_stream *strm ); + + Provides more input and/or output buffer space for the library. The +caller maintains input and output buffers, and uses `BZ2_bzDecompress' +to transfer data between them. + + Before each call to `BZ2_bzDecompress', `next_in' should point at the +compressed data, and `avail_in' should indicate how many bytes the +library may read. `BZ2_bzDecompress' updates `next_in', `avail_in' and +`total_in' to reflect the number of bytes it has read. + + Similarly, `next_out' should point to a buffer in which the +uncompressed output is to be placed, with `avail_out' indicating how +much output space is available. `BZ2_bzDecompress' updates `next_out', +`avail_out' and `total_out' to reflect the number of bytes output. + + You may provide and remove as little or as much data as you like on +each call of `BZ2_bzDecompress'. In the limit, it is acceptable to +supply and remove data one byte at a time, although this would be +terribly inefficient. You should always ensure that at least one byte +of output space is available at each call. + + Use of `BZ2_bzDecompress' is simpler than `BZ2_bzCompress'. + + You should provide input and remove output as described above, and +repeatedly call `BZ2_bzDecompress' until `BZ_STREAM_END' is returned. +Appearance of `BZ_STREAM_END' denotes that `BZ2_bzDecompress' has +detected the logical end of the compressed stream.
`BZ2_bzDecompress' +will not produce `BZ_STREAM_END' until all output data has been placed +into the output buffer, so once `BZ_STREAM_END' appears, you are +guaranteed to have available all the decompressed output, and +`BZ2_bzDecompressEnd' can safely be called. + + In case of an error return value, you should call +`BZ2_bzDecompressEnd' to clean up and release memory. + + Possible return values: + + + BZ_PARAM_ERROR + if strm is NULL or strm->s is NULL + or strm->avail_out < 1 + BZ_DATA_ERROR + if a data integrity error is detected in the compressed stream + BZ_DATA_ERROR_MAGIC + if the compressed stream doesn't begin with the right magic bytes + BZ_MEM_ERROR + if there wasn't enough memory available + BZ_STREAM_END + if the logical end of the data stream was detected and all + output has been consumed, eg s->avail_out > 0 + BZ_OK + otherwise + + Allowable next actions: + + + BZ2_bzDecompress + if BZ_OK was returned + BZ2_bzDecompressEnd + otherwise + + +File: manual.info, Node: BZ2_bzDecompressEnd, Prev: BZ2_bzDecompress, Up: >Low-level interface + +3.3.6 BZ2_bzDecompressEnd +------------------------- + + + int BZ2_bzDecompressEnd ( bz_stream *strm ); + + Releases all memory associated with a decompression stream. + + Possible return values: + + + BZ_PARAM_ERROR + if strm is NULL or strm->s is NULL + BZ_OK + otherwise + + Allowable next actions: + + + None. + + +File: manual.info, Node: High-level interface, Next: Utility functions, Prev: >Low-level interface, Up: Programming with libbzip2 + +3.4 High-level interface +======================== + +This interface provides functions for reading and writing `bzip2' +format files. First, some general points. + + * All of the functions take an `int*' first argument, `bzerror'. + After each call, `bzerror' should be consulted first to determine + the outcome of the call. If `bzerror' is `BZ_OK', the call + completed successfully, and only then should the return value of + the function (if any) be consulted.
If `bzerror' is `BZ_IO_ERROR', + there was an error reading/writing the underlying compressed file, + and you should then consult `errno' / `perror' to determine the + cause of the difficulty. `bzerror' may also be set to various + other values; precise details are given on a per-function basis + below. + + * If `bzerror' indicates an error (ie, anything except `BZ_OK' and + `BZ_STREAM_END'), you should immediately call `BZ2_bzReadClose' (or + `BZ2_bzWriteClose', depending on whether you are attempting to + read or to write) to free up all resources associated with the + stream. Once an error has been indicated, behaviour of all calls + except `BZ2_bzReadClose' (`BZ2_bzWriteClose') is undefined. The + implication is that (1) `bzerror' should be checked after each + call, and (2) if `bzerror' indicates an error, `BZ2_bzReadClose' + (`BZ2_bzWriteClose') should then be called to clean up. + + * The `FILE*' arguments passed to `BZ2_bzReadOpen' / + `BZ2_bzWriteOpen' should be set to binary mode. Most Unix systems + will do this by default, but other platforms, including Windows + and Mac, will not. If you omit this, you may encounter problems + when moving code to new platforms. + + * Memory allocation requests are handled by `malloc' / `free'. At + present there is no facility for user-defined memory allocators in + the file I/O functions (could easily be added, though). + +* Menu: + +* BZ2_bzReadOpen:: +* BZ2_bzRead:: +* BZ2_bzReadGetUnused:: +* BZ2_bzReadClose:: +* BZ2_bzWriteOpen:: +* BZ2_bzWrite:: +* BZ2_bzWriteClose:: +* Handling embedded compressed data streams:: +* Standard file-reading/writing code:: + + +File: manual.info, Node: BZ2_bzReadOpen, Next: BZ2_bzRead, Up: High-level interface + +3.4.1 BZ2_bzReadOpen +-------------------- + + + typedef void BZFILE; + + BZFILE *BZ2_bzReadOpen( int *bzerror, FILE *f, + int verbosity, int small, + void *unused, int nUnused ); + + Prepare to read compressed data from file handle `f'. 
`f' should +refer to a file which has been opened for reading, and for which the +error indicator (`ferror(f)') is not set. If `small' is 1, the library +will try to decompress using less memory, at the expense of speed. + + For reasons explained below, `BZ2_bzRead' will decompress the +`nUnused' bytes starting at `unused', before starting to read from the +file `f'. At most `BZ_MAX_UNUSED' bytes may be supplied like this. If +this facility is not required, you should pass `NULL' and `0' for +`unused' and `nUnused' respectively. + + For the meaning of parameters `small' and `verbosity', see +`BZ2_bzDecompressInit'. + + The amount of memory needed to decompress a file cannot be +determined until the file's header has been read. So it is possible +that `BZ2_bzReadOpen' returns `BZ_OK' but a subsequent call of +`BZ2_bzRead' will return `BZ_MEM_ERROR'. + + Possible assignments to `bzerror': + + + BZ_CONFIG_ERROR + if the library has been mis-compiled + BZ_PARAM_ERROR + if f is NULL + or small is neither 0 nor 1 + or ( unused == NULL && nUnused != 0 ) + or ( unused != NULL && !(0 <= nUnused <= BZ_MAX_UNUSED) ) + BZ_IO_ERROR + if ferror(f) is nonzero + BZ_MEM_ERROR + if insufficient memory is available + BZ_OK + otherwise. + + Possible return values: + + + Pointer to an abstract BZFILE + if bzerror is BZ_OK + NULL + otherwise + + Allowable next actions: + + + BZ2_bzRead + if bzerror is BZ_OK + BZ2_bzReadClose + otherwise + + +File: manual.info, Node: BZ2_bzRead, Next: BZ2_bzReadGetUnused, Prev: BZ2_bzReadOpen, Up: High-level interface + +3.4.2 BZ2_bzRead +---------------- + + + int BZ2_bzRead ( int *bzerror, BZFILE *b, void *buf, int len ); + + Reads up to `len' (uncompressed) bytes from the compressed file `b' +into the buffer `buf'. If the read was successful, `bzerror' is set to +`BZ_OK' and the number of bytes read is returned. If the logical +end-of-stream was detected, `bzerror' will be set to `BZ_STREAM_END', +and the number of bytes read is returned.
All other `bzerror' values +denote an error. + + `BZ2_bzRead' will supply `len' bytes, unless the logical stream end +is detected or an error occurs. Because of this, it is possible to +detect the stream end by observing when the number of bytes returned is +less than the number requested. Nevertheless, this is regarded as +inadvisable; you should instead check `bzerror' after every call and +watch out for `BZ_STREAM_END'. + + Internally, `BZ2_bzRead' copies data from the compressed file in +chunks of size `BZ_MAX_UNUSED' bytes before decompressing it. If the +file contains more bytes than strictly needed to reach the logical +end-of-stream, `BZ2_bzRead' will almost certainly read some of the +trailing data before signalling `BZ_STREAM_END'. To collect the read +but unused data once `BZ_STREAM_END' has appeared, call +`BZ2_bzReadGetUnused' immediately before `BZ2_bzReadClose'. + + Possible assignments to `bzerror': + + + BZ_PARAM_ERROR + if b is NULL or buf is NULL or len < 0 + BZ_SEQUENCE_ERROR + if b was opened with BZ2_bzWriteOpen + BZ_IO_ERROR + if there is an error reading from the compressed file + BZ_UNEXPECTED_EOF + if the compressed file ended before + the logical end-of-stream was detected + BZ_DATA_ERROR + if a data integrity error was detected in the compressed stream + BZ_DATA_ERROR_MAGIC + if the stream does not begin with the requisite header bytes + (ie, is not a bzip2 data file). This is really + a special case of BZ_DATA_ERROR. + BZ_MEM_ERROR + if insufficient memory was available + BZ_STREAM_END + if the logical end of stream was detected. + BZ_OK + otherwise.
+ + Possible return values: + + + number of bytes read + if bzerror is BZ_OK or BZ_STREAM_END + undefined + otherwise + + Allowable next actions: + + + collect data from buf, then BZ2_bzRead or BZ2_bzReadClose + if bzerror is BZ_OK + collect data from buf, then BZ2_bzReadClose or BZ2_bzReadGetUnused + if bzerror is BZ_STREAM_END + BZ2_bzReadClose + otherwise + + +File: manual.info, Node: BZ2_bzReadGetUnused, Next: BZ2_bzReadClose, Prev: BZ2_bzRead, Up: High-level interface + +3.4.3 BZ2_bzReadGetUnused +------------------------- + + + void BZ2_bzReadGetUnused( int* bzerror, BZFILE *b, + void** unused, int* nUnused ); + + Returns data which was read from the compressed file but was not +needed to get to the logical end-of-stream. `*unused' is set to the +address of the data, and `*nUnused' to the number of bytes. `*nUnused' +will be set to a value between `0' and `BZ_MAX_UNUSED' inclusive. + + This function may only be called once `BZ2_bzRead' has signalled +`BZ_STREAM_END' but before `BZ2_bzReadClose'. + + Possible assignments to `bzerror': + + + BZ_PARAM_ERROR + if b is NULL + or unused is NULL or nUnused is NULL + BZ_SEQUENCE_ERROR + if BZ_STREAM_END has not been signalled + or if b was opened with BZ2_bzWriteOpen + BZ_OK + otherwise + + Allowable next actions: + + + BZ2_bzReadClose + + +File: manual.info, Node: BZ2_bzReadClose, Next: BZ2_bzWriteOpen, Prev: BZ2_bzReadGetUnused, Up: High-level interface + +3.4.4 BZ2_bzReadClose +--------------------- + + + void BZ2_bzReadClose ( int *bzerror, BZFILE *b ); + + Releases all memory pertaining to the compressed file `b'. +`BZ2_bzReadClose' does not call `fclose' on the underlying file handle, +so you should do that yourself if appropriate. `BZ2_bzReadClose' +should be called to clean up after all error situations.
+ + Possible assignments to `bzerror': + + + BZ_SEQUENCE_ERROR + if b was opened with BZ2_bzWriteOpen + BZ_OK + otherwise + + Allowable next actions: + + + none + + +File: manual.info, Node: BZ2_bzWriteOpen, Next: BZ2_bzWrite, Prev: BZ2_bzReadClose, Up: High-level interface + +3.4.5 BZ2_bzWriteOpen +--------------------- + + + BZFILE *BZ2_bzWriteOpen( int *bzerror, FILE *f, + int blockSize100k, int verbosity, + int workFactor ); + + Prepare to write compressed data to file handle `f'. `f' should +refer to a file which has been opened for writing, and for which the +error indicator (`ferror(f)') is not set. + + For the meaning of parameters `blockSize100k', `verbosity' and +`workFactor', see `BZ2_bzCompressInit'. + + All required memory is allocated at this stage, so if the call +completes successfully, `BZ_MEM_ERROR' cannot be signalled by a +subsequent call to `BZ2_bzWrite'. + + Possible assignments to `bzerror': + + + BZ_CONFIG_ERROR + if the library has been mis-compiled + BZ_PARAM_ERROR + if f is NULL + or blockSize100k < 1 or blockSize100k > 9 + BZ_IO_ERROR + if ferror(f) is nonzero + BZ_MEM_ERROR + if insufficient memory is available + BZ_OK + otherwise + + Possible return values: + + + Pointer to an abstract BZFILE + if bzerror is BZ_OK + NULL + otherwise + + Allowable next actions: + + + BZ2_bzWrite + if bzerror is BZ_OK + (you could go directly to BZ2_bzWriteClose, but this would be pretty pointless) + BZ2_bzWriteClose + otherwise + + +File: manual.info, Node: BZ2_bzWrite, Next: BZ2_bzWriteClose, Prev: BZ2_bzWriteOpen, Up: High-level interface + +3.4.6 BZ2_bzWrite +----------------- + + + void BZ2_bzWrite ( int *bzerror, BZFILE *b, void *buf, int len ); + + Absorbs `len' bytes from the buffer `buf', eventually to be +compressed and written to the file.
+ + Possible assignments to `bzerror': + + + BZ_PARAM_ERROR + if b is NULL or buf is NULL or len < 0 + BZ_SEQUENCE_ERROR + if b was opened with BZ2_bzReadOpen + BZ_IO_ERROR + if there is an error writing the compressed file. + BZ_OK + otherwise + + +File: manual.info, Node: BZ2_bzWriteClose, Next: Handling embedded compressed data streams, Prev: BZ2_bzWrite, Up: High-level interface + +3.4.7 BZ2_bzWriteClose +---------------------- + + + void BZ2_bzWriteClose( int *bzerror, BZFILE* f, + int abandon, + unsigned int* nbytes_in, + unsigned int* nbytes_out ); + + void BZ2_bzWriteClose64( int *bzerror, BZFILE* f, + int abandon, + unsigned int* nbytes_in_lo32, + unsigned int* nbytes_in_hi32, + unsigned int* nbytes_out_lo32, + unsigned int* nbytes_out_hi32 ); + + Compresses and flushes to the compressed file all data so far +supplied by `BZ2_bzWrite'. The logical end-of-stream markers are also +written, so subsequent calls to `BZ2_bzWrite' are illegal. All memory +associated with the compressed file `b' is released. `fflush' is +called on the compressed file, but it is not `fclose''d. + + If `BZ2_bzWriteClose' is called to clean up after an error, the only +action is to release the memory. The library records the error codes +issued by previous calls, so this situation will be detected +automatically. There is no attempt to complete the compression +operation, nor to `fflush' the compressed file. You can force this +behaviour to happen even in the case of no error, by passing a nonzero +value to `abandon'. + + If `nbytes_in' is non-null, `*nbytes_in' will be set to be the total +volume of uncompressed data handled. Similarly, `nbytes_out' will be +set to the total volume of compressed data written. For compatibility +with older versions of the library, `BZ2_bzWriteClose' only yields the +lower 32 bits of these counts. Use `BZ2_bzWriteClose64' if you want the +full 64 bit counts. These two functions are otherwise absolutely +identical. 
+ + Possible assignments to `bzerror': + + + BZ_SEQUENCE_ERROR + if b was opened with BZ2_bzReadOpen + BZ_IO_ERROR + if there is an error writing the compressed file + BZ_OK + otherwise + + +File: manual.info, Node: Handling embedded compressed data streams, Next: Standard file-reading/writing code, Prev: BZ2_bzWriteClose, Up: High-level interface + +3.4.8 Handling embedded compressed data streams +----------------------------------------------- + +The high-level library facilitates use of `bzip2' data streams which +form some part of a surrounding, larger data stream. + + * For writing, the library takes an open file handle, writes + compressed data to it, `fflush'es it but does not `fclose' it. The + calling application can write its own data before and after the + compressed data stream, using that same file handle. + + * Reading is more complex, and the facilities are not as general as + they could be since generality is hard to reconcile with + efficiency. `BZ2_bzRead' reads from the compressed file in blocks + of size `BZ_MAX_UNUSED' bytes, and in doing so probably will + overshoot the logical end of compressed stream. To recover this + data once decompression has ended, call `BZ2_bzReadGetUnused' after + the last call of `BZ2_bzRead' (the one returning `BZ_STREAM_END') + but before calling `BZ2_bzReadClose'. + + This mechanism makes it easy to decompress multiple `bzip2' streams +placed end-to-end. At the end of one stream, when `BZ2_bzRead' returns +`BZ_STREAM_END', call `BZ2_bzReadGetUnused' to collect the unused data +(copy it into your own buffer somewhere). That data forms the start of +the next compressed stream. To start uncompressing that next stream, +call `BZ2_bzReadOpen' again, feeding in the unused data via the +`unused' / `nUnused' parameters. Keep doing this until `BZ_STREAM_END' +return coincides with the physical end of file (`feof(f)'). In this +situation `BZ2_bzReadGetUnused' will of course return no data.
+ + This should give some feel for how the high-level interface can be +used. If you require extra flexibility, you'll have to bite the bullet +and get to grips with the low-level interface. + + +File: manual.info, Node: Standard file-reading/writing code, Prev: Handling embedded compressed data streams, Up: High-level interface + +3.4.9 Standard file-reading/writing code +---------------------------------------- + +Here's how you'd write data to a compressed file: + + + FILE* f; + BZFILE* b; + int nBuf; + char buf[ /* whatever size you like */ ]; + int bzerror; + int nWritten; + + f = fopen ( "myfile.bz2", "w" ); + if ( !f ) { + /* handle error */ + } + b = BZ2_bzWriteOpen( &bzerror, f, 9 ); + if (bzerror != BZ_OK) { + BZ2_bzWriteClose ( b ); + /* handle error */ + } + + while ( /* condition */ ) { + /* get data to write into buf, and set nBuf appropriately */ + nWritten = BZ2_bzWrite ( &bzerror, b, buf, nBuf ); + if (bzerror == BZ_IO_ERROR) { + BZ2_bzWriteClose ( &bzerror, b ); + /* handle error */ + } + } + + BZ2_bzWriteClose( &bzerror, b ); + if (bzerror == BZ_IO_ERROR) { + /* handle error */ + } + + And to read from a compressed file: + + + FILE* f; + BZFILE* b; + int nBuf; + char buf[ /* whatever size you like */ ]; + int bzerror; + int nWritten; + + f = fopen ( "myfile.bz2", "r" ); + if ( !f ) { + /* handle error */ + } + b = BZ2_bzReadOpen ( &bzerror, f, 0, NULL, 0 ); + if ( bzerror != BZ_OK ) { + BZ2_bzReadClose ( &bzerror, b ); + /* handle error */ + } + + bzerror = BZ_OK; + while ( bzerror == BZ_OK && /* arbitrary other conditions */) { + nBuf = BZ2_bzRead ( &bzerror, b, buf, /* size of buf */ ); + if ( bzerror == BZ_OK ) { + /* do something with buf[0 .. 
nBuf-1] */ + } + } + if ( bzerror != BZ_STREAM_END ) { + BZ2_bzReadClose ( &bzerror, b ); + /* handle error */ + } else { + BZ2_bzReadClose ( &bzerror ); + } + + +File: manual.info, Node: Utility functions, Next: zlib compatibility functions, Prev: High-level interface, Up: Programming with libbzip2 + +3.5 Utility functions +===================== + +* Menu: + +* BZ2_bzBuffToBuffCompress:: +* BZ2_bzBuffToBuffDecompress:: + + +File: manual.info, Node: BZ2_bzBuffToBuffCompress, Next: BZ2_bzBuffToBuffDecompress, Up: Utility functions + +3.5.1 BZ2_bzBuffToBuffCompress +------------------------------ + + + int BZ2_bzBuffToBuffCompress( char* dest, + unsigned int* destLen, + char* source, + unsigned int sourceLen, + int blockSize100k, + int verbosity, + int workFactor ); + + Attempts to compress the data in `source[0 .. sourceLen-1]' into the +destination buffer, `dest[0 .. *destLen-1]'. If the destination buffer +is big enough, `*destLen' is set to the size of the compressed data, +and `BZ_OK' is returned. If the compressed data won't fit, `*destLen' +is unchanged, and `BZ_OUTBUFF_FULL' is returned. + + Compression in this manner is a one-shot event, done with a single +call to this function. The resulting compressed data is a complete +`bzip2' format data stream. There is no mechanism for making additional +calls to provide extra input data. If you want that kind of mechanism, +use the low-level interface. + + For the meaning of parameters `blockSize100k', `verbosity' and +`workFactor', see `BZ2_bzCompressInit'. + + To guarantee that the compressed data will fit in its buffer, +allocate an output buffer of size 1% larger than the uncompressed data, +plus six hundred extra bytes. + + `BZ2_bzBuffToBuffCompress' will not write data at or beyond +`dest[*destLen]', even in case of buffer overflow.
+ + Possible return values: + + + BZ_CONFIG_ERROR + if the library has been mis-compiled + BZ_PARAM_ERROR + if dest is NULL or destLen is NULL + or blockSize100k < 1 or blockSize100k > 9 + or verbosity < 0 or verbosity > 4 + or workFactor < 0 or workFactor > 250 + BZ_MEM_ERROR + if insufficient memory is available + BZ_OUTBUFF_FULL + if the size of the compressed data exceeds *destLen + BZ_OK + otherwise + + +File: manual.info, Node: BZ2_bzBuffToBuffDecompress, Prev: BZ2_bzBuffToBuffCompress, Up: Utility functions + +3.5.2 BZ2_bzBuffToBuffDecompress +-------------------------------- + + + int BZ2_bzBuffToBuffDecompress( char* dest, + unsigned int* destLen, + char* source, + unsigned int sourceLen, + int small, + int verbosity ); + + Attempts to decompress the data in `source[0 .. sourceLen-1]' into +the destination buffer, `dest[0 .. *destLen-1]'. If the destination +buffer is big enough, `*destLen' is set to the size of the uncompressed +data, and `BZ_OK' is returned. If the uncompressed data won't fit, +`*destLen' is unchanged, and `BZ_OUTBUFF_FULL' is returned. + + `source' is assumed to hold a complete `bzip2' format data stream. +`BZ2_bzBuffToBuffDecompress' tries to decompress the entirety of the +stream into the output buffer. + + For the meaning of parameters `small' and `verbosity', see +`BZ2_bzDecompressInit'. + + Because the compression ratio of the compressed data cannot be known +in advance, there is no easy way to guarantee that the output buffer +will be big enough. You may of course make arrangements in your code to +record the size of the uncompressed data, but such a mechanism is +beyond the scope of this library. + + `BZ2_bzBuffToBuffDecompress' will not write data at or beyond +`dest[*destLen]', even in case of buffer overflow.
+ + Possible return values: + + + BZ_CONFIG_ERROR + if the library has been mis-compiled + BZ_PARAM_ERROR + if dest is NULL or destLen is NULL + or small != 0 && small != 1 + or verbosity < 0 or verbosity > 4 + BZ_MEM_ERROR + if insufficient memory is available + BZ_OUTBUFF_FULL + if the size of the uncompressed data exceeds *destLen + BZ_DATA_ERROR + if a data integrity error was detected in the compressed data + BZ_DATA_ERROR_MAGIC + if the compressed data doesn't begin with the right magic bytes + BZ_UNEXPECTED_EOF + if the compressed data ends unexpectedly + BZ_OK + otherwise + + +File: manual.info, Node: zlib compatibility functions, Next: Using the library in a stdio-free environment, Prev: Utility functions, Up: Programming with libbzip2 + +3.6 zlib compatibility functions +================================ + +Yoshioka Tsuneo has contributed some functions to give better `zlib' +compatibility. These functions are `BZ2_bzopen', `BZ2_bzread', +`BZ2_bzwrite', `BZ2_bzflush', `BZ2_bzclose', `BZ2_bzerror' and +`BZ2_bzlibVersion'. These functions are not (yet) officially part of +the library. If they break, you get to keep all the pieces. +Nevertheless, I think they work ok. + + + typedef void BZFILE; + + const char * BZ2_bzlibVersion ( void ); + + Returns a string indicating the library version. + + + BZFILE * BZ2_bzopen ( const char *path, const char *mode ); + BZFILE * BZ2_bzdopen ( int fd, const char *mode ); + + Opens a `.bz2' file for reading or writing, using either its name or +a pre-existing file descriptor. Analogous to `fopen' and `fdopen'. + + + int BZ2_bzread ( BZFILE* b, void* buf, int len ); + int BZ2_bzwrite ( BZFILE* b, void* buf, int len ); + + Reads/writes data from/to a previously opened `BZFILE'. Analogous to +`fread' and `fwrite'. + + + int BZ2_bzflush ( BZFILE* b ); + void BZ2_bzclose ( BZFILE* b ); + + Flushes/closes a `BZFILE'. `BZ2_bzflush' doesn't actually do +anything. Analogous to `fflush' and `fclose'.
+ + + const char * BZ2_bzerror ( BZFILE *b, int *errnum ) + + Returns a string describing the most recent error status of `b', and +also sets `*errnum' to its numerical value. + + +File: manual.info, Node: Using the library in a stdio-free environment, Next: Making a Windows DLL, Prev: zlib compatibility functions, Up: Programming with libbzip2 + +3.7 Using the library in a stdio-free environment +================================================= + +* Menu: + +* Getting rid of stdio:: +* Critical error handling:: + + +File: manual.info, Node: Getting rid of stdio, Next: Critical error handling, Up: Using the library in a stdio-free environment + +3.7.1 Getting rid of stdio +-------------------------- + +In a deeply embedded application, you might want to use just the +memory-to-memory functions. You can do this conveniently by compiling +the library with preprocessor symbol `BZ_NO_STDIO' defined. Doing this +gives you a library containing only the following eight functions: + + `BZ2_bzCompressInit', `BZ2_bzCompress', `BZ2_bzCompressEnd' +`BZ2_bzDecompressInit', `BZ2_bzDecompress', `BZ2_bzDecompressEnd' +`BZ2_bzBuffToBuffCompress', `BZ2_bzBuffToBuffDecompress' + + When compiled like this, all functions will ignore `verbosity' +settings. + + +File: manual.info, Node: Critical error handling, Prev: Getting rid of stdio, Up: Using the library in a stdio-free environment + +3.7.2 Critical error handling +----------------------------- + +`libbzip2' contains a number of internal assertion checks which should, +needless to say, never be activated. Nevertheless, if an assertion +should fail, behaviour depends on whether or not the library was +compiled with `BZ_NO_STDIO' set. + + For a normal compile, an assertion failure yields the message: + + bzip2/libbzip2: internal error number N. + + This is a bug in bzip2/libbzip2, 1.0.3 of 15 February 2005. + Please report it to me at: jseward@bzip.org.
If this happened when + you were using some program which uses libbzip2 as a component, + you should also report this bug to the author(s) of that program. + Please make an effort to report this bug; timely and accurate bug + reports eventually lead to higher quality software. Thanks. Julian + Seward, 15 February 2005. + + where `N' is some error code number. If `N == 1007', it also prints +some extra text advising the reader that unreliable memory is often +associated with internal error 1007. (This is a +frequently-observed-phenomenon with versions 1.0.0/1.0.1). + + `exit(3)' is then called. + + For a `stdio'-free library, assertion failures result in a call to a +function declared as: + + + extern void bz_internal_error ( int errcode ); + + The relevant code is passed as a parameter. You should supply such a +function. + + In either case, once an assertion failure has occurred, any +`bz_stream' records involved can be regarded as invalid. You should not +attempt to resume normal operation with them. + + You may, of course, change critical error handling to suit your +needs. As I said above, critical errors indicate bugs in the library +and should not occur. All "normal" error situations are indicated via +error return codes from functions, and can be recovered from. + + +File: manual.info, Node: Making a Windows DLL, Prev: Using the library in a stdio-free environment, Up: Programming with libbzip2 + +3.8 Making a Windows DLL +======================== + +Everything related to Windows has been contributed by Yoshioka Tsuneo +(`QWF00133@niftyserve.or.jp' / `tsuneo-y@is.aist-nara.ac.jp'), so you +should send your queries to him (but perhaps Cc: me, +`jseward@bzip.org'). + + My vague understanding of what to do is: using Visual C++ 5.0, open +the project file `libbz2.dsp', and build. That's all. 
+ + If you can't open the project file for some reason, make a new one, +naming these files: `blocksort.c', `bzlib.c', `compress.c', +`crctable.c', `decompress.c', `huffman.c', `randtable.c' and +`libbz2.def'. You will also need to name the header files `bzlib.h' and +`bzlib_private.h'. + + If you don't use VC++, you may need to define the preprocessor symbol +`_WIN32'. + + Finally, `dlltest.c' is a sample program using the DLL. It has a +project file, `dlltest.dsp'. + + If you just want a makefile for Visual C, have a look at +`makefile.msc'. + + Be aware that if you compile `bzip2' itself on Win32, you must set +`BZ_UNIX' to 0 and `BZ_LCCWIN32' to 1, in the file `bzip2.c', before +compiling. Otherwise the resulting binary won't work correctly. + + I haven't tried any of this stuff myself, but it all looks plausible. + + +File: manual.info, Node: Miscellanea, Prev: Programming with libbzip2, Up: Top + +4 Miscellanea +************* + +These are just some random thoughts of mine. Your mileage may vary. + +* Menu: + +* Limitations of the compressed file format:: +* Portability issues:: +* Reporting bugs:: +* Did you get the right package?:: +* Further Reading:: + + +File: manual.info, Node: Limitations of the compressed file format, Next: Portability issues, Up: Miscellanea + +4.1 Limitations of the compressed file format +============================================= + +`bzip2-1.0.X', `0.9.5' and `0.9.0' use exactly the same file format as +the original version, `bzip2-0.1'. This decision was made in the +interests of stability. Creating yet another incompatible compressed +file format would create further confusion and disruption for users. + + Nevertheless, this is not a painless decision. Development work +since the release of `bzip2-0.1' in August 1997 has shown complexities +in the file format which slow down decompression and, in retrospect, +are unnecessary.
These are: + + * The run-length encoder, which is the first of the compression + transformations, is entirely irrelevant. The original purpose was + to protect the sorting algorithm from the very worst case input: a + string of repeated symbols. But algorithm steps Q6a and Q6b in the + original Burrows-Wheeler technical report (SRC-124) show how + repeats can be handled without difficulty in block sorting. + + * The randomisation mechanism doesn't really need to be there. Udi + Manber and Gene Myers published a suffix array construction + algorithm a few years back, which can be employed to sort any + block, no matter how repetitive, in O(N log N) time. Subsequent + work by Kunihiko Sadakane has produced a derivative O(N (log N)^2) + algorithm which usually outperforms the Manber-Myers algorithm. + + I could have changed to Sadakane's algorithm, but I find it to be + slower than `bzip2''s existing algorithm for most inputs, and the + randomisation mechanism protects adequately against bad cases. I + didn't think it was a good tradeoff to make. Partly this is due to + the fact that I was not flooded with email complaints about + `bzip2-0.1''s performance on repetitive data, so perhaps it isn't + a problem for real inputs. + + Probably the best long-term solution, and the one I have + incorporated into 0.9.5 and above, is to use the existing sorting + algorithm initially, and fall back to a O(N (log N)^2) algorithm + if the standard algorithm gets into difficulties. + + * The compressed file format was never designed to be handled by a + library, and I have had to jump through some hoops to produce an + efficient implementation of decompression. It's a bit hairy. Try + passing `decompress.c' through the C preprocessor and you'll see + what I mean. Much of this complexity could have been avoided if + the compressed size of each block of data was recorded in the data + stream. + + * An Adler-32 checksum, rather than a CRC32 checksum, would be + faster to compute.
+ + It would be fair to say that the `bzip2' format was frozen before I +properly and fully understood the performance consequences of doing so. + + Improvements which I was able to incorporate into 0.9.0, despite +using the same file format, are: + + * Single array implementation of the inverse BWT. This significantly + speeds up decompression, presumably because it reduces the number + of cache misses. + + * Faster inverse MTF transform for large MTF values. The new + implementation is based on the notion of sliding blocks of values. + + * `bzip2-0.9.0' now reads and writes files with `fread' and + `fwrite'; version 0.1 used `putc' and `getc'. Duh! Well, you live + and learn. + + Further ahead, it would be nice to be able to do random access into +files. This will require some careful design of compressed file formats. + + +File: manual.info, Node: Portability issues, Next: Reporting bugs, Prev: Limitations of the compressed file format, Up: Miscellanea + +4.2 Portability issues +====================== + +After some consideration, I have decided not to use GNU `autoconf' to +configure 0.9.5 or 1.0. + + `autoconf', admirable and wonderful though it is, mainly assists +with portability problems between Unix-like platforms. But `bzip2' +doesn't have much in the way of portability problems on Unix; most of +the difficulties appear when porting to the Mac, or to Microsoft's +operating systems. `autoconf' doesn't help in those cases, and brings +in a whole load of new complexity. + + Most people should be able to compile the library and program under +Unix straight out-of-the-box, so to speak, especially if you have a +version of GNU C available. + + There are a couple of `__inline__' directives in the code. GNU C +(`gcc') should be able to handle them. If you're not using GNU C, your +C compiler shouldn't see them at all. If your compiler does, for some +reason, see them and doesn't like them, just `#define' `__inline__' to +be `/* */'. 
One easy way to do this is to compile with the flag +`-D__inline__=', which should be understood by most Unix compilers. + + If you still have difficulties, try compiling with the macro +`BZ_STRICT_ANSI' defined. This should enable you to build the library +in a strictly ANSI compliant environment. Building the program itself +like this is dangerous and not supported, since you remove `bzip2''s +checks against compressing directories, symbolic links, devices, and +other not-really-a-file entities. This could cause filesystem +corruption! + + One other thing: if you create a `bzip2' binary for public +distribution, please consider linking it statically (`gcc -static'). +This avoids all sorts of library-version issues that others may +encounter later on. + + If you build `bzip2' on Win32, you must set `BZ_UNIX' to 0 and +`BZ_LCCWIN32' to 1, in the file `bzip2.c', before compiling. Otherwise +the resulting binary won't work correctly. + + +File: manual.info, Node: Reporting bugs, Next: Did you get the right package?, Prev: Portability issues, Up: Miscellanea + +4.3 Reporting bugs +================== + +I tried pretty hard to make sure `bzip2' is bug free, both by design +and by testing. Hopefully you'll never need to read this section for +real. + + Nevertheless, if `bzip2' dies with a segmentation fault, a bus error +or an internal assertion failure, it will ask you to email me a bug +report. Experience from years of feedback of bzip2 users indicates that +almost all these problems can be traced to either compiler bugs or +hardware problems. + + * Recompile the program with no optimisation, and see if it works. + And/or try a different compiler. I heard all sorts of stories + about various flavours of GNU C (and other compilers) generating + bad code for `bzip2', and I've run across two such examples myself. + + 2.7.X versions of GNU C are known to generate bad code from time + to time, at high optimisation levels. 
If you get problems, try + using the flags `-O2' `-fomit-frame-pointer' + `-fno-strength-reduce'. You should specifically not use + `-funroll-loops'. + + You may notice that the Makefile runs six tests as part of the + build process. If the program passes all of these, it's a pretty + good (but not 100%) indication that the compiler has done its job + correctly. + + * If `bzip2' crashes randomly, and the crashes are not repeatable, + you may have a flaky memory subsystem. `bzip2' really hammers your + memory hierarchy, and if it's a bit marginal, you may get these + problems. Ditto if your disk or I/O subsystem is slowly failing. + Yup, this really does happen. + + Try using a different machine of the same type, and see if you can + repeat the problem. + + * This isn't really a bug, but ... If `bzip2' tells you your file is + corrupted on decompression, and you obtained the file via FTP, + there is a possibility that you forgot to tell FTP to do a binary + mode transfer. That absolutely will cause the file to be + non-decompressible. You'll have to transfer it again. + + If you've incorporated `libbzip2' into your own program and are +getting problems, please, please, please, check that the parameters you +are passing in calls to the library, are correct, and in accordance +with what the documentation says is allowable. I have tried to make +the library robust against such problems, but I'm sure I haven't +succeeded. + + Finally, if the above comments don't help, you'll have to send me a +bug report. Now, it's just amazing how many people will send me a bug +report saying something like: + + + bzip2 crashed with segmentation fault on my machine + + and absolutely nothing else. Needless to say, such a report is +totally, utterly, completely and comprehensively 100% useless; a waste +of your time, my time, and net bandwidth. With no details at all, +there's no way I can possibly begin to figure out what the problem is.
+ + The rules of the game are: facts, facts, facts. Don't omit them +because "oh, they won't be relevant". At the bare minimum: + + + Machine type. Operating system version. + Exact version of bzip2 (do bzip2 -V). + Exact version of the compiler used. + Flags passed to the compiler. + + However, the most important single thing that will help me is the +file that you were trying to compress or decompress at the time the +problem happened. Without that, my ability to do anything more than +speculate about the cause, is limited. + + +File: manual.info, Node: Did you get the right package?, Next: Further Reading, Prev: Reporting bugs, Up: Miscellanea + +4.4 Did you get the right package? +================================== + +`bzip2' is a resource hog. It soaks up large amounts of CPU cycles and +memory. Also, it gives very large latencies. In the worst case, you can +feed many megabytes of uncompressed data into the library before getting +any compressed output, so this probably rules out applications +requiring interactive behaviour. + + These aren't faults of my implementation, I hope, but more an +intrinsic property of the Burrows-Wheeler transform (unfortunately). +Maybe this isn't what you want. + + If you want a compressor and/or library which is faster, uses less +memory but gets pretty good compression, and has minimal latency, +consider Jean-loup Gailly's and Mark Adler's work, `zlib-1.2.1' and +`gzip-1.2.4'. Look for them at http://www.zlib.org +(http://www.zlib.org) and http://www.gzip.org (http://www.gzip.org) +respectively. + + For something faster and lighter still, you might try Markus F X J +Oberhumer's `LZO' real-time compression/decompression library, at +http://www.oberhumer.com/opensource +(http://www.oberhumer.com/opensource). 
+ + +File: manual.info, Node: Further Reading, Prev: Did you get the right package?, Up: Miscellanea + +4.5 Further Reading +=================== + +`bzip2' is not research work, in the sense that it doesn't present any +new ideas. Rather, it's an engineering exercise based on existing +ideas. + + Four documents describe essentially all the ideas behind `bzip2': + + Michael Burrows and D. J. Wheeler: + "A block-sorting lossless data compression algorithm" + 10th May 1994. + Digital SRC Research Report 124. + ftp://ftp.digital.com/pub/DEC/SRC/research-reports/SRC-124.ps.gz + If you have trouble finding it, try searching at the + New Zealand Digital Library, http://www.nzdl.org. + + Daniel S. Hirschberg and Debra A. LeLewer + "Efficient Decoding of Prefix Codes" + Communications of the ACM, April 1990, Vol 33, Number 4. + You might be able to get an electronic copy of this + from the ACM Digital Library. + + David J. Wheeler + Program bred3.c and accompanying document bred3.ps. + This contains the idea behind the multi-table Huffman coding scheme. + ftp://ftp.cl.cam.ac.uk/users/djw3/ + + Jon L. Bentley and Robert Sedgewick + "Fast Algorithms for Sorting and Searching Strings" + Available from Sedgewick's web page, + www.cs.princeton.edu/~rs + + The following paper gives valuable additional insights into the +algorithm, but is not immediately the basis of any code used in bzip2. + + Peter Fenwick: + Block Sorting Text Compression + Proceedings of the 19th Australasian Computer Science Conference, + Melbourne, Australia. Jan 31 - Feb 2, 1996. 
+ ftp://ftp.cs.auckland.ac.nz/pub/peter-f/ACSC96paper.ps + + Kunihiko Sadakane's sorting algorithm, mentioned above, is available +from: + + http://naomi.is.s.u-tokyo.ac.jp/~sada/papers/Sada98b.ps.gz + + The Manber-Myers suffix array construction algorithm is described in +a paper available from: + + http://www.cs.arizona.edu/people/gene/PAPERS/suffix.ps + + Finally, the following papers document some investigations I made +into the performance of sorting and decompression algorithms: + + Julian Seward + On the Performance of BWT Sorting Algorithms + Proceedings of the IEEE Data Compression Conference 2000 + Snowbird, Utah. 28-30 March 2000. + + Julian Seward + Space-time Tradeoffs in the Inverse B-W Transform + Proceedings of the IEEE Data Compression Conference 2001 + Snowbird, Utah. 27-29 March 2001. + + + +Tag Table: +Node: Top190 +Node: Introduction1058 +Node: How to use bzip22242 +Node: NAME2631 +Node: SYNOPSIS2898 +Node: DESCRIPTION3192 +Node: OPTIONS7848 +Node: MEMORY MANAGEMENT11162 +Node: RECOVERING DATA FROM DAMAGED FILES14700 +Node: PERFORMANCE NOTES16424 +Node: CAVEATS17709 +Node: AUTHOR19007 +Node: Programming with libbzip220012 +Node: Top-level structure20673 +Node: Low-level summary21575 +Node: High-level summary22964 +Node: Utility functions summary24439 +Node: Error handling25703 +Node: >Low-level interface30791 +Node: BZ2_bzCompressInit31118 +Node: BZ2_bzCompress35825 +Node: BZ2_bzCompressEnd42895 +Node: BZ2_bzDecompressInit43299 +Node: BZ2_bzDecompress44993 +Node: BZ2_bzDecompressEnd47609 +Node: High-level interface48042 +Node: BZ2_bzReadOpen50331 +Node: BZ2_bzRead52221 +Node: BZ2_bzReadGetUnused54849 +Node: BZ2_bzReadClose55852 +Node: BZ2_bzWriteOpen56512 +Node: BZ2_bzWrite57942 +Node: BZ2_bzWriteClose58544 +Node: Handling embedded compressed data streams60676 +Node: Standard file-reading/writing code62663 +Node: Utility functions64550 +Node: BZ2_bzBuffToBuffCompress64810 +Node: BZ2_bzBuffToBuffDecompress66900 +Node: zlib compatibility 
functions69124 +Node: Using the library in a stdio-free environment70697 +Node: Getting rid of stdio71034 +Node: Critical error handling71761 +Node: Making a Windows DLL73720 +Node: Miscellanea75070 +Node: Limitations of the compressed file format75406 +Node: Portability issues79036 +Node: Reporting bugs81092 +Node: Did you get the right package?84632 +Node: Further Reading85859 + +End Tag Table --- a/bzip2recover.c 2010-09-11 09:18:40.000000000 +1000 +++ b/bzip2recover.c 2011-12-04 18:16:28.000000000 +1100 @@ -24,6 +24,8 @@ #include <errno.h> #include <stdlib.h> #include <string.h> +#include <fcntl.h> +#include <unistd.h> /* This program records bit locations in the file to be recovered. @@ -269,6 +271,19 @@ static Bool endsInBz2 ( Char* name ) name[n-1] == '2'); } +/*---------------------------------------------*/ +/* Open an output file safely with O_EXCL and good permissions */ +FILE* fopen_output( Char* name, const char* mode ) +{ + FILE *fp; + int fh; + + fh = open(name, O_WRONLY|O_CREAT|O_EXCL, 0600); + if (fh == -1) return NULL; + fp = fdopen(fh, mode); + if (fp == NULL) close(fh); + return fp; +} /*---------------------------------------------------*/ /*--- ---*/ @@ -486,7 +501,7 @@ Int32 main ( Int32 argc, Char** argv ) fprintf ( stderr, " writing block %d to `%s' ...\n", wrBlock+1, outFileName ); - outFile = fopen ( outFileName, "wb" ); + outFile = fopen_output ( outFileName, "wb" ); if (outFile == NULL) { fprintf ( stderr, "%s: can't write `%s'\n", progName, outFileName ); --- a/bzgrep 2007-01-03 13:00:55.000000000 +1100 +++ b/bzgrep 2011-12-04 18:16:28.000000000 +1100 @@ -1,27 +1,75 @@ #!/bin/sh -# Bzgrep wrapped for bzip2, -# adapted from zgrep by Philippe Troin <phil@fifi.org> for Debian GNU/Linux. 
-## zgrep notice: -## zgrep -- a wrapper around a grep program that decompresses files as needed -## Adapted from a version sent by Charles Levert <charles@comm.polymtl.ca> +# bzgrep -- a wrapper around a grep program that decompresses files as needed +# Adapted from zgrep of the Debian gzip package by Anibal Monsalve Salazar. +# Adapted from a version sent by Charles Levert <charles@comm.polymtl.ca> + +# Copyright (C) 1998, 2001, 2002 Free Software Foundation +# Copyright (C) 1993 Jean-loup Gailly + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA +# 02111-1307, USA. 
PATH="/usr/bin:$PATH"; export PATH -prog=`echo $0 | sed 's|.*/||'` +prog=`echo "$0" | sed 's|.*/||'` case "$prog" in *egrep) grep=${EGREP-egrep} ;; *fgrep) grep=${FGREP-fgrep} ;; *) grep=${GREP-grep} ;; esac + pat="" +after_dash_dash="" +files_with_matches=0 +files_without_matches=0 +no_filename=0 +with_filename=0 + while test $# -ne 0; do - case "$1" in - -e | -f) opt="$opt $1"; shift; pat="$1" + case "$after_dash_dash$1" in + --d* | --rec*) echo >&2 "$0: $1: option not supported"; exit 2;; + --files-with-*) files_with_matches=1;; + --files-witho*) files_without_matches=1;; + --no-f*) no_filename=1;; + --wi*) with_filename=1;; + --*) ;; + -*) + case "$1" in + -*[dr]*) echo >&2 "$0: $1: option not supported"; exit 2;; + esac + case "$1" in + -*H*) with_filename=1;; + esac + case "$1" in + -*h*) no_filename=1;; + esac + case "$1" in + -*L*) files_without_matches=1;; + esac + case "$1" in + -*l*) files_with_matches=1;; + esac;; + esac + case "$after_dash_dash$1" in + -[ef]) opt="$opt $1"; shift; pat="$1" if test "$grep" = grep; then # grep is buggy with -e on SVR4 grep=egrep fi;; - -A | -B) opt="$opt $1 $2"; shift;; + -[ABCdm])opt="$opt $1 $2"; shift;; + --) opt="$opt $1"; after_dash_dash=1;; -*) opt="$opt $1";; *) if test -z "$pat"; then pat="$1" @@ -35,19 +83,9 @@ done if test -z "$pat"; then echo "grep through bzip2 files" echo "usage: $prog [grep_options] pattern [files]" - exit 1 + exit 2 fi -list=0 -silent=0 -op=`echo "$opt" | sed -e 's/ //g' -e 's/-//g'` -case "$op" in - *l*) list=1 -esac -case "$op" in - *h*) silent=1 -esac - if test $# -eq 0; then bzip2 -cdfq | $grep $opt "$pat" exit $? @@ -55,21 +93,40 @@ fi res=0 for i do - if test -f "$i"; then :; else if test -f "$i.bz2"; then i="$i.bz2"; fi; fi - if test $list -eq 1; then - bzip2 -cdfq "$i" | $grep $opt "$pat" 2>&1 > /dev/null && echo $i - r=$? - elif test $# -eq 1 -o $silent -eq 1; then - bzip2 -cdfq "$i" | $grep $opt "$pat" - r=$? 
- else - j=${i//\\/\\\\} - j=${j//|/\\|} - j=${j//&/\\&} - j=`printf "%s" "$j" | tr '\n' ' '` - bzip2 -cdfq "$i" | $grep $opt "$pat" | sed "s|^|${j}:|" - r=$? - fi - test "$r" -ne 0 && res="$r" + bzip2 -cdfq -- "$i" | + if test $files_with_matches -eq 1; then + $grep $opt "$pat" > /dev/null && printf "%s\n" "$i" + elif test $files_without_matches -eq 1; then + $grep $opt "$pat" > /dev/null || printf "%s\n" "$i" + elif test $with_filename -eq 0 && { test $# -eq 1 || test $no_filename -eq 1; }; then + $grep $opt "$pat" + else + i=$(echo "$i" | sed -e 's/[\\|&]/\\&/g') + if test $with_filename -eq 1; then + sed_script="s|^[^:]*:|${i}:|" + else + sed_script="s|^|${i}:|" + fi + # Hack adapted from GPLed code at + # http://home.comcast.net/~j.p.h/cus-faq-2 + # Has the same effect as the following two lines of bash: + # + # $grep $opt "$pat" | sed "$sed_script" + # exit ${PIPESTATUS[0]} + # + # Inside the `...`, fd4 goes to the pipe whose other end is read + # and passed to eval; fd1 is the normal standard output + # preserved the line before with exec 3>&1 + exec 3>&1 + eval ` + exec 4>&1 >&3 3>&- + { + $grep $opt "$pat" 4>&-; echo "r=$?;" >&4 + } | sed "$sed_script" + ` + exit $r + fi + r=$? 
+ test $res -lt $r && res=$r done exit $res --- a/bzdiff 2007-01-03 13:00:55.000000000 +1100 +++ b/bzdiff 2011-12-04 18:16:28.000000000 +1100 @@ -37,10 +37,6 @@ if test -z "$FILES"; then echo "Usage: $prog [${comp}_options] file [file]" exit 1 fi -tmp=`mktemp ${TMPDIR:-/tmp}/bzdiff.XXXXXXXXXX` || { - echo 'cannot create a temporary file' >&2 - exit 1 -} set $FILES if test $# -eq 1; then FILE=`echo "$1" | sed 's/.bz2$//'` @@ -53,10 +49,14 @@ elif test $# -eq 2; then case "$2" in *.bz2) F=`echo "$2" | sed 's|.*/||;s|.bz2$||'` - bzip2 -cdfq "$2" > $tmp - bzip2 -cdfq "$1" | $comp $OPTIONS - $tmp + tmp=`mktemp "${TMPDIR:-/tmp}"/bzdiff.XXXXXXXXXX` || { + echo 'cannot create a temporary file' >&2 + exit 1 + } + bzip2 -cdfq "$2" > "$tmp" + bzip2 -cdfq "$1" | $comp $OPTIONS - "$tmp" STAT="$?" - /bin/rm -f $tmp;; + /bin/rm -f "$tmp";; *) bzip2 -cdfq "$1" | $comp $OPTIONS - "$2" STAT="$?";; @@ -69,8 +69,8 @@ elif test $# -eq 2; then STAT="$?";; esac;; esac - exit "$STAT" else echo "Usage: $prog [${comp}_options] file [file]" exit 1 fi +exit "$STAT" --- a/manual.xml 2010-09-11 19:36:06.000000000 +1000 +++ b/manual.xml 2011-12-04 18:16:28.000000000 +1100 @@ -159,13 +159,22 @@ else.</para> <listitem><para><computeroutput>bzip2</computeroutput> [ -cdfkqstvzVL123456789 ] [ filenames ... ]</para></listitem> + + <listitem><para><computeroutput>bzip2</computeroutput> [ + -h | --help ]</para></listitem> <listitem><para><computeroutput>bunzip2</computeroutput> [ -fkvsVL ] [ filenames ... ]</para></listitem> + <listitem><para><computeroutput>bunzip2</computeroutput> [ + -h | --help ]</para></listitem> + <listitem><para><computeroutput>bzcat</computeroutput> [ -s ] [ filenames ... 
]</para></listitem> + <listitem><para><computeroutput>bzcat</computeroutput> [ + -h | --help ]</para></listitem> + <listitem><para><computeroutput>bzip2recover</computeroutput> filename</para></listitem> @@ -397,6 +406,10 @@ consistency error (eg, bug) which caused will not be suppressed.</para></listitem> </varlistentry> + <varlistentry><term><computeroutput>-h --help</computeroutput></term> + <listitem><para>Print a help message and exit.</para></listitem> + </varlistentry> + <varlistentry> <term><computeroutput>-v --verbose</computeroutput></term> <listitem><para>Verbose mode -- show the compression ratio for @@ -1162,9 +1175,9 @@ BZ_CONFIG_ERROR if the library has been mis-compiled BZ_PARAM_ERROR if strm is NULL - or blockSize < 1 or blockSize > 9 - or verbosity < 0 or verbosity > 4 - or workFactor < 0 or workFactor > 250 + or blockSize < 1 or blockSize > 9 + or verbosity < 0 or verbosity > 4 + or workFactor < 0 or workFactor > 250 BZ_MEM_ERROR if not enough memory is available BZ_OK @@ -1474,8 +1487,8 @@ could fail with BZ_CONFIG_ERROR if the library has been mis-compiled BZ_PARAM_ERROR - if ( small != 0 && small != 1 ) - or (verbosity <; 0 || verbosity > 4) + if ( small != 0 && small != 1 ) + or (verbosity < 0 || verbosity > 4) BZ_MEM_ERROR if insufficient memory is available </programlisting> @@ -1560,7 +1573,7 @@ and release memory.</para> <programlisting> BZ_PARAM_ERROR if strm is NULL or strm->s is NULL - or strm->avail_out < 1 + or strm->avail_out < 1 BZ_DATA_ERROR if a data integrity error is detected in the compressed stream BZ_DATA_ERROR_MAGIC @@ -1733,8 +1746,8 @@ BZ_CONFIG_ERROR BZ_PARAM_ERROR if f is NULL or small is neither 0 nor 1 - or ( unused == NULL && nUnused != 0 ) - or ( unused != NULL && !(0 <= nUnused <= BZ_MAX_UNUSED) ) + or ( unused == NULL && nUnused != 0 ) + or ( unused != NULL && !(0 <= nUnused <= BZ_MAX_UNUSED) ) BZ_IO_ERROR if ferror(f) is nonzero BZ_MEM_ERROR @@ -1813,7 +1826,7 @@ immediately before <programlisting> BZ_PARAM_ERROR 
- if b is NULL or buf is NULL or len < 0 + if b is NULL or buf is NULL or len < 0 BZ_SEQUENCE_ERROR if b was opened with BZ2_bzWriteOpen BZ_IO_ERROR @@ -1971,7 +1984,7 @@ BZ_CONFIG_ERROR if the library has been mis-compiled BZ_PARAM_ERROR if f is NULL - or blockSize100k < 1 or blockSize100k > 9 + or blockSize100k < 1 or blockSize100k > 9 BZ_IO_ERROR if ferror(f) is nonzero BZ_MEM_ERROR @@ -2018,7 +2031,7 @@ compressed and written to the file.</par <programlisting> BZ_PARAM_ERROR - if b is NULL or buf is NULL or len < 0 + if b is NULL or buf is NULL or len < 0 BZ_SEQUENCE_ERROR if b was opened with BZ2_bzReadOpen BZ_IO_ERROR @@ -2169,7 +2182,7 @@ f = fopen ( "myfile.bz2", "w" ); if ( !f ) { /* handle error */ } -b = BZ2_bzWriteOpen( &bzerror, f, 9 ); +b = BZ2_bzWriteOpen( &bzerror, f, 9 ); if (bzerror != BZ_OK) { BZ2_bzWriteClose ( b ); /* handle error */ @@ -2177,14 +2190,14 @@ if (bzerror != BZ_OK) { while ( /* condition */ ) { /* get data to write into buf, and set nBuf appropriately */ - nWritten = BZ2_bzWrite ( &bzerror, b, buf, nBuf ); + nWritten = BZ2_bzWrite ( &bzerror, b, buf, nBuf ); if (bzerror == BZ_IO_ERROR) { - BZ2_bzWriteClose ( &bzerror, b ); + BZ2_bzWriteClose ( &bzerror, b ); /* handle error */ } } -BZ2_bzWriteClose( &bzerror, b ); +BZ2_bzWriteClose( &bzerror, b ); if (bzerror == BZ_IO_ERROR) { /* handle error */ } @@ -2204,24 +2217,24 @@ f = fopen ( "myfile.bz2", "r" ); if ( !f ) { /* handle error */ } -b = BZ2_bzReadOpen ( &bzerror, f, 0, NULL, 0 ); +b = BZ2_bzReadOpen ( &bzerror, f, 0, NULL, 0 ); if ( bzerror != BZ_OK ) { - BZ2_bzReadClose ( &bzerror, b ); + BZ2_bzReadClose ( &bzerror, b ); /* handle error */ } bzerror = BZ_OK; -while ( bzerror == BZ_OK && /* arbitrary other conditions */) { - nBuf = BZ2_bzRead ( &bzerror, b, buf, /* size of buf */ ); +while ( bzerror == BZ_OK && /* arbitrary other conditions */) { + nBuf = BZ2_bzRead ( &bzerror, b, buf, /* size of buf */ ); if ( bzerror == BZ_OK ) { /* do something with buf[0 .. 
nBuf-1] */ } } if ( bzerror != BZ_STREAM_END ) { - BZ2_bzReadClose ( &bzerror, b ); + BZ2_bzReadClose ( &bzerror, b ); /* handle error */ } else { - BZ2_bzReadClose ( &bzerror, b ); + BZ2_bzReadClose ( &bzerror, b ); } </programlisting> @@ -2287,9 +2300,9 @@ BZ_CONFIG_ERROR if the library has been mis-compiled BZ_PARAM_ERROR if dest is NULL or destLen is NULL - or blockSize100k < 1 or blockSize100k > 9 - or verbosity < 0 or verbosity > 4 - or workFactor < 0 or workFactor > 250 + or blockSize100k < 1 or blockSize100k > 9 + or verbosity < 0 or verbosity > 4 + or workFactor < 0 or workFactor > 250 BZ_MEM_ERROR if insufficient memory is available BZ_OUTBUFF_FULL @@ -2355,8 +2368,8 @@ BZ_CONFIG_ERROR if the library has been mis-compiled BZ_PARAM_ERROR if dest is NULL or destLen is NULL - or small != 0 && small != 1 - or verbosity < 0 or verbosity > 4 + or small != 0 && small != 1 + or verbosity < 0 or verbosity > 4 BZ_MEM_ERROR if insufficient memory is available BZ_OUTBUFF_FULL --- a/bzmore 2007-01-03 13:00:55.000000000 +1100 +++ b/bzmore 2011-12-04 18:16:28.000000000 +1100 @@ -24,10 +24,10 @@ else # 'stty min 1' resets eof to ^a on both SunOS and SysV! cb='min 1 -icanon'; ncb='icanon eof ^d' fi -if test $? -eq 0 -a -n "$oldtty"; then - trap 'stty $oldtty 2>/dev/null; exit' 0 2 3 5 10 13 15 +if test $? 
-eq 0 && test -n "$oldtty"; then + trap 'stty $oldtty 2>/dev/null; exit' 0 INT QUIT TRAP USR1 PIPE TERM else - trap 'stty $ncb echo 2>/dev/null; exit' 0 2 3 5 10 13 15 + trap 'stty $ncb echo 2>/dev/null; exit' 0 INT QUIT TRAP USR1 PIPE TERM fi if test $# = 0; then @@ -46,7 +46,7 @@ else ANS=`dd bs=1 count=1 2>/dev/null` stty $ncb echo 2>/dev/null echo " " - if test "$ANS" = 'e' -o "$ANS" = 'q'; then + if test "$ANS" = 'e' || test "$ANS" = 'q'; then exit fi fi --- a/bzip2.c 2010-09-11 09:04:53.000000000 +1000 +++ b/bzip2.c 2011-12-04 18:16:28.000000000 +1100 @@ -1890,7 +1890,9 @@ IntNative main ( IntNative argc, Char *a case '8': blockSize100k = 8; break; case '9': blockSize100k = 9; break; case 'V': - case 'L': license(); break; + case 'L': license(); + exit ( 0 ); + break; case 'v': verbosity++; break; case 'h': usage ( progName ); exit ( 0 ); @@ -1916,8 +1918,8 @@ IntNative main ( IntNative argc, Char *a if (ISFLAG("--keep")) keepInputFiles = True; else if (ISFLAG("--small")) smallMode = True; else if (ISFLAG("--quiet")) noisy = False; else - if (ISFLAG("--version")) license(); else - if (ISFLAG("--license")) license(); else + if (ISFLAG("--version")) { license(); exit ( 0 ); } else + if (ISFLAG("--license")) { license(); exit ( 0 ); } else if (ISFLAG("--exponential")) workFactor = 1; else if (ISFLAG("--repetitive-best")) redundant(aa->name); else if (ISFLAG("--repetitive-fast")) redundant(aa->name); else @@ -2003,12 +2005,14 @@ IntNative main ( IntNative argc, Char *a testf ( aa->name ); } } - if (testFailsExist && noisy) { - fprintf ( stderr, - "\n" - "You can use the `bzip2recover' program to attempt to recover\n" - "data from undamaged sections of corrupted files.\n\n" - ); + if (testFailsExist) { + if (noisy) { + fprintf ( stderr, + "\n" + "You can use the `bzip2recover' program to attempt to recover\n" + "data from undamaged sections of corrupted files.\n\n" + ); + } setExit(2); exit(exitValue); } --- a/bzexe 2011-12-04 13:55:53.589856334 +1100 +++ b/bzexe 
2011-12-04 18:16:28.000000000 +1100 @@ -0,0 +1,182 @@ +#!/bin/sh +# gzexe: compressor for Unix executables. +# Use this only for binaries that you do not use frequently. +# +# The compressed version is a shell script which decompresses itself after +# skipping $skip lines of shell commands. We try invoking the compressed +# executable with the original name (for programs looking at their name). +# We also try to retain the original file permissions on the compressed file. +# For safety reasons, gzexe will not create setuid or setgid shell scripts. + +# WARNING: the first line of this file must be either : or #!/bin/sh +# The : is required for some old versions of csh. +# On Ultrix, /bin/sh is too buggy, change the first line to: #!/bin/sh5 + + +# Copyright (C) 1998, 2002 Free Software Foundation +# Copyright (C) 1993 Jean-loup Gailly + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. + +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA +# 02111-1307, USA. + + +PATH="/usr/bin:$PATH" +x=`basename $0` +if test $# = 0; then + echo compress executables. original file foo is renamed to foo~ + echo usage: ${x} [-d] files... 
+ echo " -d decompress the executables" + exit 1 +fi + +set -C +tmp=gz$$ +trap "rm -f $tmp; exit 1" HUP INT QUIT TRAP USR1 PIPE TERM +: > $tmp || exit 1 + +decomp=0 +res=0 +test "$x" = "ungzexe" && decomp=1 +if test "x$1" = "x-d"; then + decomp=1 + shift +fi + +echo hi > zfoo1$$ || exit 1 +echo hi > zfoo2$$ || exit 1 +if test -z "`(${CPMOD-cpmod} zfoo1$$ zfoo2$$) 2>&1`"; then + cpmod=${CPMOD-cpmod} +fi +rm -f zfoo[12]$$ + +tail="" +IFS="${IFS= }"; saveifs="$IFS"; IFS="${IFS}:" +for dir in $PATH; do + test -z "$dir" && dir=. + if test -f $dir/tail; then + tail="$dir/tail" + break + fi +done +IFS="$saveifs" +if test -z "$tail"; then + echo cannot find tail + exit 1 +fi +case `echo foo | $tail -n +1 2>/dev/null` in +foo) tail="$tail -n";; +esac + +for i do + if test ! -f "$i" ; then + echo ${x}: $i not a file + res=1 + continue + fi + if test $decomp -eq 0; then + if sed -e 1d -e 2q "$i" | grep "^skip=[0-9]*$" >/dev/null; then + echo "${x}: $i is already gzexe'd" + continue + fi + fi + if ls -l "$i" | grep '^...[sS]' > /dev/null; then + echo "${x}: $i has setuid permission, unchanged" + continue + fi + if ls -l "$i" | grep '^......[sS]' > /dev/null; then + echo "${x}: $i has setgid permission, unchanged" + continue + fi + case "`basename $i`" in + bzip2 | tail | sed | chmod | ln | sleep | rm) + echo "${x}: $i would depend on itself"; continue ;; + esac + if test -z "$cpmod"; then + cp -p "$i" $tmp 2>/dev/null || cp "$i" $tmp + if test -w $tmp 2>/dev/null; then + writable=1 + else + writable=0 + chmod u+w $tmp 2>/dev/null + fi + : >| $tmp # truncate the file, ignoring set -C + fi + if test $decomp -eq 0; then + sed 1q $0 >> $tmp + sed "s|^if tail|if $tail|" >> $tmp <<'EOF' +skip=23 +set -C +umask=`umask` +umask 77 +tmpfile=`tempfile -p gztmp -d /tmp` || exit 1 +if tail +$skip "$0" | /bin/bzip2 -cd >> $tmpfile; then + umask $umask + /bin/chmod 700 $tmpfile + prog="`echo $0 | /bin/sed 's|^.*/||'`" + if /bin/ln -T $tmpfile "/tmp/$prog" 2>/dev/null; then + trap '/bin/rm -f 
$tmpfile "/tmp/$prog"; exit $res' 0 + (/bin/sleep 5; /bin/rm -f $tmpfile "/tmp/$prog") 2>/dev/null & + /tmp/"$prog" ${1+"$@"}; res=$? + else + trap '/bin/rm -f $tmpfile; exit $res' 0 + (/bin/sleep 5; /bin/rm -f $tmpfile) 2>/dev/null & + $tmpfile ${1+"$@"}; res=$? + fi +else + echo Cannot decompress $0; exit 1 +fi; exit $res +EOF + bzip2 -cv9 "$i" >> $tmp || { + /bin/rm -f $tmp + echo ${x}: compression not possible for $i, file unchanged. + res=1 + continue + } + + else + # decompression + skip=23 + if sed -e 1d -e 2q "$i" | grep "^skip=[0-9]*$" >/dev/null; then + eval `sed -e 1d -e 2q "$i"` + fi + if tail +$skip "$i" | bzip2 -cd > $tmp; then + : + else + echo ${x}: $i probably not in gzexe format, file unchanged. + res=1 + continue + fi + fi + rm -f "$i~" + mv "$i" "$i~" || { + echo ${x}: cannot backup $i as $i~ + rm -f $tmp + res=1 + continue + } + mv $tmp "$i" || cp -p $tmp "$i" 2>/dev/null || cp $tmp "$i" || { + echo ${x}: cannot create $i + rm -f $tmp + res=1 + continue + } + rm -f $tmp + if test -n "$cpmod"; then + $cpmod "$i~" "$i" 2>/dev/null + elif test $writable -eq 0; then + chmod u-w $i 2>/dev/null + fi +done +exit $res --- a/bzip2.1 2010-09-11 19:35:11.000000000 +1000 +++ b/bzip2.1 2011-12-04 18:16:28.000000000 +1100 @@ -14,6 +14,9 @@ bzip2recover \- recovers data from damag [ .I "filenames \&..." ] +.br +.B bzip2 +.RB [ " \-h|--help " ] .ll -8 .br .B bunzip2 @@ -22,12 +25,18 @@ bzip2recover \- recovers data from damag .I "filenames \&..." ] .br +.B bunzip2 +.RB [ " \-h|--help " ] +.br .B bzcat .RB [ " \-s " ] [ .I "filenames \&..." ] .br +.B bzcat +.RB [ " \-h|--help " ] +.br .B bzip2recover .I "filename" @@ -240,6 +249,9 @@ Verbose mode -- show the compression rat Further \-v's increase the verbosity level, spewing out lots of information which is primarily of interest for diagnostic purposes. .TP +.B \-h --help +Print a help message and exit. +.TP .B \-L --license -V --version Display the software version, license terms and conditions. 
.TP --- a/Makefile 2010-09-11 08:46:02.000000000 +1000 +++ b/Makefile 2011-12-04 18:16:28.000000000 +1100 @@ -12,6 +12,8 @@ # in the file LICENSE. # ------------------------------------------------------------------ +somajor=1.0 +sominor=$(somajor).4 SHELL=/bin/sh # To assist in cross-compiling @@ -21,7 +23,7 @@ RANLIB=ranlib LDFLAGS= BIGFILES=-D_FILE_OFFSET_BITS=64 -CFLAGS=-Wall -Winline -O2 -g $(BIGFILES) +CFLAGS=-Wall -Winline -O2 -g $(BIGFILES) $(DEBCFLAGS) # Where you want it installed when you do 'make install' PREFIX=/usr/local @@ -35,9 +37,9 @@ OBJS= blocksort.o \ decompress.o \ bzlib.o -all: libbz2.a bzip2 bzip2recover test +all: libbz2.a bzip2 bzip2recover # test -bzip2: libbz2.a bzip2.o +bzip2: libbz2.so bzip2.o $(CC) $(CFLAGS) $(LDFLAGS) -o bzip2 bzip2.o -L. -lbz2 bzip2recover: bzip2recover.o @@ -46,20 +48,42 @@ bzip2recover: bzip2recover.o libbz2.a: $(OBJS) rm -f libbz2.a $(AR) cq libbz2.a $(OBJS) - @if ( test -f $(RANLIB) -o -f /usr/bin/ranlib -o \ - -f /bin/ranlib -o -f /usr/ccs/bin/ranlib ) ; then \ + @if ( test -f $(RANLIB) || test -f /usr/bin/ranlib || \ + test -f /bin/ranlib || test -f /usr/ccs/bin/ranlib ) ; then \ echo $(RANLIB) libbz2.a ; \ $(RANLIB) libbz2.a ; \ fi +libbz2.so: libbz2.so.$(somajor) + ln -sf $^ $@ + +libbz2.so.$(somajor): libbz2.so.$(sominor) + ln -sf $^ $@ + +libbz2.so.$(sominor): $(OBJS:%.o=%.sho) + $(CC) $(LDFLAGS) -o libbz2.so.$(sominor) -shared \ + -Wl,-soname,libbz2.so.$(somajor) $^ -lc + +%.sho: %.c + $(CC) $(CFLAGS) -D_REENTRANT -fPIC -o $@ -c $< + +%.o: %.c + $(CC) $(CFLAGS) -D_REENTRANT -o $@ -c $< + check: test test: bzip2 @cat words1 + LD_LIBRARY_PATH=.:$$LD_LIBRARY_PATH \ ./bzip2 -1 < sample1.ref > sample1.rb2 + LD_LIBRARY_PATH=.:$$LD_LIBRARY_PATH \ ./bzip2 -2 < sample2.ref > sample2.rb2 + LD_LIBRARY_PATH=.:$$LD_LIBRARY_PATH \ ./bzip2 -3 < sample3.ref > sample3.rb2 + LD_LIBRARY_PATH=.:$$LD_LIBRARY_PATH \ ./bzip2 -d < sample1.bz2 > sample1.tst + LD_LIBRARY_PATH=.:$$LD_LIBRARY_PATH \ ./bzip2 -d < sample2.bz2 > sample2.tst + 
LD_LIBRARY_PATH=.:$$LD_LIBRARY_PATH \ ./bzip2 -ds < sample3.bz2 > sample3.tst cmp sample1.bz2 sample1.rb2 cmp sample2.bz2 sample2.rb2 @@ -69,15 +93,15 @@ test: bzip2 cmp sample3.tst sample3.ref @cat words3 -install: bzip2 bzip2recover +install: bzip2 bzip2recover libbz2.a if ( test ! -d $(PREFIX)/bin ) ; then mkdir -p $(PREFIX)/bin ; fi if ( test ! -d $(PREFIX)/lib ) ; then mkdir -p $(PREFIX)/lib ; fi if ( test ! -d $(PREFIX)/man ) ; then mkdir -p $(PREFIX)/man ; fi if ( test ! -d $(PREFIX)/man/man1 ) ; then mkdir -p $(PREFIX)/man/man1 ; fi if ( test ! -d $(PREFIX)/include ) ; then mkdir -p $(PREFIX)/include ; fi cp -f bzip2 $(PREFIX)/bin/bzip2 - cp -f bzip2 $(PREFIX)/bin/bunzip2 - cp -f bzip2 $(PREFIX)/bin/bzcat + ln -f $(PREFIX)/bin/bzip2 $(PREFIX)/bin/bunzip2 + ln -f $(PREFIX)/bin/bzip2 $(PREFIX)/bin/bzcat cp -f bzip2recover $(PREFIX)/bin/bzip2recover chmod a+x $(PREFIX)/bin/bzip2 chmod a+x $(PREFIX)/bin/bunzip2 @@ -87,8 +111,10 @@ install: bzip2 bzip2recover chmod a+r $(PREFIX)/man/man1/bzip2.1 cp -f bzlib.h $(PREFIX)/include chmod a+r $(PREFIX)/include/bzlib.h - cp -f libbz2.a $(PREFIX)/lib + cp -fa libbz2.a libbz2.so* $(PREFIX)/lib chmod a+r $(PREFIX)/lib/libbz2.a + cp -f bzexe $(PREFIX)/bin/bzexe + chmod a+x $(PREFIX)/bin/bzexe cp -f bzgrep $(PREFIX)/bin/bzgrep ln -s -f $(PREFIX)/bin/bzgrep $(PREFIX)/bin/bzegrep ln -s -f $(PREFIX)/bin/bzgrep $(PREFIX)/bin/bzfgrep @@ -99,7 +125,8 @@ install: bzip2 bzip2recover cp -f bzdiff $(PREFIX)/bin/bzdiff ln -s -f $(PREFIX)/bin/bzdiff $(PREFIX)/bin/bzcmp chmod a+x $(PREFIX)/bin/bzdiff - cp -f bzgrep.1 bzmore.1 bzdiff.1 $(PREFIX)/man/man1 + cp -f bzexe.1 bzgrep.1 bzmore.1 bzdiff.1 $(PREFIX)/man/man1 + chmod a+r $(PREFIX)/man/man1/bzexe.1 chmod a+r $(PREFIX)/man/man1/bzgrep.1 chmod a+r $(PREFIX)/man/man1/bzmore.1 chmod a+r $(PREFIX)/man/man1/bzdiff.1 @@ -109,33 +136,13 @@ install: bzip2 bzip2recover echo ".so man1/bzdiff.1" > $(PREFIX)/man/man1/bzcmp.1 clean: - rm -f *.o libbz2.a bzip2 bzip2recover \ + rm -f *.o *.sho libbz2.a 
libbz2.so* bzip2 bzip2recover \ sample1.rb2 sample2.rb2 sample3.rb2 \ sample1.tst sample2.tst sample3.tst -blocksort.o: blocksort.c - @cat words0 - $(CC) $(CFLAGS) -c blocksort.c -huffman.o: huffman.c - $(CC) $(CFLAGS) -c huffman.c -crctable.o: crctable.c - $(CC) $(CFLAGS) -c crctable.c -randtable.o: randtable.c - $(CC) $(CFLAGS) -c randtable.c -compress.o: compress.c - $(CC) $(CFLAGS) -c compress.c -decompress.o: decompress.c - $(CC) $(CFLAGS) -c decompress.c -bzlib.o: bzlib.c - $(CC) $(CFLAGS) -c bzlib.c -bzip2.o: bzip2.c - $(CC) $(CFLAGS) -c bzip2.c -bzip2recover.o: bzip2recover.c - $(CC) $(CFLAGS) -c bzip2recover.c - distclean: clean - rm -f manual.ps manual.html manual.pdf + #rm -f manual.ps manual.html manual.pdf DISTNAME=bzip2-1.0.6 dist: check manual @@ -187,6 +194,8 @@ dist: check manual $(DISTNAME)/bzdiff.1 \ $(DISTNAME)/bzmore \ $(DISTNAME)/bzmore.1 \ + $(DISTNAME)/bzexe \ + $(DISTNAME)/bzexe.1 \ $(DISTNAME)/bzgrep \ $(DISTNAME)/bzgrep.1 \ $(DISTNAME)/Makefile-libbz2_so \