// zbackup/compression.cc

// Copyright (c) 2012-2014 Konstantin Isakov <ikm@zbackup.org> and ZBackup contributors, see CONTRIBUTORS
// Part of ZBackup. Licensed under GNU GPLv2 or later + OpenSSL, see LICENSE
#include <string>
#include "compression.hh"
#include "check.hh"
#include "endian.hh"
namespace Compression {
EnDecoder::EnDecoder()
{
}
EnDecoder::~EnDecoder()
{
}
CompressionMethod::~CompressionMethod()
{
}
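// How an EnDecoder is typically driven (a sketch of the calling pattern only,
// inferred from the interface used in this file; the real callers live
// elsewhere in ZBackup, and 'chunk', 'outBuf' and 'consume' are placeholders):
//
//   sptr< EnDecoder > encoder =
//       CompressionMethod::selectedCompression->createEncoder();
//   encoder->setInput( chunk.data(), chunk.size() );
//   for ( ; ; )
//   {
//     encoder->setOutput( outBuf, sizeof( outBuf ) );
//     // finish == true tells the coder that this is all the input;
//     // process() returns true once the whole stream has been emitted
//     bool done = encoder->process( true );
//     consume( outBuf, sizeof( outBuf ) - encoder->getAvailableOutput() );
//     if ( done )
//       break;
//   }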
// LZMA
#include <lzma.h>
class LZMAEnDecoder : public EnDecoder
{
protected:
static lzma_stream initValue;
lzma_stream strm;
public:
LZMAEnDecoder()
{
strm = initValue;
}
void setInput( const void* data, size_t size )
{
strm.next_in = (const uint8_t *) data;
strm.avail_in = size;
}
void setOutput( void* data, size_t size )
{
strm.next_out = (uint8_t *) data;
strm.avail_out = size;
}
size_t getAvailableInput()
{
return strm.avail_in;
}
size_t getAvailableOutput()
{
return strm.avail_out;
}
bool process( bool finish )
{
lzma_ret ret = lzma_code( &strm, ( finish ? LZMA_FINISH : LZMA_RUN ) );
CHECK( ret == LZMA_OK || ret == LZMA_STREAM_END, "lzma_code error: %d", (int) ret );
return ( ret == LZMA_STREAM_END );
}
~LZMAEnDecoder()
{
lzma_end( &strm );
}
};
lzma_stream LZMAEnDecoder::initValue = LZMA_STREAM_INIT;
class LZMAEncoder : public LZMAEnDecoder
{
public:
LZMAEncoder()
{
uint32_t preset = 6;
lzma_ret ret = lzma_easy_encoder( &strm, preset, LZMA_CHECK_CRC64 );
CHECK( ret == LZMA_OK, "lzma_easy_encoder error: %d", (int) ret );
}
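// The config value folds the "extreme" flag into a single number (an
// interpretation based on the mapping below): 0..9 select the plain LZMA
// presets, while 10..19 select presets 0..9 combined with LZMA_PRESET_EXTREME,
// e.g. compression_level 16 becomes preset 6 | LZMA_PRESET_EXTREME.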
LZMAEncoder( Config const & config )
{
uint32_t compressionLevel = config.GET_STORABLE( lzma, compression_level );
uint32_t preset = ( compressionLevel > 9 ) ?
( compressionLevel - 10 ) | LZMA_PRESET_EXTREME :
compressionLevel;
lzma_ret ret = lzma_easy_encoder( &strm, preset, LZMA_CHECK_CRC64 );
CHECK( ret == LZMA_OK, "lzma_easy_encoder error: %d", (int) ret );
}
};
class LZMADecoder : public LZMAEnDecoder
{
public:
LZMADecoder()
{
lzma_ret ret = lzma_stream_decoder( &strm, UINT64_MAX, 0 );
CHECK( ret == LZMA_OK,"lzma_stream_decoder error: %d", (int) ret );
}
};
class LZMACompression : public CompressionMethod
{
public:
sptr<EnDecoder> createEncoder( Config const & config ) const
{
return new LZMAEncoder( config );
}
sptr<EnDecoder> createEncoder() const
{
return new LZMAEncoder();
}
sptr<EnDecoder> createDecoder() const
{
return new LZMADecoder();
}
std::string getName() const { return "lzma"; }
};
// LZO
// liblzo implements a lot of algorithms "for unlimited backward compatibility"
// The web site says:
// "My experiments have shown that LZO1B is good with a large blocksize
// or with very redundant data, LZO1F is good with a small blocksize or
// with binary data and that LZO1X is often the best choice of all.
// LZO1Y and LZO1Z are almost identical to LZO1X - they can achieve a
// better compression ratio on some files.
// Beware, your mileage may vary."
// => I'm using LZO1X, as suggested
#include <string.h>
// Unfortunately, liblzo always operates on the complete data block, so it doesn't
// support the streaming approach that most other libraries use. This means that
// we have to collect all data in one big buffer. The class NoStreamEnDecoder
// implements this, so it can be reused if we ever add another library like liblzo.
// Collect all data and process it in one pass
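// A subclass only has to supply the four hooks declared below; roughly
// (a sketch only; 'MyEnDecoder' is a placeholder, the real examples are the
// LZO classes further down):
//
//   class MyEnDecoder : public NoStreamEnDecoder
//   {
//   protected:
//     bool shouldTryWith( const char* in, size_t inSize, size_t outSize );
//     size_t suggestOutputSize( const char* in, size_t inSize );
//     bool isCompleteInput( const char* in, size_t inSize );
//     bool doProcess( const char* in, size_t inSize,
//                     char* out, size_t outSize, size_t& outputSize );
//   };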
class NoStreamEnDecoder : public EnDecoder
{
std::string accDataIn, accDataOut;
const char* dataIn;
char* dataOut;
size_t availIn, availOut;
bool processed;
size_t posInAccDataOut;
protected:
// you must implement these:
// Should we try with the existing output buffer which has availOut
// bytes of free space? If you know that this will fail, return false.
// You may peek into dataIn, which contains the complete input data.
virtual bool shouldTryWith( const char* dataIn, size_t availIn, size_t availOut ) =0;
// We will allocate a buffer for the output data. How big should it be?
// You may peek into dataIn, which contains the complete input data.
virtual size_t suggestOutputSize( const char* dataIn, size_t availIn ) =0;
// Is this input complete?
// An encoder should return false.
virtual bool isCompleteInput( const char* dataIn, size_t availIn ) =0;
// Process the data in dataIn and put the result into dataOut. You mustn't
// write more than availOut bytes! If the output buffer is big enough,
// process the data and store the output size in outputSize. If the output
// buffer is too small, return false and we will give you a bigger one. If
// any other error occurs, abort the program. We don't have any better
// error handling. Sorry. Do NOT return false for errors that won't be
// remedied by a bigger buffer!
virtual bool doProcess( const char* dataIn, size_t availIn,
char* dataOut, size_t availOut, size_t& outputSize ) =0;
void setUnusedInput( size_t unused )
{
this->dataIn += availIn - unused;
this->availIn = unused;
}
public:
NoStreamEnDecoder()
{
dataIn = dataOut = NULL;
availIn = availOut = posInAccDataOut = 0;
processed = false;
}
void setInput( const void* data, size_t size )
{
dataIn = (const char *) data;
availIn = size;
}
void setOutput( void* data, size_t size )
{
dataOut = (char *) data;
availOut = size;
}
size_t getAvailableInput()
{
return availIn;
}
size_t getAvailableOutput()
{
return availOut;
}
bool process( bool finish )
{
// try to process the input, if we haven't done it, yet
if ( !processed )
{
// data has not been encoded
if ( accDataIn.empty() )
{
// this is the first piece of data
if ( finish || isCompleteInput( dataIn, availIn ) )
{
// special case: all the data has been passed at once
// -> process it without using accDataIn
processFinish( dataIn, availIn );
}
}
// if we didn't process the data, put it into accumulator
if ( !processed )
{
// accumulate data in accDataIn
accDataIn.append( dataIn, availIn );
// If this was the last bit of data, we process it, now.
if ( finish || isCompleteInput( accDataIn.data(), accDataIn.size() ) )
{
processFinish( accDataIn.data(), accDataIn.size() );
}
}
}
// If the input has been processed, try to copy some of it to the output buffer.
if ( processed )
{
// data has been encoded or decoded, remaining output is in accDataOut
// -> copy to output
if ( availOut > 0 && accDataOut.size() - posInAccDataOut > 0 )
{
size_t sz = availOut;
if ( sz > accDataOut.size() - posInAccDataOut )
sz = accDataOut.size() - posInAccDataOut;
memcpy( dataOut, accDataOut.data() + posInAccDataOut, sz );
dataOut += sz;
availOut -= sz;
posInAccDataOut += sz;
}
// no more data left? -> return true
return ( accDataOut.size() - posInAccDataOut == 0 );
}
else
{
// not yet processed, so we cannot be done
return false;
}
}
private:
void processFinish( const char* dataIn, size_t availIn )
{
// should we try with the existing output buffer?
if ( shouldTryWith( dataIn, availIn, availOut ) )
{
size_t outputSize;
if ( doProcess( dataIn, availIn, dataOut, availOut, outputSize ) )
{
// it worked :-)
processed = true;
availOut -= outputSize;
return ;
}
}
// we use our own buffer
size_t bufferSize = suggestOutputSize( dataIn, availIn );
do {
accDataOut.resize( bufferSize );
size_t outputSize;
//TODO doc says we mustn't modify the pointer returned by data()...
if ( doProcess( dataIn, availIn,
(char*) accDataOut.data(), bufferSize, outputSize ) )
{
// buffer is big enough
accDataOut.resize( outputSize );
processed = true;
return ;
}
// try a bigger one
bufferSize *= 2;
} while (true);
}
};
#ifdef __APPLE__
#include <machine/endian.h>
#elif defined( __FreeBSD__ )
#include <sys/endian.h>
#else
#include <endian.h>
#endif
// like NoStreamEnDecoder, but also adds the uncompressed size before the stream
//NOTE You should make sure that the compression function doesn't overwrite any
// memory, if this information is corrupted! This could be exploited by a
// malicious person and there is nothing I can do about it. I could check for
// an overflow, but when control gets back to this class, it is already too
// late, as one 'ret' instruction is enough to do harm.
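// On-disk layout written by NoStreamAndUnknownSizeEncoder and parsed by this
// decoder (taken from the two doProcess() implementations below; both sizes
// are little-endian uint32_t values, each stored in an 8-byte slot to leave
// room for a future 64-bit size):
//
//   offset  0: uncompressed size (uint32_t LE), followed by 4 unused bytes
//   offset  8: compressed size   (uint32_t LE), followed by 4 unused bytes
//   offset 16: compressed data, 'compressed size' bytes long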
class NoStreamAndUnknownSizeDecoder : public NoStreamEnDecoder
{
protected:
// You implement this one:
// If you don't know the real decoded size, don't change outputSize.
virtual bool doProcessNoSize( const char* dataIn, size_t availIn,
char* dataOut, size_t availOut, size_t& outputSize ) =0;
bool shouldTryWith( const char* dataIn, size_t availIn, size_t availOut )
{
return suggestOutputSize( dataIn, availIn ) <= availOut;
}
// Is this input complete?
bool isCompleteInput( const char* dataIn, size_t availIn )
{
if ( availIn < 2*sizeof(uint64_t) )
return false;
dataIn += sizeof(uint64_t);
size_t inputSize = le32toh( *(uint32_t*) dataIn );
return ( availIn >= inputSize + 2*sizeof(uint64_t) );
}
size_t suggestOutputSize( const char* dataIn, size_t availIn )
{
CHECK( availIn >= sizeof(uint64_t), "not enough input data" );
// We're not using size_t because we need a type that has the same size on all
// architectures. A 32-bit host won't be able to open files with more than
// 4GB (actually much less), so 4 bytes are enough. Even a 64-bit host would
// have some trouble with allocating 8GB of RAM just for our buffers ;-)
//NOTE If your compiler doesn't accept this cast, your size_t is smaller than
// uint32_t. In that case, you are in trouble...
size_t outputSize = le32toh( *(uint32_t*) dataIn );
return outputSize;
}
bool doProcess( const char* dataIn, size_t availIn,
char* dataOut, size_t availOut, size_t& outputSize )
{
if ( availIn < 2*sizeof( uint64_t ) )
return false;
//NOTE We skip 8 bytes. If we later decide to drop compatibility with 32-bit
// hosts, we can save a 64-bit size. Well, that will be much later, when
// we can easily hold two copies of a 4GB file in main memory :-D
size_t neededOutputSize = le32toh( *(uint32_t*) dataIn );
dataIn += sizeof(uint64_t);
size_t inputSize = le32toh( *(uint32_t*) dataIn );
dataIn += sizeof(uint64_t);
/*if ( outputSize < neededOutputSize )
return false;*/
outputSize = neededOutputSize;
availIn -= 2*sizeof( uint64_t );
// We might not need all of our input data.
setUnusedInput( availIn - inputSize );
availIn = inputSize;
size_t reportedOutputSize = neededOutputSize;
if ( !doProcessNoSize( dataIn, availIn, dataOut, availOut, reportedOutputSize ) )
return false;
CHECK( reportedOutputSize == neededOutputSize,
"Size of decoded data is different than expected" );
return true;
}
};
// encoder for NoStreamAndUnknownSizeDecoder
class NoStreamAndUnknownSizeEncoder : public NoStreamEnDecoder
{
protected:
// You implement this one:
virtual bool doProcessNoSize( const char* dataIn, size_t availIn,
char* dataOut, size_t availOut, size_t& outputSize ) =0;
bool shouldTryWith( const char*, size_t, size_t availOut )
{
// The caller's buffer is only worth trying if it can at least hold our size
// header; doProcessNoSize() will still report failure if the remaining space
// turns out to be too small for the compressed data.
return availOut > getOverhead();
}
bool isCompleteInput( const char* dataIn, size_t availIn )
{
// We cannot know whether the user wants to send more data.
// -> return false; user must use finish=true to signal end of data
return false;
}
size_t getOverhead()
{
return 2*sizeof( uint64_t );
}
size_t suggestOutputSize( const char*, size_t availIn )
{
// We assume that the compression won't make the data any bigger.
return availIn + getOverhead();
}
bool doProcess( const char* dataIn, size_t availIn,
char* dataOut, size_t availOut, size_t& outputSize )
{
CHECK( availIn <= UINT32_MAX,
"You want to compress more than 4GB of data?! Sorry, we don't support that, yet." );
// zero the whole header first so we never write uninitialized padding bytes
memset( dataOut, 0, getOverhead() );
// store the uncompressed size in the first slot (little-endian)
*(uint32_t*)dataOut = htole32( (uint32_t) availIn );
uint32_t* compressedSize = (uint32_t*) ( dataOut + sizeof( uint64_t ) );
// compressed data goes after the size
// We skip more than we actually use; see NoStreamAndUnknownSizeDecoder::doProcess(...).
dataOut += getOverhead();
availOut -= getOverhead();
if ( !doProcessNoSize( dataIn, availIn, dataOut, availOut, outputSize ) )
return false;
CHECK( outputSize <= UINT32_MAX,
"The compressed data is more than 4GB?! Sorry, we don't support that, yet." );
*compressedSize = htole32( (uint32_t) outputSize );
outputSize += getOverhead();
return true;
}
};
#ifdef HAVE_LIBLZO
#include <lzo/lzo1x.h>
// finally, we can implement lzo
class LZO1X_1_Decoder : public NoStreamAndUnknownSizeDecoder
{
protected:
bool doProcessNoSize( const char* dataIn, size_t availIn,
char* dataOut, size_t availOut, size_t& outputSize )
{
// same argument is used for available output size and size of decompressed data
outputSize = availOut;
int ret = lzo1x_decompress_safe( (const lzo_bytep) dataIn, availIn,
(lzo_bytep) dataOut, (lzo_uintp) &outputSize, NULL );
if ( ret == LZO_E_OUTPUT_OVERRUN )
return false;
CHECK( ret >= LZO_E_OK, "lzo1x_decompress_safe failed (code %d)", ret );
return true;
}
};
class LZO1X_1_Compression;
class LZO1X_1_Encoder : public NoStreamAndUnknownSizeEncoder
{
const LZO1X_1_Compression* compression;
static size_t calcMaxCompressedSize( size_t availIn );
public:
LZO1X_1_Encoder( const LZO1X_1_Compression* compression )
{
this->compression = compression;
}
protected:
bool doProcessNoSize( const char* dataIn, size_t availIn,
char* dataOut, size_t availOut, size_t& outputSize );
bool shouldTryWith( const char*, size_t, size_t availOut );
size_t suggestOutputSize( const char*, size_t availIn );
};
class LZO1X_1_Compression : public CompressionMethod
{
static bool initialized;
static void init()
{
//TODO This is not thread-safe. Does it have to be?
if (!initialized)
{
int ret = lzo_init();
CHECK( ret == LZO_E_OK, "lzo_init failed (%d)", ret );
initialized = true;
}
}
public:
sptr< EnDecoder > createEncoder( Config const & config ) const
{
init();
return new LZO1X_1_Encoder(this);
}
sptr< EnDecoder > createEncoder() const
{
init();
return new LZO1X_1_Encoder(this);
}
sptr< EnDecoder > createDecoder() const
{
init();
return new LZO1X_1_Decoder();
}
std::string getName() const { return "lzo1x_1"; }
lzo_voidp getWorkmem( size_t size ) const
{
return new char[size];
}
void giveBackWorkmem( lzo_voidp wrkmem ) const
{
//TODO I think we should keep the memory around and reuse it. After all
// it is only a few kilobytes and we will need it a lot. However, I
// won't risk anything here because I don't know whether this will be
// called by more than one thread.
delete[] (char*)wrkmem;
}
};
bool LZO1X_1_Compression::initialized = false;
size_t LZO1X_1_Encoder::calcMaxCompressedSize( size_t availIn )
{
// It seems that lzo1x_1_compress does NOT check whether the buffer is big enough.
// The documentation refers to example/simple.c which says:
// "Because the input block may be incompressible, we must provide a little more
// output space in case that compression is not possible."
// -> We use the same formula.
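// For example, a 65536-byte input needs 65536 + 65536/16 + 64 + 3 = 69699
// bytes of worst-case output space; the size header of the base class is
// added on top of that in suggestOutputSize() below.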
return (availIn + availIn / 16 + 64 + 3);
}
bool LZO1X_1_Encoder::shouldTryWith( const char* dataIn, size_t availIn, size_t availOut )
{
return availOut >= suggestOutputSize( dataIn, availIn );
}
size_t LZO1X_1_Encoder::suggestOutputSize( const char*, size_t availIn )
{
// See calcMaxCompressedSize() above for why we add some slack to the input size.
return calcMaxCompressedSize( availIn ) + getOverhead();
}
bool LZO1X_1_Encoder::doProcessNoSize( const char* dataIn, size_t availIn,
char* dataOut, size_t availOut, size_t& outputSize )
{
// It seems that lzo1x_1_compress does NOT check whether the buffer is big enough.
// Therefore, we won't try it unless we are sure that the buffer is big enough.
if ( availOut < calcMaxCompressedSize( availIn ) )
return false;
// same argument is used for available output size (haha, see above)
// and size of decompressed data
outputSize = availOut;
lzo_voidp wrkmem = compression->getWorkmem(LZO1X_1_MEM_COMPRESS);
int ret = lzo1x_1_compress( (const lzo_bytep) dataIn, availIn,
(lzo_bytep) dataOut, (lzo_uintp) &outputSize, wrkmem );
compression->giveBackWorkmem(wrkmem);
if ( ret == LZO_E_OUTPUT_OVERRUN )
return false;
CHECK( ret >= LZO_E_OK, "lzo1x_1_compress failed (code %d)", ret );
return true;
}
#endif // HAVE_LIBLZO
// register them
const_sptr< CompressionMethod > const CompressionMethod::compressions[] = {
new LZMACompression(),
# ifdef HAVE_LIBLZO
new LZO1X_1_Compression(),
# endif
// NULL entry marks end of list. Don't remove it!
NULL
};
const_sptr< CompressionMethod > CompressionMethod::selectedCompression =
compressions[ 0 ];
const_sptr< CompressionMethod > CompressionMethod::findCompression(
const std::string& name, bool optional )
{
for ( const const_sptr<CompressionMethod>* c = compressions + 0; *c; ++c )
{
if ( (*c)->getName() == name )
{
return (*c);
}
}
if ( !optional )
{
throw exUnsupportedCompressionMethod( name );
}
return NULL;
}
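// Typical lookup (a sketch of caller code, not part of this file):
//
//   const_sptr< CompressionMethod > method =
//       CompressionMethod::findCompression( "lzo1x_1", /* optional = */ true );
//   if ( !method )
//     ... // unknown name, or zbackup was built without liblzo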
// iterator over compressions
CompressionMethod::iterator::iterator( const const_sptr< CompressionMethod > * ptr ):
ptr( ptr )
{
}
CompressionMethod::iterator::iterator( const iterator & it ):
ptr( it.ptr )
{
}
CompressionMethod::iterator& CompressionMethod::iterator::operator =( const iterator& it )
{
this->ptr = it.ptr;
return *this;
}
bool CompressionMethod::iterator::operator ==( const iterator& other ) const
{
// special case: one has ptr==NULL (end iterator returned by end()) and the
// other has *ptr==NULL (end iterator obtained by calling ++)
if ( !ptr && ( !other.ptr || !*other.ptr ) )
return true;
else if ( !other.ptr && ( !ptr || !*ptr ) )
return true;
else
return (ptr == other.ptr);
}
bool CompressionMethod::iterator::operator !=( const iterator& other ) const
{
return !( *this == other );
}
bool CompressionMethod::iterator::atEnd() const
{
return !ptr || !*ptr;
}
CompressionMethod::iterator& CompressionMethod::iterator::operator ++()
{
CHECK( ptr && *ptr, "Cannot increment the end iterator" );
++ptr;
return *this;
}
const_sptr<CompressionMethod> CompressionMethod::iterator::operator *()
{
CHECK( ptr && *ptr, "Cannot dereference the end iterator" );
return *ptr;
}
CompressionMethod::iterator CompressionMethod::begin()
{
return iterator(compressions);
}
CompressionMethod::iterator CompressionMethod::end()
{
return iterator(NULL);
}
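// Enumerating the registered methods (a sketch of caller code, not part of
// this file), e.g. for listing the supported compression names:
//
//   for ( CompressionMethod::iterator it = CompressionMethod::begin();
//         !it.atEnd(); ++it )
//     printf( "%s\n", (*it)->getName().c_str() );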
} // namespace Compression