blob: ef728a9741c0358bb595d4be2d32814f1a22357e [file] [log] [blame]
/*
* Decoder for dedup files
*
* Copyright 2010 Princeton University.
* All rights reserved.
*
* Written by Christian Bienia.
*/
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <limits.h>
#include <unistd.h>
#include <string.h>
#include "decoder.h"
#include "dedupdef.h"
#include "config.h"
#include "util.h"
#include "hashtable.h"
#include "mbuffer.h"
#include "debug.h"
#ifdef ENABLE_GZIP_COMPRESSION
#include <zlib.h>
#endif //ENABLE_GZIP_COMPRESSION
#ifdef ENABLE_BZIP2_COMPRESSION
#include <bzlib.h>
#endif //ENABLE_BZIP2_COMPRESSION
#ifdef ENABLE_PARSEC_HOOKS
#include <hooks.h>
#endif //ENABLE_PARSEC_HOOKS
//The configuration block defined in main
//Set by Decode() from its argument; uncompress_chunk() reads conf->compress_type from it
config_t * conf;
//Hash table data structure & utility functions
//Chunk cache created in Decode(): keyed by the SHA1 sum of a chunk, the value is the
//chunk that holds the corresponding uncompressed data
struct hashtable *cache;
/*
 * Hash callback for the chunk cache.
 *
 * The hash value is simply the first unsigned int stored at the key, so the
 * key must be at least sizeof(unsigned int) bytes long and aligned for
 * unsigned int access (SHA1 sums are stored integer-aligned).
 */
static unsigned int hash_from_key_fn( void *k ) {
  const unsigned int *words = (const unsigned int *)k;
  return *words;
}
/*
 * Key comparison callback for the chunk cache.
 *
 * Returns 1 if the first SHA1_LEN bytes of both keys are identical and 0
 * otherwise.
 */
static int keys_equal_fn ( void *key1, void *key2 ) {
  if(memcmp(key1, key2, SHA1_LEN) != 0) {
    return 0;
  }
  return 1;
}
/*
 * Helper function which reads the next chunk from the input file
 *
 * Every chunk starts with a one-byte type tag followed by a length field of
 * type u_long. Depending on the tag the payload is either
 *   - a SHA1 sum (TYPE_FINGERPRINT): stored in chunk->sha1 and the chunk is
 *     marked as duplicate, or
 *   - a block of data (TYPE_COMPRESS): stored in a newly created
 *     chunk->compressed_data buffer and the chunk is marked as not duplicate.
 *
 * fd     file descriptor to read from, must be >= 0
 * chunk  chunk to fill in, must not be NULL
 *
 * Returns the size of the data read (the value of the length field), or 0 if
 * the input ended before a new chunk started. All other problems are reported
 * through EXIT_TRACE.
 */
static int read_chunk(int fd, chunk_t *chunk) {
  int r;

  assert(chunk!=NULL);
  assert(fd>=0);

  //Type tag; this is the only position at which the input may end cleanly
  u_char type;
  r=xread(fd, &type, sizeof(type));
  if(r < 0) {
    EXIT_TRACE("xread type fails\n");
  } else if(r == 0) {
    return 0;
  }

  //Length of the payload that follows
  u_long len;
  r=xread(fd, &len, sizeof(len));
  if(r < 0) {
    EXIT_TRACE("xread length fails\n");
  } else if(r == 0) {
    EXIT_TRACE("incomplete chunk\n");
  }

  switch(type) {
    case TYPE_FINGERPRINT:
      if(len!=SHA1_LEN) {
        EXIT_TRACE("incorrect size of SHA1 sum\n");
      }
      r=xread(fd, (unsigned char *)(chunk->sha1), SHA1_LEN);
      if(r < 0) {
        EXIT_TRACE("xread SHA1 sum fails\n");
      } else if(r == 0) {
        EXIT_TRACE("incomplete chunk\n");
      }
      chunk->header.isDuplicate = TRUE;
      break;
    case TYPE_COMPRESS:
      //The length is handed back to the caller as an int. Reject values that do not
      //fit: after narrowing they could turn into 0 (which the caller takes for the end
      //of the input) or into a negative number.
      if(len==0 || len>(u_long)INT_MAX) {
        EXIT_TRACE("illegal size of data chunk\n");
      }
      r = mbuffer_create(&chunk->compressed_data, len);
      if(r != 0) {
        EXIT_TRACE("Creation of input buffer failed.\n");
      }
      r=xread(fd, (unsigned char *)(chunk->compressed_data.ptr), chunk->compressed_data.n);
      if(r < 0) {
        EXIT_TRACE("xread data chunk fails\n");
      } else if(r == 0) {
        EXIT_TRACE("incomplete chunk\n");
      }
      chunk->header.isDuplicate = FALSE;
      break;
    default:
      EXIT_TRACE("unknown chunk type\n");
  }

  return (int)len;
}
/* Helper function which uncompresses a data chunk
 *
 * The compression scheme is selected by conf->compress_type. The result is
 * placed in a newly created chunk->uncompressed_data buffer; afterwards the
 * chunk->compressed_data buffer is released.
 *
 * chunk  chunk holding the compressed data, must not be NULL and must not be
 *        marked as duplicate
 *
 * Returns the size of the uncompressed data. Failures are reported through
 * EXIT_TRACE.
 */
static int uncompress_chunk(chunk_t *chunk) {
  int r;

  assert(chunk!=NULL);
  assert(!chunk->header.isDuplicate);

  //uncompress the item
  switch (conf->compress_type) {
    case COMPRESS_NONE: {
      //Simply duplicate the data
      r = mbuffer_create(&chunk->uncompressed_data, chunk->compressed_data.n);
      if(r != 0) {
        EXIT_TRACE("Creation of decompression buffer failed.\n");
      }
      //copy the block
      memcpy(chunk->uncompressed_data.ptr, chunk->compressed_data.ptr, chunk->compressed_data.n);
      break;
    }
#ifdef ENABLE_GZIP_COMPRESSION
    case COMPRESS_GZIP: {
      //len_64 passes the capacity of the output buffer to uncompress() and
      //receives the actual size of the uncompressed data
      unsigned long len_64 = UNCOMPRESS_BOUND;
      r = mbuffer_create(&chunk->uncompressed_data, len_64);
      if(r != 0) {
        EXIT_TRACE("Creation of decompression buffer failed.\n");
      }
      r = uncompress(chunk->uncompressed_data.ptr, &len_64, chunk->compressed_data.ptr, chunk->compressed_data.n);
      //TODO: Automatically enlarge buffer if return value is Z_BUF_ERROR
      if(r!=Z_OK) {
        EXIT_TRACE("error uncompressing chunk data\n");
      }
      //Shrink buffer to actual size
      if(len_64 < chunk->uncompressed_data.n) {
        r = mbuffer_realloc(&chunk->uncompressed_data, len_64);
        //Checked explicitly instead of with assert so that the test survives NDEBUG builds
        if(r != 0) {
          EXIT_TRACE("Shrinking of decompression buffer failed.\n");
        }
      }
      break;
    }
#else
    case COMPRESS_GZIP: {
      EXIT_TRACE("Gzip compression used by input file not supported.\n");
      break;
    }
#endif //ENABLE_GZIP_COMPRESSION
#ifdef ENABLE_BZIP2_COMPRESSION
    case COMPRESS_BZIP2: {
      //len_32 passes the capacity of the output buffer to the decompressor and
      //receives the actual size of the uncompressed data
      unsigned int len_32 = UNCOMPRESS_BOUND;
      r = mbuffer_create(&chunk->uncompressed_data, len_32);
      if(r != 0) {
        EXIT_TRACE("Creation of decompression buffer failed.\n");
      }
      r = BZ2_bzBuffToBuffDecompress(chunk->uncompressed_data.ptr, &len_32, chunk->compressed_data.ptr, chunk->compressed_data.n, 0, 0);
      //TODO: Automatically enlarge buffer if return value is BZ_OUTBUFF_FULL
      if(r!=BZ_OK) {
        EXIT_TRACE("error uncompressing chunk data\n");
      }
      //Shrink buffer to actual size
      if(len_32 < chunk->uncompressed_data.n) {
        r = mbuffer_realloc(&chunk->uncompressed_data, len_32);
        //Checked explicitly instead of with assert so that the test survives NDEBUG builds
        if(r != 0) {
          EXIT_TRACE("Shrinking of decompression buffer failed.\n");
        }
      }
      break;
    }
#else
    case COMPRESS_BZIP2: {
      EXIT_TRACE("Bzip2 compression used by input file not supported.\n");
      break;
    }
#endif //ENABLE_BZIP2_COMPRESSION
    default: {
      EXIT_TRACE("unknown compression type\n");
      break;
    }
  }

  //The compressed representation is no longer needed
  mbuffer_free(&chunk->compressed_data);

  return chunk->uncompressed_data.n;
}
/*
 * Decodes a dedup file back into the original data.
 *
 * Chunks are read from conf->infile one at a time. A chunk that carries data
 * is uncompressed, entered into the chunk cache under its SHA1 sum and its
 * uncompressed data is written to conf->outfile. A chunk that carries only a
 * SHA1 sum is looked up in the cache and the uncompressed data of the
 * matching earlier chunk is written instead.
 *
 * _conf  configuration block; it is stored in the global 'conf' and its
 *        compress_type field is overwritten with the type read from the
 *        header of the input file
 *
 * Failures are reported through EXIT_TRACE or end the program with exit(1).
 */
void Decode(config_t * _conf) {
  int fd_in;
  int fd_out;
  chunk_t *chunk=NULL;
  int r;

  conf = _conf;

  //Create chunk cache
  cache = hashtable_create(65536, hash_from_key_fn, keys_equal_fn, FALSE);
  if(cache == NULL) {
    printf("ERROR: Out of memory\n");
    exit(1);
  }

  mbuffer_system_init();

  //Open input & output files
  fd_in = open(conf->infile, O_RDONLY|O_LARGEFILE);
  if (fd_in < 0) {
    perror("infile open");
    exit(1);
  }
  byte compress_type;
  if (read_header(fd_in, &compress_type)) {
    EXIT_TRACE("Cannot read input file header.\n");
  }
  //Ignore any compression settings given at the command line, use type used during encoding
  conf->compress_type = compress_type;

  fd_out = open(conf->outfile, O_CREAT|O_WRONLY|O_TRUNC, ~(S_ISUID | S_ISGID |S_IXGRP | S_IXUSR | S_IXOTH));
  if (fd_out < 0) {
    perror("outfile open");
    close(fd_in);
    exit(1);
  }

#ifdef ENABLE_PARSEC_HOOKS
  __parsec_roi_begin();
#endif

  while(TRUE) {
    //chunks are 'consumed' if they are added to the hash table
    //only duplicate chunks can get reused, malloc a new one otherwise
    if(chunk==NULL) {
      chunk = (chunk_t *)malloc(sizeof(chunk_t));
      if(chunk==NULL) {
        EXIT_TRACE("Memory allocation failed.\n");
      }
    }

    //get input data; a return value of 0 signals the end of the input
    r=read_chunk(fd_in, chunk);
    if(r<0) {
      EXIT_TRACE("error reading from input file");
    } else if(r==0) {
      break;
    }

    //process input data & assign chunk with corresponding uncompressed data to 'entry' variable
    chunk_t *entry;
    if(!chunk->header.isDuplicate) {
      //We got the compressed data, use it to get original data back
      r=uncompress_chunk(chunk);
      if(r<=0) {
        EXIT_TRACE("error uncompressing data");
      }

      //Compute SHA1 sum and add new chunk with uncompressed data to cache
      SHA1_Digest(chunk->uncompressed_data.ptr, chunk->uncompressed_data.n, (unsigned char *)(chunk->sha1));
      if(hashtable_insert(cache, (void *)(chunk->sha1), (void *)chunk) == 0) {
        EXIT_TRACE("hashtable_insert failed");
      }
      //The cache references the chunk from now on, a fresh one is allocated in the next iteration
      entry=chunk;
      chunk=NULL;
    } else {
      //We got a SHA1 key, use it to retrieve unique counterpart with uncompressed data
      entry=(chunk_t *)hashtable_search(cache, (void *)(chunk->sha1));
      if(entry==NULL) {
        EXIT_TRACE("Encountered a duplicate chunk in input file but not its unique counterpart. Maybe data is out of order?");
      }
    }

    //We now have the uncompressed data in 'entry', write uncompressed data to output file
    //Anything but a complete write is a failure. Testing with != rather than < also
    //catches a negative error return in case the buffer length is of an unsigned type,
    //where the conversion of the negative value would make a '<' test pass silently.
    if(xwrite(fd_out, entry->uncompressed_data.ptr, entry->uncompressed_data.n) != entry->uncompressed_data.n) {
      EXIT_TRACE("error writing to output file");
    }
  }

#ifdef ENABLE_PARSEC_HOOKS
  __parsec_roi_end();
#endif

  close(fd_in);
  close(fd_out);

  //Non-NULL only if the last chunk handled was a duplicate or no chunk was read at all
  free(chunk);

  mbuffer_system_destroy();

  //NOTE: Would have to iterate through hashtable and manually free all buffers. Calling
  // hashtable_destroy will cause those buffers to be reported as leaked memory.
  //hashtable_destroy(cache, TRUE);
}