7z as a much better archiver than gz/bz2

Andreas Stolcke stolcke at speech.sri.com
Sun Nov 11 08:27:29 PST 2007


In message <244d59a50711101259v60acb8e8tf3743520f2d92aa6 at mail.gmail.com> you wrote:
> Alexy,
>  I strongly believe this mailing list is not suitable for spams like
> this. If you want to present to public which compression utilities you
> use or how much of cpu time your particular computation took, please,
> use your personal blog or something like that. I'm pretty much sure
> that vast majority of people in this list is not interested in
> receiving messages of this kind.
> 
> Best,
>  Kamadev
> 
> On Nov 10, 2007 7:05 PM, Alexy Khrabrov <deliverable at gmail.com> wrote:
> > Greetings -- I've switched to 7z for most of corpora compression, as
> > it gives results which are whole number of times better than gz, and
> > 1.1-1.5 better than bz2.  Would be nice to see it used more,
> > especially for the huge kind of things we do here.  E.g., a 4.0 GB lm
> > file was compressed by 7za (a command line version for linux) to 642
> > MB.  7za is multi-core CPU aware and knows all about locales and
> > encodings as well.
> >
> > http://www.7-zip.org/
> >
> > Cheers,
> > Alexy
> >

I actually think that Alexy's message is relevant to this list, since 
managing large LMs is a nontrivial problem.

I had not heard of 7-zip before, and took a look.  It does seem to produce
slightly smaller files than bzip2, so it is definitely of interest for 
LM compression.  One drawback is longer compression times (the software even
uses multithreading on multi-cpu machines to speed that up).
But, in any case, it was easy enough to add reading/writing of 7z files
to the relevant library code.  You simply have to replace the attached two
files in $SRILM/misc/src.  BTW, I tested this with the Unix port of 7z found at
http://p7zip.sourceforge.net/ .  I have NOT tested it on Windows using the 
original 7-zip software.

Also, BTW, if you are concerned with LM reading/writing speed (and decent
"compression" compared to text format), I would recommend the binary LM format.

Andreas 

-------------- next part --------------
/*
    File:   zio.h
    Author: Andreas Stolcke
    Date:   Wed Feb 15 15:19:44 PST 1995
   
    Description:

    Copyright (c) 1994-2007, SRI International.  All Rights Reserved.

    RCS ID: $Id: zio.h,v 1.13 2007/11/11 16:06:53 stolcke Exp $
*/

/*
 *  $Log: zio.h,v $
 *  Revision 1.13  2007/11/11 16:06:53  stolcke
 *  7zip compression support
 *
 *  Revision 1.12  2006/08/04 23:59:09  stolcke
 *  MSVC portability
 *
 *  Revision 1.11  2006/03/28 01:15:10  stolcke
 *  include sys/signal.h to check for SIGPIPE
 *
 *  Revision 1.10  2006/03/06 05:46:43  stolcke
 *  define NO_ZIO in zio.h instead of zio.c
 *
 *  Revision 1.9  2006/03/01 00:45:45  stolcke
 *  allow disabling of zio for windows environment (NO_ZIO)
 *
 *  Revision 1.8  2005/12/16 23:30:09  stolcke
 *  added support for bzip2-compressed files
 *
 *  Revision 1.7  2003/02/21 20:18:53  stolcke
 *  avoid conflict if zopen is already defined in library
 *
 *  Revision 1.6  1999/10/13 09:07:13  stolcke
 *  make filename checking functions public
 *
 *  Revision 1.5  1995/06/22 19:58:26  stolcke
 *  ansi-fied
 *
 *  Revision 1.4  1995/06/12 22:56:37  tmk
 *  Added ifdef around the redefinitions of fopen() and fclose().
 *
 */

/*******************************************************************
   Copyright 1994 SRI International.  All rights reserved.
   This is an unpublished work of SRI International and is not to be
   used or disclosed except as provided in a license agreement or
   nondisclosure agreement with SRI International.
 ********************************************************************/


#ifndef _ZIO_H
#define _ZIO_H

#ifdef __cplusplus
extern "C" {
#endif

/* Include declarations files. */

#include <stdio.h>
#include <signal.h>		// to check for SIGPIPE

/* Avoid conflict with library function */
#ifdef HAVE_ZOPEN
#define zopen my_zopen
#endif

/* Constants */

/*
 * Compressed I/O is disabled (NO_ZIO) whenever <signal.h> does not
 * provide SIGPIPE.  NO_ZIO may also be defined by the build to disable
 * it explicitly.
 */
#if !defined(SIGPIPE)
#define NO_ZIO
#endif

/*
 * Filename suffixes that select a compression format.
 * With NO_ZIO every suffix is the empty string; the *_filename_p()
 * tests in zio.c require a non-empty suffix, so no filename is ever
 * treated as compressed in that configuration.
 */
#ifdef NO_ZIO
# define COMPRESS_SUFFIX  ""
# define GZIP_SUFFIX	  ""
# define OLD_GZIP_SUFFIX  ""
# define BZIP2_SUFFIX	  ""
# define SEVENZIP_SUFFIX  ""
#else
# define COMPRESS_SUFFIX  ".Z"
# define GZIP_SUFFIX	  ".gz"
# define OLD_GZIP_SUFFIX  ".z"
# define BZIP2_SUFFIX	  ".bz2"
# define SEVENZIP_SUFFIX  ".7z"
#endif /* NO_ZIO */

/* Define function prototypes. */

/* Nonzero iff NAME is exactly "-", the name standing for stdin/stdout. */
int	stdio_filename_p (const char *name);
/* Nonzero iff NAME is longer than COMPRESS_SUFFIX and ends with it. */
int	compressed_filename_p (const char *name);
/* Nonzero iff NAME is longer than, and ends with, GZIP_SUFFIX or OLD_GZIP_SUFFIX. */
int 	gzipped_filename_p (const char *name);
/* Nonzero iff NAME is longer than BZIP2_SUFFIX and ends with it. */
int 	bzipped_filename_p (const char *name);
/* Nonzero iff NAME is longer than SEVENZIP_SUFFIX and ends with it. */
int 	sevenzipped_filename_p (const char *name);

/*
 * zopen: open NAME with a stdio-style MODE.  "-" yields a stream on a
 *	duplicate of stdin or stdout; a name with one of the compression
 *	suffixes is opened as a pipe to an external (de)compression
 *	command; any other name is passed to fopen().  Returns NULL on
 *	failure.
 * zclose: close a stream obtained from zopen(), using pclose() for pipe
 *	streams and fclose() otherwise.
 */
FILE *	zopen (const char *name, const char *mode);
int	zclose (FILE *stream);

/*
 * If ZIO_HACK is defined before this header is included, every later
 * use of fopen()/fclose() in the including file is redirected to
 * zopen()/zclose().  Without ZIO_HACK the stdio names are left alone.
 */

#ifdef ZIO_HACK
#define fopen(name,mode)	zopen(name,mode)
#define fclose(stream)		zclose(stream)
#endif

#ifdef __cplusplus
}
#endif

#endif /* _ZIO_H */

-------------- next part --------------
/*
    File:   zio.c
    Author: Andreas Stolcke
    Date:   Wed Feb 15 15:19:44 PST 1995
   
    Description:
                 Compressed file stdio extension
*/

#ifndef lint
static char Copyright[] = "Copyright (c) 1995-2007 SRI International.  All Rights Reserved.";
static char RcsId[] = "@(#)$Header: /home/srilm/devel/misc/src/RCS/zio.c,v 1.25 2007/11/11 16:06:53 stolcke Exp $";
#endif

/*
 * $Log: zio.c,v $
 * Revision 1.25  2007/11/11 16:06:53  stolcke
 * 7zip compression support
 *
 * Revision 1.24  2006/03/06 05:46:43  stolcke
 * define NO_ZIO in zio.h instead of zio.c
 *
 * Revision 1.23  2006/03/01 00:45:45  stolcke
 * allow disabling of zio for windows environment (NO_ZIO)
 *
 * Revision 1.22  2006/01/09 17:39:03  stolcke
 * MSVC port
 *
 * Revision 1.21  2006/01/05 19:32:42  stolcke
 * ms visual c portability
 *
 * Revision 1.20  2005/12/16 23:30:09  stolcke
 * added support for bzip2-compressed files
 *
 * Revision 1.19  2005/07/28 21:08:15  stolcke
 * include signal.h for portability
 *
 * Revision 1.18  2005/07/28 18:37:47  stolcke
 * portability for systems w/o pipes
 *
 * Revision 1.17  2004/01/31 01:17:51  stolcke
 * don't declare errno, get it from errno.h
 *
 * Revision 1.16  2003/11/09 21:09:11  stolcke
 * use gunzip -f to allow uncompressed files ending in .gz
 *
 * Revision 1.15  2003/11/01 06:18:30  stolcke
 * issue stdin/stdout warning only once
 *
 * Revision 1.14  1999/10/13 09:07:13  stolcke
 * make filename checking functions public
 *
 * Revision 1.13  1997/06/07 15:58:47  stolcke
 * fixed some gcc warnings
 *
 * Revision 1.13  1997/06/07 15:56:24  stolcke
 * fixed some gcc warnings
 *
 * Revision 1.12  1997/01/23 20:38:35  stolcke
 * *** empty log message ***
 *
 * Revision 1.11  1997/01/23 20:02:59  stolcke
 * handle SIGPIPE termination
 *
 * Revision 1.10  1997/01/22 07:52:08  stolcke
 * warn about multiple uses of -
 *
 * Revision 1.9  1996/11/30 21:08:59  stolcke
 * use exec in compress commands
 *
 * Revision 1.8  1995/07/19 16:51:31  stolcke
 * remove PATH assignment to account for local setup
 *
 * Revision 1.7  1995/06/22 20:47:16  stolcke
 * dup stdio descriptors so fclose won't disturb them
 *
 * Revision 1.6  1995/06/22 20:44:39  stolcke
 * return more error info
 *
 * Revision 1.5  1995/06/22 19:58:11  stolcke
 * ansi-fied
 *
 * Revision 1.4  1995/06/12 22:57:12  tmk
 * Added ifdef around the redefinitions of fopen() and fclose().
 *
 */

/*******************************************************************
   Copyright 1994,1997 SRI International.  All rights reserved.
   This is an unpublished work of SRI International and is not to be
   used or disclosed except as provided in a license agreement or
   nondisclosure agreement with SRI International.
 ********************************************************************/

#include <stdio.h>
#include <string.h>
#ifndef _MSC_VER
#include <unistd.h>
#include <sys/param.h>
#endif
#include <fcntl.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <signal.h>
#include <errno.h>

#ifndef MAXPATHLEN
#define MAXPATHLEN 1024
#endif

#include "zio.h"

#ifdef ZIO_HACK
#undef fopen
#undef fclose
#endif

#define STDIO_NAME	  "-"

#define STD_PATH    ":"   /* "PATH=/usr/bin:/usr/ucb:/usr/bsd:/usr/local/bin" */

#define COMPRESS_CMD	  "exec compress -c"
#define UNCOMPRESS_CMD	  "exec uncompress -c"

#define GZIP_CMD	  "exec gzip -c"
#define GUNZIP_CMD	  "exec gunzip -cf"

#define BZIP2_CMD	  "exec bzip2"
#define BUNZIP2_CMD	  "exec bunzip2 -c"

#define SEVENZIP_CMD	  "exec 7z a -si"
#define SEVENUNZIP_CMD	  "exec 7z x -so"

/*
 * Test whether NAME is the special file name that stands for
 * stdin/stdout.  Returns 1 on an exact match with STDIO_NAME, else 0.
 */
int
stdio_filename_p (const char *name)
{
    if (strcmp(name, STDIO_NAME) != 0) {
	return 0;
    }
    return 1;
}

/*
 * Test whether NAME denotes a compress(1)-style file, i.e., whether it
 * ends in COMPRESS_SUFFIX and has at least one character before it.
 * Always 0 when the suffix is configured as the empty string.
 */
int
compressed_filename_p (const char *name)
{
    const unsigned suffix_len = sizeof(COMPRESS_SUFFIX) - 1;
    unsigned name_len = strlen(name);

    if (suffix_len == 0) {
	return 0;
    }
    if (name_len <= suffix_len) {
	return 0;
    }
    return strcmp(name + (name_len - suffix_len), COMPRESS_SUFFIX) == 0;
}

/*
 * Test whether NAME denotes a gzip file: it must end in GZIP_SUFFIX or
 * in OLD_GZIP_SUFFIX, with at least one character before the suffix.
 * A suffix configured as the empty string never matches.
 */
int
gzipped_filename_p (const char *name)
{
    const unsigned new_len = sizeof(GZIP_SUFFIX) - 1;
    const unsigned old_len = sizeof(OLD_GZIP_SUFFIX) - 1;
    unsigned name_len = strlen(name);

    /* current suffix first ... */
    if (new_len > 0 && name_len > new_len &&
	strcmp(name + (name_len - new_len), GZIP_SUFFIX) == 0)
    {
	return 1;
    }

    /* ... then the historical one */
    if (old_len > 0 && name_len > old_len &&
	strcmp(name + (name_len - old_len), OLD_GZIP_SUFFIX) == 0)
    {
	return 1;
    }

    return 0;
}

/*
 * Test whether NAME denotes a bzip2 file, i.e., whether it ends in
 * BZIP2_SUFFIX and has at least one character before it.
 * Always 0 when the suffix is configured as the empty string.
 */
int
bzipped_filename_p (const char *name)
{
    const unsigned suffix_len = sizeof(BZIP2_SUFFIX) - 1;
    unsigned name_len = strlen(name);
    const char *tail;

    if (suffix_len == 0 || name_len <= suffix_len) {
	return 0;
    }

    tail = name + (name_len - suffix_len);
    return strcmp(tail, BZIP2_SUFFIX) == 0;
}

/*
 * Test whether NAME denotes a 7-zip archive, i.e., whether it ends in
 * SEVENZIP_SUFFIX and has at least one character before it.
 * Always 0 when the suffix is configured as the empty string.
 */
int
sevenzipped_filename_p (const char *name)
{
    const unsigned suffix_len = sizeof(SEVENZIP_SUFFIX) - 1;
    unsigned name_len = strlen(name);
    int matches = 0;

    if (suffix_len > 0 && name_len > suffix_len) {
	matches = (strcmp(name + (name_len - suffix_len),
			  SEVENZIP_SUFFIX) == 0);
    }
    return matches;
}

/*
 * Probe whether NAME can be opened for reading.
 * Returns 1 if open(O_RDONLY) succeeds, 0 otherwise; the probe
 * descriptor is closed again before returning.
 */
static int
readable_p (const char *name)
{
    int descriptor = open(name, O_RDONLY);

    if (descriptor >= 0) {
	close(descriptor);
	return 1;
    }
    return 0;
}

/*
 * Probe whether NAME can be opened for writing.
 * Returns 1 if open(O_WRONLY|O_CREAT) succeeds, 0 otherwise.
 * Note that a successful probe creates NAME (mode 0666, subject to the
 * umask) if it did not exist yet.
 */
static int
writable_p (const char *name)
{
    int descriptor = open(name, O_WRONLY|O_CREAT, 0666);

    if (descriptor >= 0) {
	close(descriptor);
	return 1;
    }
    return 0;
}

/*
 * Helper for zopen(): wrap a private duplicate of descriptor STD_FD in
 * a stdio stream, so that closing the stream leaves STD_FD itself open.
 * Returns NULL if dup() or fdopen() fails; the duplicate descriptor is
 * released in the latter case.
 */
static FILE *
zio_dup_stream (int std_fd, const char *mode)
{
    FILE *stream;
    int fd = dup(std_fd);

    if (fd < 0)
	return NULL;

    stream = fdopen(fd, mode);
    if (stream == NULL) {
	/* don't leak the duplicate; keep fdopen()'s errno for the caller */
	int saved_errno = errno;

	close(fd);
	errno = saved_errno;
    }
    return stream;
}

/*
 * Open a stdio stream, handling special filenames
 *
 *	name	file to open.  "-" selects stdin (mode r) or stdout
 *		(modes w, a).  A name carrying one of the compression
 *		suffixes is read from / written to through a pipe to the
 *		matching external command.  Any other name is handed to
 *		fopen() unchanged.
 *	mode	stdio mode string; only its first character is examined
 *		here.  Compressed files support 'r' and 'w' only.
 *
 * Returns the new stream, or NULL on failure.  Unsupported modes and
 * NO_ZIO builds fail with errno = EINVAL; a name too long for the
 * command buffer fails with errno = ENAMETOOLONG.
 * Streams returned by this function must be closed with zclose().
 */
FILE *zopen(const char *name, const char *mode)
{
    if (stdio_filename_p(name)) {
	/*
	 * Return stream to stdin or stdout
	 */
	if (*mode == 'r') {
	    static int stdin_used = 0;
	    static int stdin_warning = 0;

	    if (stdin_used) {
		/* issue the warning only once per process */
		if (!stdin_warning) {
		    fprintf(stderr,
			    "warning: '-' used multiple times for input\n");
		    stdin_warning = 1;
		}
	    } else {
		stdin_used = 1;
	    }

	    return zio_dup_stream(0, mode);
	} else if (*mode == 'w' || *mode == 'a') {
	    static int stdout_used = 0;
	    static int stdout_warning = 0;

	    if (stdout_used) {
		/* issue the warning only once per process */
		if (!stdout_warning) {
		    fprintf(stderr,
			    "warning: '-' used multiple times for output\n");
		    stdout_warning = 1;
		}
	    } else {
		stdout_used = 1;
	    }

	    return zio_dup_stream(1, mode);
	} else {
	    errno = EINVAL;
	    return NULL;
	}
    } else {
	const char *compress_cmd = NULL;
	const char *uncompress_cmd = NULL;
	int zip_to_stdout = 1;	/* compressor writes the archive to stdout */

	if (compressed_filename_p(name)) {
	    compress_cmd = COMPRESS_CMD;
	    uncompress_cmd = UNCOMPRESS_CMD;
	} else if (gzipped_filename_p(name)) {
	    compress_cmd = GZIP_CMD;
	    uncompress_cmd = GUNZIP_CMD;
	} else if (bzipped_filename_p(name)) {
	    compress_cmd = BZIP2_CMD;
	    uncompress_cmd = BUNZIP2_CMD;
	} else if (sevenzipped_filename_p(name)) {
	    compress_cmd = SEVENZIP_CMD;
	    uncompress_cmd = SEVENUNZIP_CMD;
	    zip_to_stdout = 0;
	}

	if (compress_cmd != NULL) {
#ifdef NO_ZIO
	    fprintf(stderr, "Sorry, compressed I/O not available on this machine\n");
	    errno = EINVAL;
	    return NULL;
#else /* !NO_ZIO */
	    /*
	     * Return stream to compress pipe
	     *
	     * NOTE(review): NAME is pasted into the shell command without
	     * quoting, so names containing blanks or shell metacharacters
	     * are interpreted by the shell.
	     */
	    char command[MAXPATHLEN + 100];
	    int length;

	    if (*mode == 'r') {
		if (!readable_p(name))
		    return NULL;
		length = snprintf(command, sizeof(command), "%s;%s %s",
				  STD_PATH, uncompress_cmd, name);
	    } else if (*mode == 'w') {
		if (!writable_p(name))
		    return NULL;
		if (zip_to_stdout) {
		    length = snprintf(command, sizeof(command), "%s;%s >%s",
				      STD_PATH, compress_cmd, name);
		} else {
		    length = snprintf(command, sizeof(command), "%s;%s %s",
				      STD_PATH, compress_cmd, name);
		}
	    } else {
		errno = EINVAL;
		return NULL;
	    }

	    /* refuse to run a truncated command line */
	    if (length < 0 || (size_t)length >= sizeof(command)) {
		errno = ENAMETOOLONG;
		return NULL;
	    }

	    if (*mode == 'w' && !zip_to_stdout) {
		/*
		 * This is necessary because the compression program might
		 * complain if a zero-length file already exists.
		 * However, it means that existing file owner & permission
		 * attributes are not preserved.
		 */
		unlink(name);
	    }
	    return popen(command, mode);
#endif /* !NO_ZIO */
	} else {
	    return fopen(name, mode);
	}
    }
}

/*
 * Close a stream created by zopen()
 *
 * Streams that are not pipes, and streams on descriptors 0 and 1, are
 * closed with fclose(); everything else is assumed to come from popen()
 * and is closed with pclose().
 *
 * Returns
 *	the fclose() result for non-pipe streams,
 *	-1 if the stream's descriptor cannot be examined,
 *	0 if the child exited successfully or was terminated by SIGPIPE,
 *	otherwise the non-zero status reported by pclose(), with errno
 *	set to EIO unless it already holds an error code.
 */
int
zclose(FILE *stream)
{
#ifdef NO_ZIO
     return fclose(stream);
#else /* !NO_ZIO */

    int status;
    int had_error;
    int fd;
    struct stat statb;

    /*
     * pclose(), according to the man page, should diagnose streams not 
     * created by popen() and return -1.  however, on SGIs, it core dumps
     * in that case.  So we better be careful and try to figure out
     * what type of stream it is.
     */
    fd = fileno(stream);
    if (fstat(fd, &statb) < 0)
	return -1;

    if ((statb.st_mode & S_IFMT) != S_IFIFO || fd == 0 || fd == 1) {
	return fclose(stream);
    }

    /*
     * Sample the error indicator now: once pclose() has run, the stream
     * may already have been released and must not be touched again.
     */
    had_error = ferror(stream);

    status = pclose(stream);
    if (status == -1) {
	/*
	 * stream was not created by popen() (or the child's status
	 * could not be collected); report whether the stream itself
	 * had seen an error.
	 */
	return had_error;
    } else if (status == 0) {
	/*
	 * Clean exit; leave errno alone.
	 */
	return 0;
    } else if (status == SIGPIPE) {
	/*
	 * It's normal for the uncompressor to terminate by SIGPIPE,
	 * i.e., if the user program closed the file before reaching
	 * EOF. 
	 * NOTE(review): this compares the raw wait status against the
	 * signal number rather than using WIFSIGNALED()/WTERMSIG().
	 */
	return 0;
    } else {
	/*
	 * The compressor program terminated with an error, and supposedly
	 * has printed a message to stderr.
	 * Set errno to a generic error code if it hasn't been set already.
	 */
	if (errno == 0) {
	    errno = EIO;
	}
	return status;
    }
#endif /* NO_ZIO */
}

#ifdef STAND
/*
 * Stand-alone test driver (compiled only with -DSTAND).
 *
 *	usage: prog file {r|w}
 *
 * With mode r, FILE is opened through zopen() and copied to stdout;
 * with any other mode, stdin is copied to FILE.
 * Exit status: 0 on success, 1 on an open, I/O or close error,
 * 2 on a usage error.
 */
int
main (int argc, char **argv)
{
    char buffer[BUFSIZ];
    size_t nread;
    FILE *stream;
    int failed = 0;

    if (argc < 3) {
	printf("usage: %s file {r|w}\n", argv[0]);
	return 2;
    }

    stream = zopen(argv[1], argv[2]);
    if (!stream) {
	perror(argv[1]);
	return 1;
    }

    if (*argv[2] == 'r') {
	/* FILE -> stdout */
	while (!ferror(stream) && !feof(stream) && !ferror(stdout)) {
	    nread = fread(buffer, 1, sizeof(buffer), stream);
	    (void)fwrite(buffer, 1, nread, stdout);
	}
    } else {
	/* stdin -> FILE */
	while (!ferror(stdin) && !feof(stdin) && !ferror(stream)) {
	    nread = fread(buffer, 1, sizeof(buffer), stdin);
	    (void)fwrite(buffer, 1, nread, stream);
	}
    }

    /* write errors are detected through the streams' error indicators */
    if (ferror(stdin)) {
	perror("stdin");
	failed = 1;
    } else if (ferror(stdout)) {
	perror("stdout");
	failed = 1;
    } else if (ferror(stream)) {
	perror(argv[1]);
	failed = 1;
    }

    if (zclose(stream) != 0) {
	perror(argv[1]);
	failed = 1;
    }

    return failed;
}
#endif /* STAND */


More information about the SRILM-User mailing list