[2] | 1 | """Functions that read and write gzipped files.
|
---|
| 2 |
|
---|
| 3 | The user of the file doesn't have to worry about the compression,
|
---|
| 4 | but random access is not allowed."""
|
---|
| 5 |
|
---|
| 6 | # based on Andrew Kuchling's minigzip.py distributed with the zlib module
|
---|
| 7 |
|
---|
import struct, sys, time, os
import zlib
import io
import __builtin__

__all__ = ["GzipFile","open"]

FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16

READ, WRITE = 1, 2

def write32u(output, value):
    # The L format writes the bit pattern correctly whether signed
    # or unsigned.
    output.write(struct.pack("<L", value))

def read32(input):
    return struct.unpack("<I", input.read(4))[0]

def open(filename, mode="rb", compresslevel=9):
    """Shorthand for GzipFile(filename, mode, compresslevel).

    The filename argument is required; mode defaults to 'rb'
    and compresslevel defaults to 9.

    """
    return GzipFile(filename, mode, compresslevel)

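# A minimal usage sketch for the shorthand above (illustrative only; the
# file name "example.txt.gz" is hypothetical, not part of the module):
#
#   import gzip
#
#   f = gzip.open("example.txt.gz", "wb")
#   f.write("Hello, compressed world!\n")
#   f.close()
#
#   f = gzip.open("example.txt.gz", "rb")
#   print f.read()        # -> Hello, compressed world!
#   f.close()
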
class GzipFile(io.BufferedIOBase):
    """The GzipFile class simulates most of the methods of a file object with
    the exception of the readinto() and truncate() methods.

    """

    myfileobj = None
    max_read_chunk = 10 * 1024 * 1024   # 10Mb

    def __init__(self, filename=None, mode=None,
                 compresslevel=9, fileobj=None, mtime=None):
        """Constructor for the GzipFile class.

        At least one of fileobj and filename must be given a
        non-trivial value.

        The new class instance is based on fileobj, which can be a regular
        file, a StringIO object, or any other object which simulates a file.
        It defaults to None, in which case filename is opened to provide
        a file object.

        When fileobj is not None, the filename argument is only used to be
        included in the gzip file header, which may include the original
        filename of the uncompressed file. It defaults to the filename of
        fileobj, if discernible; otherwise, it defaults to the empty string,
        and in this case the original filename is not included in the header.

        The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', or 'wb',
        depending on whether the file will be read or written. The default
        is the mode of fileobj if discernible; otherwise, the default is 'rb'.
        Be aware that only the 'rb', 'ab', and 'wb' values should be used
        for cross-platform portability.

        The compresslevel argument is an integer from 0 to 9 controlling the
        level of compression; 1 is fastest and produces the least compression,
        and 9 is slowest and produces the most compression. 0 is no compression
        at all. The default is 9.

        The mtime argument is an optional numeric timestamp to be written
        to the stream when compressing. All gzip compressed streams
        are required to contain a timestamp. If omitted or None, the
        current time is used. This module ignores the timestamp when
        decompressing; however, some programs, such as gunzip, make use
        of it. The format of the timestamp is the same as that of the
        return value of time.time() and of the st_mtime member of the
        object returned by os.stat().

        """

        # Make sure we don't inadvertently enable universal newlines on the
        # underlying file object - in read mode, this causes data corruption.
        if mode:
            mode = mode.replace('U', '')
        # guarantee the file is opened in binary mode on platforms
        # that care about that sort of thing
        if mode and 'b' not in mode:
            mode += 'b'
        if fileobj is None:
            fileobj = self.myfileobj = __builtin__.open(filename, mode or 'rb')
        if filename is None:
            # Issue #13781: os.fdopen() creates a fileobj with a bogus name
            # attribute. Avoid saving this in the gzip header's filename field.
            if hasattr(fileobj, 'name') and fileobj.name != '<fdopen>':
                filename = fileobj.name
            else:
                filename = ''
        if mode is None:
            if hasattr(fileobj, 'mode'): mode = fileobj.mode
            else: mode = 'rb'

        if mode[0:1] == 'r':
            self.mode = READ
            # Set flag indicating start of a new member
            self._new_member = True
            # Buffer data read from gzip file. extrastart is offset in
            # stream where buffer starts. extrasize is number of
            # bytes remaining in buffer from current stream position.
            self.extrabuf = ""
            self.extrasize = 0
            self.extrastart = 0
            self.name = filename
            # Starts small, scales exponentially
            self.min_readsize = 100

        elif mode[0:1] == 'w' or mode[0:1] == 'a':
            self.mode = WRITE
            self._init_write(filename)
            self.compress = zlib.compressobj(compresslevel,
                                             zlib.DEFLATED,
                                             -zlib.MAX_WBITS,
                                             zlib.DEF_MEM_LEVEL,
                                             0)
        else:
            raise IOError, "Mode " + mode + " not supported"

        self.fileobj = fileobj
        self.offset = 0
        self.mtime = mtime

        if self.mode == WRITE:
            self._write_gzip_header()

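    # A small sketch of the fileobj path described in the constructor
    # docstring (illustrative only, not part of the module): compressing to
    # an in-memory StringIO buffer instead of a real file.
    #
    #   import gzip, StringIO
    #
    #   buf = StringIO.StringIO()
    #   f = gzip.GzipFile(filename="payload.txt", mode="wb", fileobj=buf)
    #   f.write("some data to compress")
    #   f.close()                      # writes the CRC/size trailer into buf
    #   compressed = buf.getvalue()    # gzip-formatted bytes, never on disk
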
    @property
    def filename(self):
        import warnings
        warnings.warn("use the name attribute", DeprecationWarning, 2)
        if self.mode == WRITE and self.name[-3:] != ".gz":
            return self.name + ".gz"
        return self.name

    def __repr__(self):
        s = repr(self.fileobj)
        return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'

    def _check_closed(self):
        """Raises a ValueError if the underlying file object has been closed.

        """
        if self.closed:
            raise ValueError('I/O operation on closed file.')

    def _init_write(self, filename):
        self.name = filename
        self.crc = zlib.crc32("") & 0xffffffffL
        self.size = 0
        self.writebuf = []
        self.bufsize = 0

    def _write_gzip_header(self):
        self.fileobj.write('\037\213')             # magic header
        self.fileobj.write('\010')                 # compression method
        fname = os.path.basename(self.name)
        if fname.endswith(".gz"):
            fname = fname[:-3]
        flags = 0
        if fname:
            flags = FNAME
        self.fileobj.write(chr(flags))
        mtime = self.mtime
        if mtime is None:
            mtime = time.time()
        write32u(self.fileobj, long(mtime))
        self.fileobj.write('\002')                 # extra flags (XFL): maximum compression
        self.fileobj.write('\377')                 # OS byte: 255 = unknown
        if fname:
            self.fileobj.write(fname + '\000')

    def _init_read(self):
        self.crc = zlib.crc32("") & 0xffffffffL
        self.size = 0

    def _read_gzip_header(self):
        magic = self.fileobj.read(2)
        if magic != '\037\213':
            raise IOError, 'Not a gzipped file'
        method = ord( self.fileobj.read(1) )
        if method != 8:
            raise IOError, 'Unknown compression method'
        flag = ord( self.fileobj.read(1) )
        self.mtime = read32(self.fileobj)
        # extraflag = self.fileobj.read(1)
        # os = self.fileobj.read(1)
        self.fileobj.read(2)

        if flag & FEXTRA:
            # Read & discard the extra field, if present
            xlen = ord(self.fileobj.read(1))
            xlen = xlen + 256*ord(self.fileobj.read(1))
            self.fileobj.read(xlen)
        if flag & FNAME:
            # Read and discard a null-terminated string containing the filename
            while True:
                s = self.fileobj.read(1)
                if not s or s=='\000':
                    break
        if flag & FCOMMENT:
            # Read and discard a null-terminated string containing a comment
            while True:
                s = self.fileobj.read(1)
                if not s or s=='\000':
                    break
        if flag & FHCRC:
            self.fileobj.read(2)     # Read & discard the 16-bit header CRC

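    # Illustrative sketch (not part of the module): the fixed ten header bytes
    # that _write_gzip_header() emits and _read_gzip_header() parses can be
    # unpacked directly with struct ("example.gz" is a hypothetical file):
    #
    #   import struct
    #
    #   raw = open("example.gz", "rb").read(10)
    #   magic, method, flags, mtime, xfl, osbyte = struct.unpack("<HBBIBB", raw)
    #   # magic == 0x8b1f, method == 8 (deflate); flags is a combination of
    #   # FTEXT/FHCRC/FEXTRA/FNAME/FCOMMENT; mtime is a Unix timestamp.
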
    def write(self,data):
        self._check_closed()
        if self.mode != WRITE:
            import errno
            raise IOError(errno.EBADF, "write() on read-only GzipFile object")

        if self.fileobj is None:
            raise ValueError, "write() on closed GzipFile object"

        # Convert data type if called by io.BufferedWriter.
        if isinstance(data, memoryview):
            data = data.tobytes()

        if len(data) > 0:
            self.size = self.size + len(data)
            self.crc = zlib.crc32(data, self.crc) & 0xffffffffL
            self.fileobj.write( self.compress.compress(data) )
            self.offset += len(data)

        return len(data)

    def read(self, size=-1):
        self._check_closed()
        if self.mode != READ:
            import errno
            raise IOError(errno.EBADF, "read() on write-only GzipFile object")

        if self.extrasize <= 0 and self.fileobj is None:
            return ''

        readsize = 1024
        if size < 0:        # get the whole thing
            try:
                while True:
                    self._read(readsize)
                    readsize = min(self.max_read_chunk, readsize * 2)
            except EOFError:
                size = self.extrasize
        else:               # just get some more of it
            try:
                while size > self.extrasize:
                    self._read(readsize)
                    readsize = min(self.max_read_chunk, readsize * 2)
            except EOFError:
                if size > self.extrasize:
                    size = self.extrasize

        offset = self.offset - self.extrastart
        chunk = self.extrabuf[offset: offset + size]
        self.extrasize = self.extrasize - size

        self.offset += size
        return chunk

    def _unread(self, buf):
        self.extrasize = len(buf) + self.extrasize
        self.offset -= len(buf)

    def _read(self, size=1024):
        if self.fileobj is None:
            raise EOFError, "Reached EOF"

        if self._new_member:
            # If the _new_member flag is set, we have to
            # jump to the next member, if there is one.
            #
            # First, check if we're at the end of the file;
            # if so, it's time to stop; no more members to read.
            pos = self.fileobj.tell()   # Save current position
            self.fileobj.seek(0, 2)     # Seek to end of file
            if pos == self.fileobj.tell():
                raise EOFError, "Reached EOF"
            else:
                self.fileobj.seek( pos ) # Return to original position

            self._init_read()
            self._read_gzip_header()
            self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
            self._new_member = False

        # Read a chunk of data from the file
        buf = self.fileobj.read(size)

        # If the EOF has been reached, flush the decompression object
        # and mark this object as finished.

        if buf == "":
            uncompress = self.decompress.flush()
            self._read_eof()
            self._add_read_data( uncompress )
            raise EOFError, 'Reached EOF'

        uncompress = self.decompress.decompress(buf)
        self._add_read_data( uncompress )

        if self.decompress.unused_data != "":
            # Ending case: we've come to the end of a member in the file,
            # so seek back to the start of the unused data, finish up
            # this member, and read a new gzip header.
            # (The number of bytes to seek back is the length of the unused
            # data, minus 8 because _read_eof() will rewind a further 8 bytes)
            self.fileobj.seek( -len(self.decompress.unused_data)+8, 1)

            # Check the CRC and file size, and set the flag so we read
            # a new member on the next call
            self._read_eof()
            self._new_member = True

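    # Sketch of the multi-member behaviour handled by _read() above
    # (illustrative only; "multi.gz" is a hypothetical file name).  Two gzip
    # members written back to back read out as one continuous stream:
    #
    #   import gzip
    #
    #   out = open("multi.gz", "wb")
    #   for text in ("first member\n", "second member\n"):
    #       member = gzip.GzipFile(filename="", mode="wb", fileobj=out)
    #       member.write(text)
    #       member.close()             # finishes this member; leaves out open
    #   out.close()
    #
    #   print gzip.open("multi.gz", "rb").read()
    #   # -> first member
    #   #    second member
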
    def _add_read_data(self, data):
        self.crc = zlib.crc32(data, self.crc) & 0xffffffffL
        offset = self.offset - self.extrastart
        self.extrabuf = self.extrabuf[offset:] + data
        self.extrasize = self.extrasize + len(data)
        self.extrastart = self.offset
        self.size = self.size + len(data)

    def _read_eof(self):
        # We've read to the end of the file, so we have to rewind in order
        # to reread the 8 bytes containing the CRC and the file size.
        # We check that the computed CRC and size of the
        # uncompressed data match the stored values.  Note that the size
        # stored is the true file size mod 2**32.
        self.fileobj.seek(-8, 1)
        crc32 = read32(self.fileobj)
        isize = read32(self.fileobj)  # may exceed 2GB
        if crc32 != self.crc:
            raise IOError("CRC check failed %s != %s" % (hex(crc32),
                                                         hex(self.crc)))
        elif isize != (self.size & 0xffffffffL):
            raise IOError, "Incorrect length of data produced"

        # Gzip files can be padded with zeroes and still be valid archives.
        # Consume all zero bytes and set the file position to the first
        # non-zero byte. See http://www.gzip.org/#faq8
        c = "\x00"
        while c == "\x00":
            c = self.fileobj.read(1)
        if c:
            self.fileobj.seek(-1, 1)

    @property
    def closed(self):
        return self.fileobj is None

    def close(self):
        if self.fileobj is None:
            return
        if self.mode == WRITE:
            self.fileobj.write(self.compress.flush())
            write32u(self.fileobj, self.crc)
            # self.size may exceed 2GB, or even 4GB
            write32u(self.fileobj, self.size & 0xffffffffL)
            self.fileobj = None
        elif self.mode == READ:
            self.fileobj = None
        if self.myfileobj:
            self.myfileobj.close()
            self.myfileobj = None

    def flush(self,zlib_mode=zlib.Z_SYNC_FLUSH):
        self._check_closed()
        if self.mode == WRITE:
            # Ensure the compressor's buffer is flushed
            self.fileobj.write(self.compress.flush(zlib_mode))
            self.fileobj.flush()

    def fileno(self):
        """Invoke the underlying file object's fileno() method.

        This will raise AttributeError if the underlying file object
        doesn't support fileno().
        """
        return self.fileobj.fileno()

    def rewind(self):
        '''Return the uncompressed stream file position indicator to the
        beginning of the file'''
        if self.mode != READ:
            raise IOError("Can't rewind in write mode")
        self.fileobj.seek(0)
        self._new_member = True
        self.extrabuf = ""
        self.extrasize = 0
        self.extrastart = 0
        self.offset = 0

    def readable(self):
        return self.mode == READ

    def writable(self):
        return self.mode == WRITE

    def seekable(self):
        return True

    def seek(self, offset, whence=0):
        if whence:
            if whence == 1:
                offset = self.offset + offset
            else:
                raise ValueError('Seek from end not supported')
        if self.mode == WRITE:
            if offset < self.offset:
                raise IOError('Negative seek in write mode')
            count = offset - self.offset
            for i in xrange(count // 1024):
                self.write(1024 * '\0')
            self.write((count % 1024) * '\0')
        elif self.mode == READ:
            if offset < self.offset:
                # for negative seek, rewind and do positive seek
                self.rewind()
            count = offset - self.offset
            for i in xrange(count // 1024):
                self.read(1024)
            self.read(count % 1024)

        return self.offset

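    # Sketch of the seek() semantics implemented above (illustrative; "big.gz"
    # is a hypothetical file).  Seeks are emulated rather than random access:
    # in read mode a forward seek decompresses and discards data and a
    # backward seek rewinds to the start first; in write mode only forward
    # seeks are allowed and the gap is filled with zero bytes.
    #
    #   import gzip
    #
    #   f = gzip.open("big.gz", "rb")
    #   f.seek(1000)      # reads and discards the first 1000 decompressed bytes
    #   f.seek(10)        # rewinds to offset 0, then reads 10 bytes again
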
    def readline(self, size=-1):
        if size < 0:
            # Shortcut common case - newline found in buffer.
            offset = self.offset - self.extrastart
            i = self.extrabuf.find('\n', offset) + 1
            if i > 0:
                self.extrasize -= i - offset
                self.offset += i - offset
                return self.extrabuf[offset: i]

            size = sys.maxint
            readsize = self.min_readsize
        else:
            readsize = size
        bufs = []
        while size != 0:
            c = self.read(readsize)
            i = c.find('\n')

            # We set i=size to break out of the loop under two
            # conditions: 1) there's no newline, and the chunk is
            # larger than size, or 2) there is a newline, but the
            # resulting line would be longer than 'size'.
            if (size <= i) or (i == -1 and len(c) > size):
                i = size - 1

            if i >= 0 or c == '':
                bufs.append(c[:i + 1])    # Add portion of last chunk
                self._unread(c[i + 1:])   # Push back rest of chunk
                break

            # Append chunk to list, decrease 'size',
            bufs.append(c)
            size = size - len(c)
            readsize = min(size, readsize * 2)
        if readsize > self.min_readsize:
            self.min_readsize = min(readsize, self.min_readsize * 2, 512)
        return ''.join(bufs) # Return resulting line


def _test():
    # Act like gzip; with -d, act like gunzip.
    # The input file is not deleted, however, nor are any other gzip
    # options or features supported.
    args = sys.argv[1:]
    decompress = args and args[0] == "-d"
    if decompress:
        args = args[1:]
    if not args:
        args = ["-"]
    for arg in args:
        if decompress:
            if arg == "-":
                f = GzipFile(filename="", mode="rb", fileobj=sys.stdin)
                g = sys.stdout
            else:
                if arg[-3:] != ".gz":
                    print "filename doesn't end in .gz:", repr(arg)
                    continue
                f = open(arg, "rb")
                g = __builtin__.open(arg[:-3], "wb")
        else:
            if arg == "-":
                f = sys.stdin
                g = GzipFile(filename="", mode="wb", fileobj=sys.stdout)
            else:
                f = __builtin__.open(arg, "rb")
                g = open(arg + ".gz", "wb")
        while True:
            chunk = f.read(1024)
            if not chunk:
                break
            g.write(chunk)
        if g is not sys.stdout:
            g.close()
        if f is not sys.stdin:
            f.close()

if __name__ == '__main__':
    _test()