Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

gzip.py@ 389

Last change on this file since 389 was 2, checked in by Yuri Dario, 15 years ago
Initial import for vendor code.
Property svn:eol-style set to `native`
File size: 16.4 KB

Line
1	"""Functions that read and write gzipped files.
2
3	The user of the file doesn't have to worry about the compression,
4	but random access is not allowed."""
5
6	# based on Andrew Kuchling's minigzip.py distributed with the zlib module
7
8	import struct, sys, time
9	import zlib
10	import __builtin__
11
12	__all__ = ["GzipFile","open"]
13
# Bit masks for the flag byte of a gzip member header; the header reader
# tests them with ``flag & FEXTRA`` etc. and the writer emits FNAME.
FTEXT = 1
FHCRC = 2
FEXTRA = 4
FNAME = 8
FCOMMENT = 16

# Values stored in GzipFile.mode to record the direction of the stream.
READ = 1
WRITE = 2
17
def write32u(output, value):
    """Write *value* to *output* as a 4-byte little-endian unsigned integer.

    Only the low 32 bits of *value* are written, so negative numbers (for
    example a signed CRC) and values of 2**32 or more are stored as their
    32-bit two's-complement / modulo-2**32 bit pattern.

    output -- object with a write() method accepting a byte string.
    value  -- integer to serialise.
    """
    # struct's "<L" format refuses out-of-range values instead of wrapping
    # them, so reduce to 32 bits explicitly before packing.
    output.write(struct.pack("<L", value & 0xffffffff))
22
def read32(input):
    """Read 4 bytes from *input* and decode them as a little-endian
    unsigned 32-bit integer.

    input -- object with a read() method returning a byte string.
    """
    raw = input.read(4)
    (value,) = struct.unpack("<I", raw)
    return value
25
def open(filename, mode="rb", compresslevel=9):
    """Open a gzip file and return the GzipFile object wrapping it.

    This is a convenience spelling of
    GzipFile(filename, mode, compresslevel).  The filename argument is
    required; mode defaults to 'rb' and compresslevel defaults to 9.

    """
    return GzipFile(filename=filename, mode=mode,
                    compresslevel=compresslevel)
34
35	class GzipFile:
36	"""The GzipFile class simulates most of the methods of a file object with
37	the exception of the readinto() and truncate() methods.
38
39	"""
40
41	myfileobj = None
42	max_read_chunk = 10 * 1024 * 1024 # 10Mb
43
44	def __init__(self, filename=None, mode=None,
45	compresslevel=9, fileobj=None):
46	"""Constructor for the GzipFile class.
47
48	At least one of fileobj and filename must be given a
49	non-trivial value.
50
51	The new class instance is based on fileobj, which can be a regular
52	file, a StringIO object, or any other object which simulates a file.
53	It defaults to None, in which case filename is opened to provide
54	a file object.
55
56	When fileobj is not None, the filename argument is only used to be
57	included in the gzip file header, which may includes the original
58	filename of the uncompressed file. It defaults to the filename of
59	fileobj, if discernible; otherwise, it defaults to the empty string,
60	and in this case the original filename is not included in the header.
61
62	The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', or 'wb',
63	depending on whether the file will be read or written. The default
64	is the mode of fileobj if discernible; otherwise, the default is 'rb'.
65	Be aware that only the 'rb', 'ab', and 'wb' values should be used
66	for cross-platform portability.
67
68	The compresslevel argument is an integer from 1 to 9 controlling the
69	level of compression; 1 is fastest and produces the least compression,
70	and 9 is slowest and produces the most compression. The default is 9.
71
72	"""
73
74	# guarantee the file is opened in binary mode on platforms
75	# that care about that sort of thing
76	if mode and 'b' not in mode:
77	mode += 'b'
78	if fileobj is None:
79	fileobj = self.myfileobj = __builtin__.open(filename, mode or 'rb')
80	if filename is None:
81	if hasattr(fileobj, 'name'): filename = fileobj.name
82	else: filename = ''
83	if mode is None:
84	if hasattr(fileobj, 'mode'): mode = fileobj.mode
85	else: mode = 'rb'
86
87	if mode[0:1] == 'r':
88	self.mode = READ
89	# Set flag indicating start of a new member
90	self._new_member = True
91	self.extrabuf = ""
92	self.extrasize = 0
93	self.name = filename
94	# Starts small, scales exponentially
95	self.min_readsize = 100
96
97	elif mode[0:1] == 'w' or mode[0:1] == 'a':
98	self.mode = WRITE
99	self._init_write(filename)
100	self.compress = zlib.compressobj(compresslevel,
101	zlib.DEFLATED,
102	-zlib.MAX_WBITS,
103	zlib.DEF_MEM_LEVEL,
104	0)
105	else:
106	raise IOError, "Mode " + mode + " not supported"
107
108	self.fileobj = fileobj
109	self.offset = 0
110
111	if self.mode == WRITE:
112	self._write_gzip_header()
113
114	@property
115	def filename(self):
116	import warnings
117	warnings.warn("use the name attribute", DeprecationWarning, 2)
118	if self.mode == WRITE and self.name[-3:] != ".gz":
119	return self.name + ".gz"
120	return self.name
121
122	def __repr__(self):
123	s = repr(self.fileobj)
124	return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'
125
126	def _init_write(self, filename):
127	self.name = filename
128	self.crc = zlib.crc32("") & 0xffffffffL
129	self.size = 0
130	self.writebuf = []
131	self.bufsize = 0
132
133	def _write_gzip_header(self):
134	self.fileobj.write('\037\213') # magic header
135	self.fileobj.write('\010') # compression method
136	fname = self.name
137	if fname.endswith(".gz"):
138	fname = fname[:-3]
139	flags = 0
140	if fname:
141	flags = FNAME
142	self.fileobj.write(chr(flags))
143	write32u(self.fileobj, long(time.time()))
144	self.fileobj.write('\002')
145	self.fileobj.write('\377')
146	if fname:
147	self.fileobj.write(fname + '\000')
148
149	def _init_read(self):
150	self.crc = zlib.crc32("") & 0xffffffffL
151	self.size = 0
152
153	def _read_gzip_header(self):
154	magic = self.fileobj.read(2)
155	if magic != '\037\213':
156	raise IOError, 'Not a gzipped file'
157	method = ord( self.fileobj.read(1) )
158	if method != 8:
159	raise IOError, 'Unknown compression method'
160	flag = ord( self.fileobj.read(1) )
161	# modtime = self.fileobj.read(4)
162	# extraflag = self.fileobj.read(1)
163	# os = self.fileobj.read(1)
164	self.fileobj.read(6)
165
166	if flag & FEXTRA:
167	# Read & discard the extra field, if present
168	xlen = ord(self.fileobj.read(1))
169	xlen = xlen + 256*ord(self.fileobj.read(1))
170	self.fileobj.read(xlen)
171	if flag & FNAME:
172	# Read and discard a null-terminated string containing the filename
173	while True:
174	s = self.fileobj.read(1)
175	if not s or s=='\000':
176	break
177	if flag & FCOMMENT:
178	# Read and discard a null-terminated string containing a comment
179	while True:
180	s = self.fileobj.read(1)
181	if not s or s=='\000':
182	break
183	if flag & FHCRC:
184	self.fileobj.read(2) # Read & discard the 16-bit header CRC
185
186
187	def write(self,data):
188	if self.mode != WRITE:
189	import errno
190	raise IOError(errno.EBADF, "write() on read-only GzipFile object")
191
192	if self.fileobj is None:
193	raise ValueError, "write() on closed GzipFile object"
194	if len(data) > 0:
195	self.size = self.size + len(data)
196	self.crc = zlib.crc32(data, self.crc) & 0xffffffffL
197	self.fileobj.write( self.compress.compress(data) )
198	self.offset += len(data)
199
200	def read(self, size=-1):
201	if self.mode != READ:
202	import errno
203	raise IOError(errno.EBADF, "read() on write-only GzipFile object")
204
205	if self.extrasize <= 0 and self.fileobj is None:
206	return ''
207
208	readsize = 1024
209	if size < 0: # get the whole thing
210	try:
211	while True:
212	self._read(readsize)
213	readsize = min(self.max_read_chunk, readsize * 2)
214	except EOFError:
215	size = self.extrasize
216	else: # just get some more of it
217	try:
218	while size > self.extrasize:
219	self._read(readsize)
220	readsize = min(self.max_read_chunk, readsize * 2)
221	except EOFError:
222	if size > self.extrasize:
223	size = self.extrasize
224
225	chunk = self.extrabuf[:size]
226	self.extrabuf = self.extrabuf[size:]
227	self.extrasize = self.extrasize - size
228
229	self.offset += size
230	return chunk
231
232	def _unread(self, buf):
233	self.extrabuf = buf + self.extrabuf
234	self.extrasize = len(buf) + self.extrasize
235	self.offset -= len(buf)
236
237	def _read(self, size=1024):
238	if self.fileobj is None:
239	raise EOFError, "Reached EOF"
240
241	if self._new_member:
242	# If the _new_member flag is set, we have to
243	# jump to the next member, if there is one.
244	#
245	# First, check if we're at the end of the file;
246	# if so, it's time to stop; no more members to read.
247	pos = self.fileobj.tell() # Save current position
248	self.fileobj.seek(0, 2) # Seek to end of file
249	if pos == self.fileobj.tell():
250	raise EOFError, "Reached EOF"
251	else:
252	self.fileobj.seek( pos ) # Return to original position
253
254	self._init_read()
255	self._read_gzip_header()
256	self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
257	self._new_member = False
258
259	# Read a chunk of data from the file
260	buf = self.fileobj.read(size)
261
262	# If the EOF has been reached, flush the decompression object
263	# and mark this object as finished.
264
265	if buf == "":
266	uncompress = self.decompress.flush()
267	self._read_eof()
268	self._add_read_data( uncompress )
269	raise EOFError, 'Reached EOF'
270
271	uncompress = self.decompress.decompress(buf)
272	self._add_read_data( uncompress )
273
274	if self.decompress.unused_data != "":
275	# Ending case: we've come to the end of a member in the file,
276	# so seek back to the start of the unused data, finish up
277	# this member, and read a new gzip header.
278	# (The number of bytes to seek back is the length of the unused
279	# data, minus 8 because _read_eof() will rewind a further 8 bytes)
280	self.fileobj.seek( -len(self.decompress.unused_data)+8, 1)
281
282	# Check the CRC and file size, and set the flag so we read
283	# a new member on the next call
284	self._read_eof()
285	self._new_member = True
286
287	def _add_read_data(self, data):
288	self.crc = zlib.crc32(data, self.crc) & 0xffffffffL
289	self.extrabuf = self.extrabuf + data
290	self.extrasize = self.extrasize + len(data)
291	self.size = self.size + len(data)
292
293	def _read_eof(self):
294	# We've read to the end of the file, so we have to rewind in order
295	# to reread the 8 bytes containing the CRC and the file size.
296	# We check the that the computed CRC and size of the
297	# uncompressed data matches the stored values. Note that the size
298	# stored is the true file size mod 2**32.
299	self.fileobj.seek(-8, 1)
300	crc32 = read32(self.fileobj)
301	isize = read32(self.fileobj) # may exceed 2GB
302	if crc32 != self.crc:
303	raise IOError("CRC check failed %s != %s" % (hex(crc32),
304	hex(self.crc)))
305	elif isize != (self.size & 0xffffffffL):
306	raise IOError, "Incorrect length of data produced"
307
308	def close(self):
309	if self.fileobj is None:
310	return
311	if self.mode == WRITE:
312	self.fileobj.write(self.compress.flush())
313	write32u(self.fileobj, self.crc)
314	# self.size may exceed 2GB, or even 4GB
315	write32u(self.fileobj, self.size & 0xffffffffL)
316	self.fileobj = None
317	elif self.mode == READ:
318	self.fileobj = None
319	if self.myfileobj:
320	self.myfileobj.close()
321	self.myfileobj = None
322
323	def __del__(self):
324	try:
325	if (self.myfileobj is None and
326	self.fileobj is None):
327	return
328	except AttributeError:
329	return
330	self.close()
331
332	def flush(self,zlib_mode=zlib.Z_SYNC_FLUSH):
333	if self.mode == WRITE:
334	# Ensure the compressor's buffer is flushed
335	self.fileobj.write(self.compress.flush(zlib_mode))
336	self.fileobj.flush()
337
338	def fileno(self):
339	"""Invoke the underlying file object's fileno() method.
340
341	This will raise AttributeError if the underlying file object
342	doesn't support fileno().
343	"""
344	return self.fileobj.fileno()
345
346	def isatty(self):
347	return False
348
349	def tell(self):
350	return self.offset
351
352	def rewind(self):
353	'''Return the uncompressed stream file position indicator to the
354	beginning of the file'''
355	if self.mode != READ:
356	raise IOError("Can't rewind in write mode")
357	self.fileobj.seek(0)
358	self._new_member = True
359	self.extrabuf = ""
360	self.extrasize = 0
361	self.offset = 0
362
363	def seek(self, offset, whence=0):
364	if whence:
365	if whence == 1:
366	offset = self.offset + offset
367	else:
368	raise ValueError('Seek from end not supported')
369	if self.mode == WRITE:
370	if offset < self.offset:
371	raise IOError('Negative seek in write mode')
372	count = offset - self.offset
373	for i in range(count // 1024):
374	self.write(1024 * '\0')
375	self.write((count % 1024) * '\0')
376	elif self.mode == READ:
377	if offset < self.offset:
378	# for negative seek, rewind and do positive seek
379	self.rewind()
380	count = offset - self.offset
381	for i in range(count // 1024):
382	self.read(1024)
383	self.read(count % 1024)
384
385	def readline(self, size=-1):
386	if size < 0:
387	size = sys.maxint
388	readsize = self.min_readsize
389	else:
390	readsize = size
391	bufs = []
392	while size != 0:
393	c = self.read(readsize)
394	i = c.find('\n')
395
396	# We set i=size to break out of the loop under two
397	# conditions: 1) there's no newline, and the chunk is
398	# larger than size, or 2) there is a newline, but the
399	# resulting line would be longer than 'size'.
400	if (size <= i) or (i == -1 and len(c) > size):
401	i = size - 1
402
403	if i >= 0 or c == '':
404	bufs.append(c[:i + 1]) # Add portion of last chunk
405	self._unread(c[i + 1:]) # Push back rest of chunk
406	break
407
408	# Append chunk to list, decrease 'size',
409	bufs.append(c)
410	size = size - len(c)
411	readsize = min(size, readsize * 2)
412	if readsize > self.min_readsize:
413	self.min_readsize = min(readsize, self.min_readsize * 2, 512)
414	return ''.join(bufs) # Return resulting line
415
416	def readlines(self, sizehint=0):
417	# Negative numbers result in reading all the lines
418	if sizehint <= 0:
419	sizehint = sys.maxint
420	L = []
421	while sizehint > 0:
422	line = self.readline()
423	if line == "":
424	break
425	L.append(line)
426	sizehint = sizehint - len(line)
427
428	return L
429
430	def writelines(self, L):
431	for line in L:
432	self.write(line)
433
434	def __iter__(self):
435	return self
436
437	def next(self):
438	line = self.readline()
439	if line:
440	return line
441	else:
442	raise StopIteration
443
444
445	def _test():
446	# Act like gzip; with -d, act like gunzip.
447	# The input file is not deleted, however, nor are any other gzip
448	# options or features supported.
449	args = sys.argv[1:]
450	decompress = args and args[0] == "-d"
451	if decompress:
452	args = args[1:]
453	if not args:
454	args = ["-"]
455	for arg in args:
456	if decompress:
457	if arg == "-":
458	f = GzipFile(filename="", mode="rb", fileobj=sys.stdin)
459	g = sys.stdout
460	else:
461	if arg[-3:] != ".gz":
462	print "filename doesn't end in .gz:", repr(arg)
463	continue
464	f = open(arg, "rb")
465	g = __builtin__.open(arg[:-3], "wb")
466	else:
467	if arg == "-":
468	f = sys.stdin
469	g = GzipFile(filename="", mode="wb", fileobj=sys.stdout)
470	else:
471	f = __builtin__.open(arg, "rb")
472	g = open(arg + ".gz", "wb")
473	while True:
474	chunk = f.read(1024)
475	if not chunk:
476	break
477	g.write(chunk)
478	if g is not sys.stdout:
479	g.close()
480	if f is not sys.stdin:
481	f.close()
482
# Run the command-line driver only when this module is executed as a
# script, not when it is imported.
if __name__ == '__main__':
    _test()

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: python/trunk/Lib/gzip.py@ 389

Download in other formats: