Context Navigation

gzip.py

Visit:

Last change on this file was 3225, checked in by bird, 18 years ago
Python 2.5
File size: 16.5 KB

Line
1	"""Functions that read and write gzipped files.
2
3	The user of the file doesn't have to worry about the compression,
4	but random access is not allowed."""
5
6	# based on Andrew Kuchling's minigzip.py distributed with the zlib module
7
8	import struct, sys, time
9	import zlib
10	import __builtin__
11
# Names exported by a star-import of this module.
__all__ = ["GzipFile","open"]

# Bit masks tested against the flag byte of a gzip member header
# (see GzipFile._read_gzip_header and GzipFile._write_gzip_header).
FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16

# Values stored in GzipFile.mode to record whether the object reads or writes.
READ, WRITE = 1, 2
17
def U32(i):
    """Reinterpret a possibly negative 32-bit value as unsigned.

    A negative i is mapped to the non-negative integer that has the same
    32-bit two's-complement bit pattern; the result may be a long.
    Non-negative values are returned unchanged.
    """
    if i >= 0:
        return i
    # Negative: shift into the unsigned range by adding 2**32.
    return i + (1 << 32)
26
def LOWU32(i):
    """Mask i down to its low 32 bits and return them as a non-negative value."""
    low_32_bits = (1 << 32) - 1     # 0xFFFFFFFF
    return i & low_32_bits
30
def write32(output, value):
    """Write value to output as a signed, little-endian 32-bit integer."""
    packed = struct.pack("<l", value)
    output.write(packed)
33
def write32u(output, value):
    """Write value to output as an unsigned, little-endian 32-bit integer."""
    # struct's "L" code emits the unsigned 32-bit bit pattern of the value.
    packed = struct.pack("<L", value)
    output.write(packed)
38
def read32(input):
    """Read four bytes from input and decode them as a signed,
    little-endian 32-bit integer."""
    raw = input.read(4)
    (value,) = struct.unpack("<l", raw)
    return value
41
def open(filename, mode="rb", compresslevel=9):
    """Open a gzip file and return the resulting GzipFile object.

    This is a convenience wrapper equivalent to
    GzipFile(filename, mode, compresslevel).  Only filename is required;
    mode falls back to 'rb' and compresslevel to 9.

    """
    gzip_file = GzipFile(filename, mode, compresslevel)
    return gzip_file
50
class GzipFile:
    """The GzipFile class simulates most of the methods of a file object with
    the exception of the readinto() and truncate() methods.

    """

    # File object opened by this class itself (None when the caller supplied
    # fileobj); only this one is closed by close().
    myfileobj = None
    max_read_chunk = 10 * 1024 * 1024   # 10Mb

    def __init__(self, filename=None, mode=None,
                 compresslevel=9, fileobj=None):
        """Constructor for the GzipFile class.

        At least one of fileobj and filename must be given a
        non-trivial value.

        The new class instance is based on fileobj, which can be a regular
        file, a StringIO object, or any other object which simulates a file.
        It defaults to None, in which case filename is opened to provide
        a file object.

        When fileobj is not None, the filename argument is only used to be
        included in the gzip file header, which may include the original
        filename of the uncompressed file.  It defaults to the filename of
        fileobj, if discernible; otherwise, it defaults to the empty string,
        and in this case the original filename is not included in the header.

        The mode argument can be any of 'r', 'rb', 'a', 'ab', 'w', or 'wb',
        depending on whether the file will be read or written.  The default
        is the mode of fileobj if discernible; otherwise, the default is 'rb'.
        Be aware that only the 'rb', 'ab', and 'wb' values should be used
        for cross-platform portability.

        The compresslevel argument is an integer from 1 to 9 controlling the
        level of compression; 1 is fastest and produces the least compression,
        and 9 is slowest and produces the most compression.  The default is 9.

        Raises IOError if the mode does not start with 'r', 'w' or 'a'.

        """

        # guarantee the file is opened in binary mode on platforms
        # that care about that sort of thing
        if mode and 'b' not in mode:
            mode += 'b'
        if fileobj is None:
            fileobj = self.myfileobj = __builtin__.open(filename, mode or 'rb')
        if filename is None:
            if hasattr(fileobj, 'name'): filename = fileobj.name
            else: filename = ''
        if mode is None:
            if hasattr(fileobj, 'mode'): mode = fileobj.mode
            else: mode = 'rb'

        if mode[0:1] == 'r':
            self.mode = READ
            # Set flag indicating start of a new member
            self._new_member = True
            # Decompressed data not yet handed to the caller, and its length
            self.extrabuf = ""
            self.extrasize = 0
            self.filename = filename
            # Starts small, scales exponentially
            self.min_readsize = 100

        elif mode[0:1] == 'w' or mode[0:1] == 'a':
            self.mode = WRITE
            self._init_write(filename)
            # Negative wbits: raw deflate stream, the gzip framing is
            # written by this class itself.
            self.compress = zlib.compressobj(compresslevel,
                                             zlib.DEFLATED,
                                             -zlib.MAX_WBITS,
                                             zlib.DEF_MEM_LEVEL,
                                             0)
        else:
            raise IOError("Mode " + mode + " not supported")

        self.fileobj = fileobj
        # Position in the uncompressed stream, as reported by tell()
        self.offset = 0

        if self.mode == WRITE:
            self._write_gzip_header()

    def __repr__(self):
        """Return '<gzip ... 0xADDR>' built from the repr of the wrapped file."""
        s = repr(self.fileobj)
        return '<gzip ' + s[1:-1] + ' ' + hex(id(self)) + '>'

    def _init_write(self, filename):
        """Reset the write-side state (name, CRC, size, buffers)."""
        # The stored name always ends in '.gz'; _write_gzip_header strips
        # those three characters again to recover the original name.
        if filename[-3:] != '.gz':
            filename = filename + '.gz'
        self.filename = filename
        self.crc = zlib.crc32("")
        self.size = 0
        self.writebuf = []
        self.bufsize = 0

    def _write_gzip_header(self):
        """Write the gzip member header to the underlying file object."""
        self.fileobj.write('\037\213')             # magic header
        self.fileobj.write('\010')                 # compression method
        fname = self.filename[:-3]
        flags = 0
        if fname:
            flags = FNAME
        self.fileobj.write(chr(flags))
        write32u(self.fileobj, long(time.time()))
        self.fileobj.write('\002')
        self.fileobj.write('\377')
        if fname:
            self.fileobj.write(fname + '\000')

    def _init_read(self):
        """Reset the running CRC and size before reading a new member."""
        self.crc = zlib.crc32("")
        self.size = 0

    def _read_gzip_header(self):
        """Read and validate a gzip member header, discarding optional fields.

        Raises IOError if the magic number or compression method is wrong.
        """
        magic = self.fileobj.read(2)
        if magic != '\037\213':
            raise IOError('Not a gzipped file')
        method = ord( self.fileobj.read(1) )
        if method != 8:
            raise IOError('Unknown compression method')
        flag = ord( self.fileobj.read(1) )
        # modtime = self.fileobj.read(4)
        # extraflag = self.fileobj.read(1)
        # os = self.fileobj.read(1)
        self.fileobj.read(6)

        if flag & FEXTRA:
            # Read & discard the extra field, if present
            xlen = ord(self.fileobj.read(1))
            xlen = xlen + 256*ord(self.fileobj.read(1))
            self.fileobj.read(xlen)
        if flag & FNAME:
            # Read and discard a null-terminated string containing the filename
            while True:
                s = self.fileobj.read(1)
                if not s or s=='\000':
                    break
        if flag & FCOMMENT:
            # Read and discard a null-terminated string containing a comment
            while True:
                s = self.fileobj.read(1)
                if not s or s=='\000':
                    break
        if flag & FHCRC:
            self.fileobj.read(2)     # Read & discard the 16-bit header CRC

    def write(self,data):
        """Compress data and write it to the underlying file object.

        Raises IOError(EBADF) on a read-only object and ValueError on a
        closed one.
        """
        if self.mode != WRITE:
            import errno
            raise IOError(errno.EBADF, "write() on read-only GzipFile object")

        if self.fileobj is None:
            raise ValueError("write() on closed GzipFile object")
        if len(data) > 0:
            self.size = self.size + len(data)
            self.crc = zlib.crc32(data, self.crc)
            self.fileobj.write( self.compress.compress(data) )
            self.offset += len(data)

    def read(self, size=-1):
        """Return up to size bytes of uncompressed data (all of it if size < 0).

        Raises IOError(EBADF) on a write-only object.
        """
        if self.mode != READ:
            import errno
            raise IOError(errno.EBADF, "read() on write-only GzipFile object")

        if self.extrasize <= 0 and self.fileobj is None:
            return ''

        readsize = 1024
        if size < 0:        # get the whole thing
            try:
                while True:
                    self._read(readsize)
                    readsize = min(self.max_read_chunk, readsize * 2)
            except EOFError:
                size = self.extrasize
        else:               # just get some more of it
            try:
                while size > self.extrasize:
                    self._read(readsize)
                    readsize = min(self.max_read_chunk, readsize * 2)
            except EOFError:
                if size > self.extrasize:
                    size = self.extrasize

        chunk = self.extrabuf[:size]
        self.extrabuf = self.extrabuf[size:]
        self.extrasize = self.extrasize - size

        self.offset += size
        return chunk

    def _unread(self, buf):
        """Push buf back in front of the pending decompressed data."""
        self.extrabuf = buf + self.extrabuf
        self.extrasize = len(buf) + self.extrasize
        self.offset -= len(buf)

    def _read(self, size=1024):
        """Read and decompress one chunk of size compressed bytes.

        The decompressed data is appended to self.extrabuf.  Raises EOFError
        once the underlying file is exhausted.
        """
        if self.fileobj is None:
            raise EOFError("Reached EOF")

        if self._new_member:
            # If the _new_member flag is set, we have to
            # jump to the next member, if there is one.
            #
            # First, check if we're at the end of the file;
            # if so, it's time to stop; no more members to read.
            pos = self.fileobj.tell()   # Save current position
            self.fileobj.seek(0, 2)     # Seek to end of file
            if pos == self.fileobj.tell():
                raise EOFError("Reached EOF")
            else:
                self.fileobj.seek( pos ) # Return to original position

            self._init_read()
            self._read_gzip_header()
            self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
            self._new_member = False

        # Read a chunk of data from the file
        buf = self.fileobj.read(size)

        # If the EOF has been reached, flush the decompression object
        # and mark this object as finished.

        if buf == "":
            uncompress = self.decompress.flush()
            self._read_eof()
            self._add_read_data( uncompress )
            raise EOFError('Reached EOF')

        uncompress = self.decompress.decompress(buf)
        self._add_read_data( uncompress )

        if self.decompress.unused_data != "":
            # Ending case: we've come to the end of a member in the file,
            # so seek back to the start of the unused data, finish up
            # this member, and read a new gzip header.
            # (The number of bytes to seek back is the length of the unused
            # data, minus 8 because _read_eof() will rewind a further 8 bytes)
            self.fileobj.seek( -len(self.decompress.unused_data)+8, 1)

            # Check the CRC and file size, and set the flag so we read
            # a new member on the next call
            self._read_eof()
            self._new_member = True

    def _add_read_data(self, data):
        """Account for freshly decompressed data (CRC, buffer, sizes)."""
        self.crc = zlib.crc32(data, self.crc)
        self.extrabuf = self.extrabuf + data
        self.extrasize = self.extrasize + len(data)
        self.size = self.size + len(data)

    def _read_eof(self):
        """Verify the CRC and size stored in the member trailer.

        Raises IOError if either does not match the data decompressed so far.
        """
        # We've read to the end of the file, so we have to rewind in order
        # to reread the 8 bytes containing the CRC and the file size.
        # We check that the computed CRC and size of the
        # uncompressed data matches the stored values.  Note that the size
        # stored is the true file size mod 2**32.
        self.fileobj.seek(-8, 1)
        crc32 = read32(self.fileobj)
        isize = U32(read32(self.fileobj))   # may exceed 2GB
        if U32(crc32) != U32(self.crc):
            raise IOError("CRC check failed")
        elif isize != LOWU32(self.size):
            raise IOError("Incorrect length of data produced")

    def close(self):
        """Finish the stream and release the underlying file object.

        In write mode the remaining compressed data and the CRC/size trailer
        are written first.  A file opened by this object is closed; a
        caller-supplied fileobj is left open.  Calling close() again on an
        already closed object is a no-op.
        """
        # Detach the file object first so a repeated close() cannot try to
        # write to None, and so a failure below still leaves us closed.
        fileobj = self.fileobj
        self.fileobj = None
        try:
            if fileobj is not None and self.mode == WRITE:
                fileobj.write(self.compress.flush())
                # The native zlib crc is an unsigned 32-bit integer, but
                # the Python wrapper implicitly casts that to a signed C
                # long.  So, on a 32-bit box self.crc may "look negative",
                # while the same crc on a 64-bit box may "look positive".
                # To avoid irksome warnings from the `struct` module, force
                # it to look positive on all boxes.
                write32u(fileobj, LOWU32(self.crc))
                # self.size may exceed 2GB, or even 4GB
                write32u(fileobj, LOWU32(self.size))
        finally:
            # Always release the file we opened ourselves, even if writing
            # the trailer failed.
            myfileobj = self.myfileobj
            self.myfileobj = None
            if myfileobj:
                myfileobj.close()

    def __del__(self):
        """Close the object on garbage collection if it is still open."""
        try:
            if (self.myfileobj is None and
                self.fileobj is None):
                return
        except AttributeError:
            # __init__ did not get far enough to set the attributes.
            return
        self.close()

    def flush(self,zlib_mode=zlib.Z_SYNC_FLUSH):
        """Flush pending compressed data (write mode) and the underlying file."""
        if self.mode == WRITE:
            # Ensure the compressor's buffer is flushed
            self.fileobj.write(self.compress.flush(zlib_mode))
        self.fileobj.flush()

    def fileno(self):
        """Invoke the underlying file object's fileno() method.

        This will raise AttributeError if the underlying file object
        doesn't support fileno().
        """
        return self.fileobj.fileno()

    def isatty(self):
        """Always return False."""
        return False

    def tell(self):
        """Return the current position in the uncompressed stream."""
        return self.offset

    def rewind(self):
        '''Return the uncompressed stream file position indicator to the
        beginning of the file'''
        if self.mode != READ:
            raise IOError("Can't rewind in write mode")
        self.fileobj.seek(0)
        self._new_member = True
        self.extrabuf = ""
        self.extrasize = 0
        self.offset = 0

    def seek(self, offset, whence=0):
        """Move to a position in the uncompressed stream.

        whence follows the file-object convention: 0 (the default) makes
        offset absolute and 1 makes it relative to the current position.
        Seeking relative to the end is not supported and raises ValueError.

        In write mode only forward seeks are possible (the gap is filled
        with zero bytes); a backward seek raises IOError.  In read mode a
        backward seek rewinds and re-reads from the start.
        """
        if whence:
            if whence == 1:
                offset = self.offset + offset
            else:
                raise ValueError('Seek from end not supported')
        if self.mode == WRITE:
            if offset < self.offset:
                raise IOError('Negative seek in write mode')
            count = offset - self.offset
            for i in range(count // 1024):
                self.write(1024 * '\0')
            self.write((count % 1024) * '\0')
        elif self.mode == READ:
            if offset < self.offset:
                # for negative seek, rewind and do positive seek
                self.rewind()
            count = offset - self.offset
            for i in range(count // 1024):
                self.read(1024)
            self.read(count % 1024)

    def readline(self, size=-1):
        """Return one line, including its newline, of at most size bytes
        (no limit if size < 0); return '' at end of file."""
        if size < 0:
            size = sys.maxint
            readsize = self.min_readsize
        else:
            readsize = size
        bufs = []
        while size != 0:
            c = self.read(readsize)
            i = c.find('\n')

            # We set i=size to break out of the loop under two
            # conditions: 1) there's no newline, and the chunk is
            # larger than size, or 2) there is a newline, but the
            # resulting line would be longer than 'size'.
            if (size <= i) or (i == -1 and len(c) > size):
                i = size - 1

            if i >= 0 or c == '':
                bufs.append(c[:i + 1])    # Add portion of last chunk
                self._unread(c[i + 1:])   # Push back rest of chunk
                break

            # Append chunk to list, decrease 'size',
            bufs.append(c)
            size = size - len(c)
            readsize = min(size, readsize * 2)
        if readsize > self.min_readsize:
            self.min_readsize = min(readsize, self.min_readsize * 2, 512)
        return ''.join(bufs) # Return resulting line

    def readlines(self, sizehint=0):
        """Return a list of lines, stopping once about sizehint bytes have
        been read (all lines if sizehint <= 0)."""
        # Negative numbers result in reading all the lines
        if sizehint <= 0:
            sizehint = sys.maxint
        L = []
        while sizehint > 0:
            line = self.readline()
            if line == "":
                break
            L.append(line)
            sizehint = sizehint - len(line)

        return L

    def writelines(self, L):
        """Write every item of the iterable L with write()."""
        for line in L:
            self.write(line)

    def __iter__(self):
        """Return self; iteration yields lines via next()."""
        return self

    def next(self):
        """Return the next line, raising StopIteration at end of file."""
        line = self.readline()
        if line:
            return line
        else:
            raise StopIteration
449
450
def _test():
    """Minimal command-line front end: compress, or with -d decompress.

    Act like gzip; with -d, act like gunzip.  Each argument names a file;
    "-" (the default when no file is given) means stdin to stdout.
    The input file is not deleted, however, nor are any other gzip
    options or features supported.
    """
    args = sys.argv[1:]
    decompress = args and args[0] == "-d"
    if decompress:
        args = args[1:]
    if not args:
        args = ["-"]
    for arg in args:
        if decompress:
            if arg == "-":
                f = GzipFile(filename="", mode="rb", fileobj=sys.stdin)
                g = sys.stdout
            else:
                if arg[-3:] != ".gz":
                    sys.stdout.write("filename doesn't end in .gz: "
                                     + repr(arg) + "\n")
                    continue
                f = open(arg, "rb")
                g = __builtin__.open(arg[:-3], "wb")
        else:
            if arg == "-":
                f = sys.stdin
                g = GzipFile(filename="", mode="wb", fileobj=sys.stdout)
            else:
                f = __builtin__.open(arg, "rb")
                g = open(arg + ".gz", "wb")
        # Close both ends even if the copy fails part-way, so no file
        # handle is leaked; the standard streams are left open.
        try:
            while True:
                chunk = f.read(1024)
                if not chunk:
                    break
                g.write(chunk)
        finally:
            try:
                if g is not sys.stdout:
                    g.close()
            finally:
                if f is not sys.stdin:
                    f.close()
488
# When executed as a script, run the gzip/gunzip-like command-line driver.
if __name__ == '__main__':
    _test()

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: trunk/essentials/dev-lang/python/Lib/gzip.py

Download in other formats: