1 |
|
---|
2 | """
|
---|
3 | csv.py - read/write/investigate CSV files
|
---|
4 | """
|
---|
5 |
|
---|
6 | import re
|
---|
7 | from functools import reduce
|
---|
8 | from _csv import Error, __version__, writer, reader, register_dialect, \
|
---|
9 | unregister_dialect, get_dialect, list_dialects, \
|
---|
10 | field_size_limit, \
|
---|
11 | QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
|
---|
12 | __doc__
|
---|
13 | from _csv import Dialect as _Dialect
|
---|
14 |
|
---|
15 | try:
|
---|
16 | from cStringIO import StringIO
|
---|
17 | except ImportError:
|
---|
18 | from StringIO import StringIO
|
---|
19 |
|
---|
# Public names re-exported by this module: the C-level primitives from
# _csv plus the pure-Python helper classes defined below.
__all__ = [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
            "Error", "Dialect", "__doc__", "excel", "excel_tab",
            "field_size_limit", "reader", "writer",
            "register_dialect", "get_dialect", "list_dialects", "Sniffer",
            "unregister_dialect", "__version__", "DictReader", "DictWriter" ]
|
---|
25 |
|
---|
class Dialect:
    """Describe an Excel dialect.

    This must be subclassed (see csv.excel).  Valid attributes are:
    delimiter, quotechar, escapechar, doublequote, skipinitialspace,
    lineterminator, quoting.

    """
    _name = ""          # registry name; "" for the abstract base class
    _valid = False      # set True once a subclass instance passes validation
    # placeholders -- concrete subclasses are expected to override these
    delimiter = None
    quotechar = None
    escapechar = None
    doublequote = None
    skipinitialspace = None
    lineterminator = None
    quoting = None

    def __init__(self):
        # Only concrete subclasses are validated; instantiating the base
        # class directly leaves _valid False.
        if self.__class__ != Dialect:
            self._valid = True
            self._validate()

    def _validate(self):
        """Raise csv.Error if the attributes do not form a usable dialect.

        The actual checking is delegated to the C implementation.
        """
        try:
            _Dialect(self)
        except TypeError as e:
            # "except ... as" is valid on Python 2.6+ and Python 3, unlike
            # the old comma form.  We re-raise as csv.Error for
            # compatibility with py2.3.
            raise Error(str(e))
|
---|
56 |
|
---|
class excel(Dialect):
    """Describe the usual properties of Excel-generated CSV files."""
    delimiter = ','           # fields are comma-separated
    quotechar = '"'           # fields are quoted with double quotes
    doublequote = True        # embedded quotechars are doubled, not escaped
    skipinitialspace = False  # whitespace after a delimiter is significant
    lineterminator = '\r\n'   # Excel writes DOS line endings
    quoting = QUOTE_MINIMAL   # quote only fields that require it
register_dialect("excel", excel)
|
---|
66 |
|
---|
class excel_tab(excel):
    """Describe the usual properties of Excel-generated TAB-delimited files."""
    # Identical to the "excel" dialect except for the field separator.
    delimiter = '\t'
register_dialect("excel-tab", excel_tab)
|
---|
71 |
|
---|
72 |
|
---|
class DictReader:
    """Iterate over CSV rows as dictionaries keyed by field name.

    Wraps a csv.reader and maps each parsed row onto the supplied (or
    lazily-read) fieldnames.  Surplus values in a row are collected in a
    list under ``restkey``; missing values are filled with ``restval``.
    """

    def __init__(self, f, fieldnames=None, restkey=None, restval=None,
                 dialect="excel", *args, **kwds):
        self._fieldnames = fieldnames   # list of keys for the dict
        self.restkey = restkey          # key to catch long rows
        self.restval = restval          # default value for short rows
        self.reader = reader(f, dialect, *args, **kwds)
        self.dialect = dialect
        self.line_num = 0

    def __iter__(self):
        return self

    @property
    def fieldnames(self):
        # Read the header row lazily the first time the names are needed;
        # a file that is already exhausted simply leaves them as None.
        if self._fieldnames is None:
            try:
                self._fieldnames = self.reader.next()
            except StopIteration:
                pass
        self.line_num = self.reader.line_num
        return self._fieldnames

    @fieldnames.setter
    def fieldnames(self, value):
        self._fieldnames = value

    def next(self):
        if self.line_num == 0:
            # Accessed purely for its side effect of consuming the header.
            self.fieldnames
        row = self.reader.next()
        self.line_num = self.reader.line_num

        # Unlike the basic reader, skip blank rows entirely: returning
        # them would just produce a dict full of None values.
        while not row:
            row = self.reader.next()

        result = dict(zip(self.fieldnames, row))
        num_fields = len(self.fieldnames)
        num_values = len(row)
        if num_fields < num_values:
            # Row was too long: stash the overflow under restkey.
            result[self.restkey] = row[num_fields:]
        elif num_values < num_fields:
            # Row was too short: pad the remaining keys with restval.
            for key in self.fieldnames[num_values:]:
                result[key] = self.restval
        return result
|
---|
121 |
|
---|
122 |
|
---|
class DictWriter:
    """Write dictionaries to a CSV file, one row per dict.

    Values are emitted in the order given by ``fieldnames``.  Keys missing
    from a row dict are written as ``restval``; keys that are not listed in
    ``fieldnames`` either raise ValueError or are silently dropped,
    depending on ``extrasaction`` ("raise" or "ignore").
    """

    def __init__(self, f, fieldnames, restval="", extrasaction="raise",
                 dialect="excel", *args, **kwds):
        self.fieldnames = fieldnames    # list of keys for the dict
        self.restval = restval          # for writing short dicts
        if extrasaction.lower() not in ("raise", "ignore"):
            # raise-with-instance form: valid on both Python 2.6+ and 3,
            # unlike the old "raise ValueError, ..." statement syntax.
            raise ValueError(
                "extrasaction (%s) must be 'raise' or 'ignore'" %
                extrasaction)
        self.extrasaction = extrasaction
        self.writer = writer(f, dialect, *args, **kwds)

    def _dict_to_list(self, rowdict):
        """Return rowdict's values as a list ordered by self.fieldnames.

        Raises ValueError for unexpected keys when extrasaction == "raise".
        """
        if self.extrasaction == "raise":
            wrong_fields = [k for k in rowdict if k not in self.fieldnames]
            if wrong_fields:
                raise ValueError("dict contains fields not in fieldnames: " +
                                 ", ".join(wrong_fields))
        return [rowdict.get(key, self.restval) for key in self.fieldnames]

    def writerow(self, rowdict):
        """Write a single row dict via the underlying writer."""
        return self.writer.writerow(self._dict_to_list(rowdict))

    def writerows(self, rowdicts):
        """Write a sequence of row dicts via the underlying writer."""
        return self.writer.writerows([self._dict_to_list(rowdict)
                                      for rowdict in rowdicts])
|
---|
151 |
|
---|
# Guard Sniffer's type checking against builds that exclude complex()
try:
    complex
except NameError:
    # Fall back to float so has_header()'s type-probe list still works.
    complex = float
|
---|
157 |
|
---|
class Sniffer:
    '''
    "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
    Returns a Dialect object.
    '''
    def __init__(self):
        # in case there is more than one possible delimiter
        self.preferred = [',', '\t', ';', ' ', ':']


    def sniff(self, sample, delimiters=None):
        """
        Returns a dialect (or None) corresponding to the sample
        """

        quotechar, delimiter, skipinitialspace = \
                   self._guess_quote_and_delimiter(sample, delimiters)
        if not delimiter:
            delimiter, skipinitialspace = self._guess_delimiter(sample,
                                                                delimiters)

        if not delimiter:
            # raise-with-instance form works on Python 2.6+ and Python 3,
            # unlike the old "raise Error, msg" statement syntax.
            raise Error("Could not determine delimiter")

        class dialect(Dialect):
            _name = "sniffed"
            lineterminator = '\r\n'
            quoting = QUOTE_MINIMAL
            # escapechar = ''
            doublequote = False

        dialect.delimiter = delimiter
        # _csv.reader won't accept a quotechar of ''
        dialect.quotechar = quotechar or '"'
        dialect.skipinitialspace = skipinitialspace

        return dialect


    def _guess_quote_and_delimiter(self, data, delimiters):
        """
        Looks for text enclosed between two identical quotes
        (the probable quotechar) which are preceded and followed
        by the same character (the probable delimiter).
        For example:
                         ,'some text',
        The quote with the most wins, same with the delimiter.
        If there is no quotechar the delimiter can't be determined
        this way.

        Returns (quotechar, delimiter, skipinitialspace).
        """

        matches = []
        # Raw strings so regex escapes like \w and \n are not subject to
        # string-escape interpretation.  NOTE: the third pattern used to
        # contain a stray '>' inside the delim group -- "(?P<delim>>" --
        # which made it match '>' + one character instead of a lone
        # delimiter, so the ,".*?" (quoted-last-field) form never matched.
        for restr in (r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
                      r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)',   #  ".*?",
                      r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)',   # ,".*?"
                      r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'):                            #  ".*?" (no delim, no space)
            regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
            matches = regexp.findall(data)
            if matches:
                break

        if not matches:
            return ('', None, 0) # (quotechar, delimiter, skipinitialspace)

        quotes = {}
        delims = {}
        spaces = 0
        for m in matches:
            n = regexp.groupindex['quote'] - 1
            key = m[n]
            if key:
                quotes[key] = quotes.get(key, 0) + 1
            try:
                n = regexp.groupindex['delim'] - 1
                key = m[n]
            except KeyError:
                continue
            if key and (delimiters is None or key in delimiters):
                delims[key] = delims.get(key, 0) + 1
            try:
                n = regexp.groupindex['space'] - 1
            except KeyError:
                continue
            if m[n]:
                spaces += 1

        # The most frequently seen quote character wins.  The conditional
        # expression replaces the fragile "and a or b" idiom (which would
        # misbehave if 'a' were ever falsy).
        quotechar = reduce(lambda a, b, quotes=quotes:
                           a if quotes[a] > quotes[b] else b, quotes.keys())

        if delims:
            delim = reduce(lambda a, b, delims=delims:
                           a if delims[a] > delims[b] else b, delims.keys())
            # Initial space is implied if every delimiter was followed by one.
            skipinitialspace = delims[delim] == spaces
            if delim == '\n': # most likely a file with a single column
                delim = ''
        else:
            # there is *no* delimiter, it's a single column of quoted data
            delim = ''
            skipinitialspace = 0

        return (quotechar, delim, skipinitialspace)


    def _guess_delimiter(self, data, delimiters):
        """
        The delimiter /should/ occur the same number of times on
        each row. However, due to malformed data, it may not. We don't want
        an all or nothing approach, so we allow for small variations in this
        number.
          1) build a table of the frequency of each character on every line.
          2) build a table of freqencies of this frequency (meta-frequency?),
             e.g.  'x occurred 5 times in 10 rows, 6 times in 1000 rows,
             7 times in 2 rows'
          3) use the mode of the meta-frequency to determine the /expected/
             frequency for that character
          4) find out how often the character actually meets that goal
          5) the character that best meets its goal is the delimiter
        For performance reasons, the data is evaluated in chunks, so it can
        try and evaluate the smallest portion of the data possible, evaluating
        additional chunks as necessary.

        Returns (delimiter, skipinitialspace).
        """

        # List comprehension rather than filter() so the result is a
        # sliceable list regardless of Python version.
        data = [line for line in data.split('\n') if line]

        ascii_chars = [chr(c) for c in range(127)] # 7-bit ASCII

        # build frequency tables
        chunkLength = min(10, len(data))
        iteration = 0
        charFrequency = {}
        modes = {}
        delims = {}
        start, end = 0, min(chunkLength, len(data))
        while start < len(data):
            iteration += 1
            for line in data[start:end]:
                for char in ascii_chars:
                    metaFrequency = charFrequency.get(char, {})
                    # must count even if frequency is 0
                    freq = line.count(char)
                    # value is the mode
                    metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
                    charFrequency[char] = metaFrequency

            for char in charFrequency.keys():
                # list() so we can remove() and index below (dict views are
                # not sequences on Python 3; on Python 2 this is a copy).
                items = list(charFrequency[char].items())
                if len(items) == 1 and items[0][0] == 0:
                    continue
                # get the mode of the frequencies
                if len(items) > 1:
                    modes[char] = reduce(lambda a, b: a if a[1] > b[1] else b,
                                         items)
                    # adjust the mode - subtract the sum of all
                    # other frequencies
                    items.remove(modes[char])
                    modes[char] = (modes[char][0], modes[char][1]
                                   - reduce(lambda a, b: (0, a[1] + b[1]),
                                            items)[1])
                else:
                    modes[char] = items[0]

            # build a list of possible delimiters
            modeList = list(modes.items())
            total = float(chunkLength * iteration)
            # (rows of consistent data) / (number of rows) = 100%
            consistency = 1.0
            # minimum consistency threshold
            threshold = 0.9
            while len(delims) == 0 and consistency >= threshold:
                for k, v in modeList:
                    if v[0] > 0 and v[1] > 0:
                        if ((v[1]/total) >= consistency and
                            (delimiters is None or k in delimiters)):
                            delims[k] = v
                consistency -= 0.01

            if len(delims) == 1:
                delim = list(delims.keys())[0]
                skipinitialspace = (data[0].count(delim) ==
                                    data[0].count("%c " % delim))
                return (delim, skipinitialspace)

            # analyze another chunkLength lines
            start = end
            end += chunkLength

        if not delims:
            return ('', 0)

        # if there's more than one, fall back to a 'preferred' list
        if len(delims) > 1:
            for d in self.preferred:
                if d in delims.keys():
                    skipinitialspace = (data[0].count(d) ==
                                        data[0].count("%c " % d))
                    return (d, skipinitialspace)

        # nothing else indicates a preference, pick the character that
        # dominates(?)
        items = [(v, k) for (k, v) in delims.items()]
        items.sort()
        delim = items[-1][1]

        skipinitialspace = (data[0].count(delim) ==
                            data[0].count("%c " % delim))
        return (delim, skipinitialspace)


    def has_header(self, sample):
        """Return True if the first row of *sample* looks like a header.

        Creates a dictionary of types of data in each column. If any
        column is of a single type (say, integers), *except* for the first
        row, then the first row is presumed to be labels. If the type
        can't be determined, it is assumed to be a string in which case
        the length of the string is the determining factor: if all of the
        rows except for the first are the same length, it's a header.
        Finally, a 'vote' is taken at the end for each column, adding or
        subtracting from the likelihood of the first row being a header.
        """

        rdr = reader(StringIO(sample), self.sniff(sample))

        header = next(rdr) # assume first row is header

        columns = len(header)
        columnTypes = {}
        for i in range(columns): columnTypes[i] = None

        checked = 0
        for row in rdr:
            # arbitrary number of rows to check, to keep it sane
            if checked > 20:
                break
            checked += 1

            if len(row) != columns:
                continue # skip rows that have irregular number of columns

            # list() so columns may be deleted while iterating
            for col in list(columnTypes.keys()):

                # long is not tried separately: on Python 2, int() already
                # promotes to long for large values, and the result was
                # normalized back to int anyway.
                for thisType in [int, float, complex]:
                    try:
                        thisType(row[col])
                        break
                    except (ValueError, OverflowError):
                        pass
                else:
                    # fallback to length of string
                    thisType = len(row[col])

                if thisType != columnTypes[col]:
                    if columnTypes[col] is None: # add new column type
                        columnTypes[col] = thisType
                    else:
                        # type is inconsistent, remove column from
                        # consideration
                        del columnTypes[col]

        # finally, compare results against first row and "vote"
        # on whether it's a header
        hasHeader = 0
        for col, colType in columnTypes.items():
            if type(colType) == type(0): # it's a length
                if len(header[col]) != colType:
                    hasHeader += 1
                else:
                    hasHeader -= 1
            else: # attempt typecast
                try:
                    colType(header[col])
                except (ValueError, TypeError):
                    hasHeader += 1
                else:
                    hasHeader -= 1

        return hasHeader > 0
|
---|