source: python/trunk/Lib/csv.py@ 611

Last change on this file since 611 was 391, checked in by dmik, 11 years ago

python: Merge vendor 2.7.6 to trunk.

  • Property svn:eol-style set to native
File size: 16.0 KB
Line 
1
2"""
3csv.py - read/write/investigate CSV files
4"""
5
6import re
7from functools import reduce
8from _csv import Error, __version__, writer, reader, register_dialect, \
9 unregister_dialect, get_dialect, list_dialects, \
10 field_size_limit, \
11 QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
12 __doc__
13from _csv import Dialect as _Dialect
14
15try:
16 from cStringIO import StringIO
17except ImportError:
18 from StringIO import StringIO
19
20__all__ = [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
21 "Error", "Dialect", "__doc__", "excel", "excel_tab",
22 "field_size_limit", "reader", "writer",
23 "register_dialect", "get_dialect", "list_dialects", "Sniffer",
24 "unregister_dialect", "__version__", "DictReader", "DictWriter" ]
25
class Dialect:
    """Describe an Excel dialect.

    This must be subclassed (see csv.excel). Valid attributes are:
    delimiter, quotechar, escapechar, doublequote, skipinitialspace,
    lineterminator, quoting.

    """
    _name = ""
    _valid = False
    # placeholders
    delimiter = None
    quotechar = None
    escapechar = None
    doublequote = None
    skipinitialspace = None
    lineterminator = None
    quoting = None

    def __init__(self):
        # Only subclasses (which are expected to fill in the attributes
        # above) are considered valid; instantiating Dialect directly
        # leaves _valid False and fails validation below.
        if self.__class__ != Dialect:
            self._valid = True
        self._validate()

    def _validate(self):
        # Delegate attribute checking to the C implementation.
        try:
            _Dialect(self)
        except TypeError as e:
            # We do this for compatibility with py2.3
            raise Error(str(e))
56
class excel(Dialect):
    """Dialect matching the CSV conventions of Microsoft Excel."""
    lineterminator = '\r\n'
    quoting = QUOTE_MINIMAL
    delimiter = ','
    quotechar = '"'
    doublequote = True
    skipinitialspace = False
register_dialect("excel", excel)
66
class excel_tab(excel):
    """Excel dialect variant for TAB-delimited (TSV) export files."""
    delimiter = '\t'
register_dialect("excel-tab", excel_tab)
71
72
class DictReader:
    """Read rows from *f* as dictionaries keyed by *fieldnames*."""

    def __init__(self, f, fieldnames=None, restkey=None, restval=None,
                 dialect="excel", *args, **kwds):
        self.restkey = restkey          # key under which overflow cells land
        self.restval = restval          # filler value for missing cells
        self._fieldnames = fieldnames   # list of keys for the dict
        self.reader = reader(f, dialect, *args, **kwds)
        self.dialect = dialect
        self.line_num = 0

    def __iter__(self):
        return self

    @property
    def fieldnames(self):
        # Lazily consume the first row as the header the first time the
        # field names are requested (unless they were supplied up front).
        if self._fieldnames is None:
            try:
                self._fieldnames = self.reader.next()
            except StopIteration:
                pass
        self.line_num = self.reader.line_num
        return self._fieldnames

    @fieldnames.setter
    def fieldnames(self, value):
        self._fieldnames = value

    def next(self):
        if self.line_num == 0:
            # Accessed purely for its lazy-header side effect.
            self.fieldnames
        row = self.reader.next()
        self.line_num = self.reader.line_num

        # Unlike the basic reader we skip blank rows entirely, since they
        # would otherwise yield a dict full of None values.
        while row == []:
            row = self.reader.next()
        result = dict(zip(self.fieldnames, row))
        n_fields = len(self.fieldnames)
        n_cells = len(row)
        if n_cells > n_fields:
            # Long row: stash the overflow under restkey.
            result[self.restkey] = row[n_fields:]
        elif n_cells < n_fields:
            # Short row: pad the missing keys with restval.
            for missing in self.fieldnames[n_cells:]:
                result[missing] = self.restval
        return result
121
122
class DictWriter:
    """Write dictionaries to *f* as CSV rows ordered by *fieldnames*."""

    def __init__(self, f, fieldnames, restval="", extrasaction="raise",
                 dialect="excel", *args, **kwds):
        self.fieldnames = fieldnames    # determines column order
        self.restval = restval          # substituted for missing keys
        if extrasaction.lower() not in ("raise", "ignore"):
            raise ValueError(
                "extrasaction (%s) must be 'raise' or 'ignore'" %
                extrasaction)
        self.extrasaction = extrasaction
        self.writer = writer(f, dialect, *args, **kwds)

    def writeheader(self):
        # The header row is simply a row whose values are the field names.
        self.writerow(dict(zip(self.fieldnames, self.fieldnames)))

    def _dict_to_list(self, rowdict):
        # Flatten rowdict into a list ordered by fieldnames; optionally
        # reject keys that are not declared field names.
        if self.extrasaction == "raise":
            unknown = [key for key in rowdict if key not in self.fieldnames]
            if unknown:
                raise ValueError("dict contains fields not in fieldnames: " +
                                 ", ".join(unknown))
        return [rowdict.get(field, self.restval) for field in self.fieldnames]

    def writerow(self, rowdict):
        return self.writer.writerow(self._dict_to_list(rowdict))

    def writerows(self, rowdicts):
        return self.writer.writerows(
            [self._dict_to_list(rowdict) for rowdict in rowdicts])
155
# Guard Sniffer's type checking against builds that exclude complex()
try:
    complex
except NameError:
    # Some minimal builds omit complex(); fall back to float so the
    # numeric probing in Sniffer.has_header (which casts cells through
    # [int, long, float, complex]) keeps working.
    complex = float
161
class Sniffer:
    '''
    "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
    Returns a Dialect object.
    '''
    def __init__(self):
        # in case there is more than one possible delimiter
        self.preferred = [',', '\t', ';', ' ', ':']


    def sniff(self, sample, delimiters=None):
        """
        Returns a dialect (or None) corresponding to the sample
        """

        # First try the quote-based heuristic; fall back to the
        # frequency-based one when no quoted fields are found.
        quotechar, doublequote, delimiter, skipinitialspace = \
                   self._guess_quote_and_delimiter(sample, delimiters)
        if not delimiter:
            delimiter, skipinitialspace = self._guess_delimiter(sample,
                                                                delimiters)

        if not delimiter:
            raise Error("Could not determine delimiter")

        class dialect(Dialect):
            _name = "sniffed"
            lineterminator = '\r\n'
            quoting = QUOTE_MINIMAL
            # escapechar = ''

        dialect.doublequote = doublequote
        dialect.delimiter = delimiter
        # _csv.reader won't accept a quotechar of ''
        dialect.quotechar = quotechar or '"'
        dialect.skipinitialspace = skipinitialspace

        return dialect


    def _guess_quote_and_delimiter(self, data, delimiters):
        """
        Looks for text enclosed between two identical quotes
        (the probable quotechar) which are preceded and followed
        by the same character (the probable delimiter).
        For example:
                         ,'some text',
        The quote with the most wins, same with the delimiter.
        If there is no quotechar the delimiter can't be determined
        this way.
        """

        # The four patterns cover: delim-quoted-delim, line-start-quoted-delim,
        # delim-quoted-line-end, and a fully quoted line.  (The third pattern
        # previously contained a stray '>' inside the delim group, which made
        # it require a literal '>' and silently broke end-of-line detection.)
        matches = []
        for restr in ('(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
                      '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)',   #  ".*?",
                      '(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)',   # ,".*?"
                      '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'):                            #  ".*?" (no delim, no space)
            regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
            matches = regexp.findall(data)
            if matches:
                break

        if not matches:
            # (quotechar, doublequote, delimiter, skipinitialspace)
            return ('', False, None, 0)
        # Tally candidate quote chars, delimiters and leading spaces over
        # all matches; the most frequent of each wins.
        quotes = {}
        delims = {}
        spaces = 0
        for m in matches:
            n = regexp.groupindex['quote'] - 1
            key = m[n]
            if key:
                quotes[key] = quotes.get(key, 0) + 1
            try:
                n = regexp.groupindex['delim'] - 1
                key = m[n]
            except KeyError:
                continue
            if key and (delimiters is None or key in delimiters):
                delims[key] = delims.get(key, 0) + 1
            try:
                n = regexp.groupindex['space'] - 1
            except KeyError:
                continue
            if m[n]:
                spaces += 1

        quotechar = reduce(lambda a, b, quotes = quotes:
                           (quotes[a] > quotes[b]) and a or b, quotes.keys())

        if delims:
            delim = reduce(lambda a, b, delims = delims:
                           (delims[a] > delims[b]) and a or b, delims.keys())
            # If every delimiter was followed by a space, assume
            # skipinitialspace semantics.
            skipinitialspace = delims[delim] == spaces
            if delim == '\n': # most likely a file with a single column
                delim = ''
        else:
            # there is *no* delimiter, it's a single column of quoted data
            delim = ''
            skipinitialspace = 0

        # if we see an extra quote between delimiters, we've got a
        # double quoted format
        dq_regexp = re.compile(
                               r"((%(delim)s)|^)\W*%(quote)s[^%(delim)s\n]*%(quote)s[^%(delim)s\n]*%(quote)s\W*((%(delim)s)|$)" % \
                               {'delim':re.escape(delim), 'quote':quotechar}, re.MULTILINE)



        if dq_regexp.search(data):
            doublequote = True
        else:
            doublequote = False

        return (quotechar, doublequote, delim, skipinitialspace)


    def _guess_delimiter(self, data, delimiters):
        """
        The delimiter /should/ occur the same number of times on
        each row. However, due to malformed data, it may not. We don't want
        an all or nothing approach, so we allow for small variations in this
        number.
          1) build a table of the frequency of each character on every line.
          2) build a table of frequencies of this frequency (meta-frequency?),
             e.g.  'x occurred 5 times in 10 rows, 6 times in 1000 rows,
             7 times in 2 rows'
          3) use the mode of the meta-frequency to determine the /expected/
             frequency for that character
          4) find out how often the character actually meets that goal
          5) the character that best meets its goal is the delimiter
        For performance reasons, the data is evaluated in chunks, so it can
        try and evaluate the smallest portion of the data possible, evaluating
        additional chunks as necessary.
        """

        data = filter(None, data.split('\n'))

        ascii = [chr(c) for c in range(127)] # 7-bit ASCII

        # build frequency tables
        chunkLength = min(10, len(data))
        iteration = 0
        charFrequency = {}
        modes = {}
        delims = {}
        start, end = 0, min(chunkLength, len(data))
        while start < len(data):
            iteration += 1
            for line in data[start:end]:
                for char in ascii:
                    metaFrequency = charFrequency.get(char, {})
                    # must count even if frequency is 0
                    freq = line.count(char)
                    # value is the mode
                    metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
                    charFrequency[char] = metaFrequency

            for char in charFrequency.keys():
                items = charFrequency[char].items()
                if len(items) == 1 and items[0][0] == 0:
                    continue
                # get the mode of the frequencies
                if len(items) > 1:
                    modes[char] = reduce(lambda a, b: a[1] > b[1] and a or b,
                                         items)
                    # adjust the mode - subtract the sum of all
                    # other frequencies
                    items.remove(modes[char])
                    modes[char] = (modes[char][0], modes[char][1]
                                   - reduce(lambda a, b: (0, a[1] + b[1]),
                                            items)[1])
                else:
                    modes[char] = items[0]

            # build a list of possible delimiters
            modeList = modes.items()
            total = float(chunkLength * iteration)
            # (rows of consistent data) / (number of rows) = 100%
            consistency = 1.0
            # minimum consistency threshold
            threshold = 0.9
            while len(delims) == 0 and consistency >= threshold:
                for k, v in modeList:
                    if v[0] > 0 and v[1] > 0:
                        if ((v[1]/total) >= consistency and
                            (delimiters is None or k in delimiters)):
                            delims[k] = v
                consistency -= 0.01

            if len(delims) == 1:
                delim = delims.keys()[0]
                skipinitialspace = (data[0].count(delim) ==
                                    data[0].count("%c " % delim))
                return (delim, skipinitialspace)

            # analyze another chunkLength lines
            start = end
            end += chunkLength

        if not delims:
            return ('', 0)

        # if there's more than one, fall back to a 'preferred' list
        if len(delims) > 1:
            for d in self.preferred:
                if d in delims.keys():
                    skipinitialspace = (data[0].count(d) ==
                                        data[0].count("%c " % d))
                    return (d, skipinitialspace)

        # nothing else indicates a preference, pick the character that
        # dominates(?)
        items = [(v,k) for (k,v) in delims.items()]
        items.sort()
        delim = items[-1][1]

        skipinitialspace = (data[0].count(delim) ==
                            data[0].count("%c " % delim))
        return (delim, skipinitialspace)


    def has_header(self, sample):
        # Creates a dictionary of types of data in each column. If any
        # column is of a single type (say, integers), *except* for the first
        # row, then the first row is presumed to be labels. If the type
        # can't be determined, it is assumed to be a string in which case
        # the length of the string is the determining factor: if all of the
        # rows except for the first are the same length, it's a header.
        # Finally, a 'vote' is taken at the end for each column, adding or
        # subtracting from the likelihood of the first row being a header.

        rdr = reader(StringIO(sample), self.sniff(sample))

        header = rdr.next() # assume first row is header

        columns = len(header)
        columnTypes = {}
        for i in range(columns): columnTypes[i] = None

        checked = 0
        for row in rdr:
            # arbitrary number of rows to check, to keep it sane
            if checked > 20:
                break
            checked += 1

            if len(row) != columns:
                continue # skip rows that have irregular number of columns

            for col in columnTypes.keys():

                for thisType in [int, long, float, complex]:
                    try:
                        thisType(row[col])
                        break
                    except (ValueError, OverflowError):
                        pass
                else:
                    # fallback to length of string
                    thisType = len(row[col])

                # treat longs as ints
                if thisType == long:
                    thisType = int

                if thisType != columnTypes[col]:
                    if columnTypes[col] is None: # add new column type
                        columnTypes[col] = thisType
                    else:
                        # type is inconsistent, remove column from
                        # consideration
                        del columnTypes[col]

        # finally, compare results against first row and "vote"
        # on whether it's a header
        hasHeader = 0
        for col, colType in columnTypes.items():
            if type(colType) == type(0): # it's a length
                if len(header[col]) != colType:
                    hasHeader += 1
                else:
                    hasHeader -= 1
            else: # attempt typecast
                try:
                    colType(header[col])
                except (ValueError, TypeError):
                    hasHeader += 1
                else:
                    hasHeader -= 1

        return hasHeader > 0
Note: See TracBrowser for help on using the repository browser.