[2] | 1 |
|
---|
| 2 | """
|
---|
| 3 | csv.py - read/write/investigate CSV files
|
---|
| 4 | """
|
---|
| 5 |
|
---|
| 6 | import re
|
---|
| 7 | from functools import reduce
|
---|
| 8 | from _csv import Error, __version__, writer, reader, register_dialect, \
|
---|
| 9 | unregister_dialect, get_dialect, list_dialects, \
|
---|
| 10 | field_size_limit, \
|
---|
| 11 | QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
|
---|
| 12 | __doc__
|
---|
| 13 | from _csv import Dialect as _Dialect
|
---|
| 14 |
|
---|
| 15 | try:
|
---|
| 16 | from cStringIO import StringIO
|
---|
| 17 | except ImportError:
|
---|
| 18 | from StringIO import StringIO
|
---|
| 19 |
|
---|
| 20 | __all__ = [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
|
---|
| 21 | "Error", "Dialect", "__doc__", "excel", "excel_tab",
|
---|
| 22 | "field_size_limit", "reader", "writer",
|
---|
| 23 | "register_dialect", "get_dialect", "list_dialects", "Sniffer",
|
---|
| 24 | "unregister_dialect", "__version__", "DictReader", "DictWriter" ]
|
---|
| 25 |
|
---|
class Dialect:
    """Describe an Excel dialect.

    This must be subclassed (see csv.excel).  Valid attributes are:
    delimiter, quotechar, escapechar, doublequote, skipinitialspace,
    lineterminator, quoting.

    """
    _name = ""
    _valid = False
    # placeholders
    delimiter = None
    quotechar = None
    escapechar = None
    doublequote = None
    skipinitialspace = None
    lineterminator = None
    quoting = None

    def __init__(self):
        # The base class itself is deliberately unusable: only subclasses
        # (which override the placeholder attributes) are marked valid.
        if self.__class__ != Dialect:
            self._valid = True
        self._validate()

    def _validate(self):
        """Raise csv.Error if the attribute set is rejected by _csv."""
        try:
            # Let the C implementation perform the real attribute checks.
            _Dialect(self)
        except TypeError as e:
            # 'as' form works on Python 2.6+ and 3.x, unlike 'except X, e'.
            # We re-raise as Error for compatibility with py2.3 callers.
            raise Error(str(e))
|
---|
class excel(Dialect):
    """Describe the usual properties of Excel-generated CSV files."""
    delimiter = ','
    quotechar = '"'
    doublequote = True          # a '"' inside a field is written as '""'
    skipinitialspace = False
    lineterminator = '\r\n'     # Excel always uses DOS line endings
    quoting = QUOTE_MINIMAL     # quote only fields that require it
register_dialect("excel", excel)
|
---|
class excel_tab(excel):
    """Describe the usual properties of Excel-generated TAB-delimited files."""
    # Identical to the "excel" dialect except for the field separator.
    delimiter = '\t'
register_dialect("excel-tab", excel_tab)
| 72 |
|
---|
class DictReader:
    """Read CSV rows as dictionaries keyed by the field names.

    f           -- any iterable of CSV-formatted lines (e.g. a file object)
    fieldnames  -- sequence of keys; if omitted, the first row is used
    restkey     -- key under which extra values from long rows are stored
    restval     -- value used to pad short rows
    dialect     -- dialect name or object handed to the underlying reader
    Remaining positional/keyword args are passed through to reader().
    """
    def __init__(self, f, fieldnames=None, restkey=None, restval=None,
                 dialect="excel", *args, **kwds):
        self._fieldnames = fieldnames    # list of keys for the dict
        self.restkey = restkey           # key to catch long rows
        self.restval = restval           # default value for short rows
        self.reader = reader(f, dialect, *args, **kwds)
        self.dialect = dialect
        self.line_num = 0

    def __iter__(self):
        return self

    @property
    def fieldnames(self):
        # Lazily consume the header row the first time the names are needed.
        if self._fieldnames is None:
            try:
                # next() builtin (2.6+) works on both Python 2 and 3,
                # unlike the py2-only self.reader.next().
                self._fieldnames = next(self.reader)
            except StopIteration:
                pass
        self.line_num = self.reader.line_num
        return self._fieldnames

    @fieldnames.setter
    def fieldnames(self, value):
        self._fieldnames = value

    def next(self):
        """Return the next row as a dict, skipping blank rows."""
        if self.line_num == 0:
            # Used only for its side effect of consuming the header row.
            self.fieldnames
        row = next(self.reader)
        self.line_num = self.reader.line_num

        # unlike the basic reader, we prefer not to return blanks,
        # because we will typically wind up with a dict full of None
        # values
        while row == []:
            row = next(self.reader)
        d = dict(zip(self.fieldnames, row))
        lf = len(self.fieldnames)
        lr = len(row)
        if lf < lr:
            # Long row: stash the extra values under restkey.
            d[self.restkey] = row[lf:]
        elif lf > lr:
            # Short row: pad the missing keys with restval.
            for key in self.fieldnames[lr:]:
                d[key] = self.restval
        return d

    # Alias so instances also satisfy the Python 3 iterator protocol;
    # harmless extra attribute on Python 2.
    __next__ = next
|
---|
| 122 |
|
---|
class DictWriter:
    """Write dictionaries as CSV rows in a fixed field order.

    f            -- any object with a write() method
    fieldnames   -- sequence of keys determining column order
    restval      -- value written for keys missing from a row dict
    extrasaction -- "raise" (default) or "ignore" for unexpected keys
    dialect      -- dialect name or object handed to the underlying writer
    Remaining positional/keyword args are passed through to writer().
    """
    def __init__(self, f, fieldnames, restval="", extrasaction="raise",
                 dialect="excel", *args, **kwds):
        self.fieldnames = fieldnames    # list of keys for the dict
        self.restval = restval          # for writing short dicts
        if extrasaction.lower() not in ("raise", "ignore"):
            # Call form works on Python 2.6+ and 3.x, unlike the
            # py2-only 'raise ValueError, expr' statement syntax.
            raise ValueError(
                "extrasaction (%s) must be 'raise' or 'ignore'" %
                extrasaction)
        self.extrasaction = extrasaction
        self.writer = writer(f, dialect, *args, **kwds)

    def writeheader(self):
        """Write a header row containing the field names themselves."""
        header = dict(zip(self.fieldnames, self.fieldnames))
        self.writerow(header)

    def _dict_to_list(self, rowdict):
        # Convert a row dict to a list ordered by self.fieldnames,
        # policing unexpected keys according to extrasaction.
        if self.extrasaction == "raise":
            wrong_fields = [k for k in rowdict if k not in self.fieldnames]
            if wrong_fields:
                raise ValueError("dict contains fields not in fieldnames: " +
                                 ", ".join(wrong_fields))
        return [rowdict.get(key, self.restval) for key in self.fieldnames]

    def writerow(self, rowdict):
        return self.writer.writerow(self._dict_to_list(rowdict))

    def writerows(self, rowdicts):
        rows = []
        for rowdict in rowdicts:
            rows.append(self._dict_to_list(rowdict))
        return self.writer.writerows(rows)
|
---|
# Guard Sniffer's type checking against builds that exclude complex()
try:
    complex
except NameError:
    # Degrade gracefully: treat complex-looking values as floats instead.
    complex = float
|
---|
class Sniffer:
    '''
    "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
    Returns a Dialect object.
    '''
    def __init__(self):
        # in case there is more than one possible delimiter
        self.preferred = [',', '\t', ';', ' ', ':']


    def sniff(self, sample, delimiters=None):
        """
        Returns a dialect (or None) corresponding to the sample

        delimiters, if given, restricts the guess to that set of
        candidate delimiter characters.  Raises Error when no
        delimiter can be determined.
        """

        quotechar, doublequote, delimiter, skipinitialspace = \
                   self._guess_quote_and_delimiter(sample, delimiters)
        if not delimiter:
            # No quoted fields found; fall back to character statistics.
            delimiter, skipinitialspace = self._guess_delimiter(sample,
                                                                delimiters)

        if not delimiter:
            raise Error("Could not determine delimiter")

        class dialect(Dialect):
            _name = "sniffed"
            lineterminator = '\r\n'
            quoting = QUOTE_MINIMAL
            # escapechar = ''

        dialect.doublequote = doublequote
        dialect.delimiter = delimiter
        # _csv.reader won't accept a quotechar of ''
        dialect.quotechar = quotechar or '"'
        dialect.skipinitialspace = skipinitialspace

        return dialect


    def _guess_quote_and_delimiter(self, data, delimiters):
        """
        Looks for text enclosed between two identical quotes
        (the probable quotechar) which are preceded and followed
        by the same character (the probable delimiter).
        For example:
                         ,'some text',
        The quote with the most wins, same with the delimiter.
        If there is no quotechar the delimiter can't be determined
        this way.

        Returns a (quotechar, doublequote, delimiter, skipinitialspace)
        tuple; quotechar is '' and delimiter is None/'' when undetermined.
        """

        matches = []
        # Try progressively looser patterns: quoted field with a delimiter
        # on both sides, then only before, then only after, then a bare
        # quoted field occupying a whole line.
        # NOTE: the third pattern historically contained a stray '>' after
        # (?P<delim> which made it unmatchable; fixed here.
        for restr in (r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
                      r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)',   #  ".*?",
                      r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)',   # ,".*?"
                      r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'):                            #  ".*?" (no delim, no space)
            regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
            matches = regexp.findall(data)
            if matches:
                break

        if not matches:
            # (quotechar, doublequote, delimiter, skipinitialspace)
            return ('', False, None, 0)

        # Tally how often each quote character and each delimiter occurs.
        quotes = {}
        delims = {}
        spaces = 0
        for m in matches:
            n = regexp.groupindex['quote'] - 1
            key = m[n]
            if key:
                quotes[key] = quotes.get(key, 0) + 1
            try:
                n = regexp.groupindex['delim'] - 1
                key = m[n]
            except KeyError:
                continue
            if key and (delimiters is None or key in delimiters):
                delims[key] = delims.get(key, 0) + 1
            try:
                n = regexp.groupindex['space'] - 1
            except KeyError:
                continue
            if m[n]:
                spaces += 1

        # The most frequently seen quote character wins.
        quotechar = reduce(lambda a, b, quotes = quotes:
                           (quotes[a] > quotes[b]) and a or b, quotes.keys())

        if delims:
            # The most frequently seen delimiter wins.
            delim = reduce(lambda a, b, delims = delims:
                           (delims[a] > delims[b]) and a or b, delims.keys())
            skipinitialspace = delims[delim] == spaces
            if delim == '\n': # most likely a file with a single column
                delim = ''
        else:
            # there is *no* delimiter, it's a single column of quoted data
            delim = ''
            skipinitialspace = 0

        # if we see an extra quote between delimiters, we've got a
        # double quoted format
        dq_regexp = re.compile(
                               r"((%(delim)s)|^)\W*%(quote)s[^%(delim)s\n]*%(quote)s[^%(delim)s\n]*%(quote)s\W*((%(delim)s)|$)" % \
                               {'delim':re.escape(delim), 'quote':quotechar}, re.MULTILINE)

        if dq_regexp.search(data):
            doublequote = True
        else:
            doublequote = False

        return (quotechar, doublequote, delim, skipinitialspace)


    def _guess_delimiter(self, data, delimiters):
        """
        The delimiter /should/ occur the same number of times on
        each row. However, due to malformed data, it may not. We don't want
        an all or nothing approach, so we allow for small variations in this
        number.
          1) build a table of the frequency of each character on every line.
          2) build a table of frequencies of this frequency (meta-frequency?),
             e.g.  'x occurred 5 times in 10 rows, 6 times in 1000 rows,
             7 times in 2 rows'
          3) use the mode of the meta-frequency to determine the /expected/
             frequency for that character
          4) find out how often the character actually meets that goal
          5) the character that best meets its goal is the delimiter
        For performance reasons, the data is evaluated in chunks, so it can
        try and evaluate the smallest portion of the data possible, evaluating
        additional chunks as necessary.
        """

        # Drop empty lines; this comprehension behaves identically on
        # Python 2 and 3 (filter() returns an iterator on 3.x).
        data = [line for line in data.split('\n') if line]

        # Candidate delimiter characters (renamed from 'ascii' to avoid
        # shadowing the builtin).
        ascii_chars = [chr(c) for c in range(127)] # 7-bit ASCII

        # build frequency tables
        chunkLength = min(10, len(data))
        iteration = 0
        charFrequency = {}
        modes = {}
        delims = {}
        start, end = 0, min(chunkLength, len(data))
        while start < len(data):
            iteration += 1
            for line in data[start:end]:
                for char in ascii_chars:
                    metaFrequency = charFrequency.get(char, {})
                    # must count even if frequency is 0
                    freq = line.count(char)
                    # value is the mode
                    metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
                    charFrequency[char] = metaFrequency

            for char in charFrequency:
                # list() so .remove() below works on Python 3 too
                items = list(charFrequency[char].items())
                if len(items) == 1 and items[0][0] == 0:
                    continue
                # get the mode of the frequencies
                if len(items) > 1:
                    modes[char] = reduce(lambda a, b: a[1] > b[1] and a or b,
                                         items)
                    # adjust the mode - subtract the sum of all
                    # other frequencies
                    items.remove(modes[char])
                    modes[char] = (modes[char][0], modes[char][1]
                                   - reduce(lambda a, b: (0, a[1] + b[1]),
                                            items)[1])
                else:
                    modes[char] = items[0]

            # build a list of possible delimiters
            modeList = list(modes.items())
            total = float(chunkLength * iteration)
            # (rows of consistent data) / (number of rows) = 100%
            consistency = 1.0
            # minimum consistency threshold
            threshold = 0.9
            while len(delims) == 0 and consistency >= threshold:
                for k, v in modeList:
                    if v[0] > 0 and v[1] > 0:
                        if ((v[1]/total) >= consistency and
                            (delimiters is None or k in delimiters)):
                            delims[k] = v
                consistency -= 0.01

            if len(delims) == 1:
                # next(iter(...)) works on both 2.6+ and 3.x, unlike
                # the py2-only delims.keys()[0].
                delim = next(iter(delims))
                skipinitialspace = (data[0].count(delim) ==
                                    data[0].count("%c " % delim))
                return (delim, skipinitialspace)

            # analyze another chunkLength lines
            start = end
            end += chunkLength

        if not delims:
            return ('', 0)

        # if there's more than one, fall back to a 'preferred' list
        if len(delims) > 1:
            for d in self.preferred:
                if d in delims.keys():
                    skipinitialspace = (data[0].count(d) ==
                                        data[0].count("%c " % d))
                    return (d, skipinitialspace)

        # nothing else indicates a preference, pick the character that
        # dominates(?)
        items = [(v,k) for (k,v) in delims.items()]
        items.sort()
        delim = items[-1][1]

        skipinitialspace = (data[0].count(delim) ==
                            data[0].count("%c " % delim))
        return (delim, skipinitialspace)


    def has_header(self, sample):
        # Creates a dictionary of types of data in each column. If any
        # column is of a single type (say, integers), *except* for the first
        # row, then the first row is presumed to be labels. If the type
        # can't be determined, it is assumed to be a string in which case
        # the length of the string is the determining factor: if all of the
        # rows except for the first are the same length, it's a header.
        # Finally, a 'vote' is taken at the end for each column, adding or
        # subtracting from the likelihood of the first row being a header.

        rdr = reader(StringIO(sample), self.sniff(sample))

        header = next(rdr) # assume first row is header

        columns = len(header)
        columnTypes = {}
        for i in range(columns): columnTypes[i] = None

        checked = 0
        for row in rdr:
            # arbitrary number of rows to check, to keep it sane
            if checked > 20:
                break
            checked += 1

            if len(row) != columns:
                continue # skip rows that have irregular number of columns

            # iterate over a copy: entries may be deleted below
            for col in list(columnTypes.keys()):

                # int() auto-promotes to long on Python 2, so probing
                # int, float and complex covers all numeric forms; a
                # separate long probe (and long -> int normalization)
                # is redundant.
                for thisType in [int, float, complex]:
                    try:
                        thisType(row[col])
                        break
                    except (ValueError, OverflowError):
                        pass
                else:
                    # fallback to length of string
                    thisType = len(row[col])

                if thisType != columnTypes[col]:
                    if columnTypes[col] is None: # add new column type
                        columnTypes[col] = thisType
                    else:
                        # type is inconsistent, remove column from
                        # consideration
                        del columnTypes[col]

        # finally, compare results against first row and "vote"
        # on whether it's a header
        hasHeader = 0
        for col, colType in columnTypes.items():
            if type(colType) == type(0): # it's a length
                if len(header[col]) != colType:
                    hasHeader += 1
                else:
                    hasHeader -= 1
            else: # attempt typecast
                try:
                    colType(header[col])
                except (ValueError, TypeError):
                    hasHeader += 1
                else:
                    hasHeader -= 1

        return hasHeader > 0