source: python/trunk/Lib/csv.py@ 611

Last change on this file since 611 was 391, checked in by dmik, 11 years ago

python: Merge vendor 2.7.6 to trunk.

  • Property svn:eol-style set to native
File size: 16.0 KB
Line 
1
2"""
3csv.py - read/write/investigate CSV files
4"""
5
6import re
7from functools import reduce
8from _csv import Error, __version__, writer, reader, register_dialect, \
9 unregister_dialect, get_dialect, list_dialects, \
10 field_size_limit, \
11 QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
12 __doc__
13from _csv import Dialect as _Dialect
14
15try:
16 from cStringIO import StringIO
17except ImportError:
18 from StringIO import StringIO
19
20__all__ = [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
21 "Error", "Dialect", "__doc__", "excel", "excel_tab",
22 "field_size_limit", "reader", "writer",
23 "register_dialect", "get_dialect", "list_dialects", "Sniffer",
24 "unregister_dialect", "__version__", "DictReader", "DictWriter" ]
25
class Dialect:
    """Describe an Excel dialect.

    This must be subclassed (see csv.excel). Valid attributes are:
    delimiter, quotechar, escapechar, doublequote, skipinitialspace,
    lineterminator, quoting.

    """
    _name = ""
    _valid = False
    # placeholders
    delimiter = None
    quotechar = None
    escapechar = None
    doublequote = None
    skipinitialspace = None
    lineterminator = None
    quoting = None

    def __init__(self):
        # Only subclasses (which are expected to fill in the attributes
        # above) are considered valid; instantiating Dialect directly
        # leaves _valid False and fails validation below.
        if self.__class__ != Dialect:
            self._valid = True
        self._validate()

    def _validate(self):
        # Delegate attribute checking to the C implementation.
        try:
            _Dialect(self)
        except TypeError as e:
            # We do this for compatibility with py2.3
            raise Error(str(e))
56
class excel(Dialect):
    """Dialect matching the CSV conventions of Microsoft Excel."""
    lineterminator = '\r\n'
    quoting = QUOTE_MINIMAL
    delimiter = ','
    quotechar = '"'
    doublequote = True
    skipinitialspace = False
register_dialect("excel", excel)
66
class excel_tab(excel):
    """Excel dialect variant for TAB-delimited (TSV) export files."""
    delimiter = '\t'
register_dialect("excel-tab", excel_tab)
71
72
class DictReader:
    """Read rows from *f* as dictionaries keyed by *fieldnames*."""

    def __init__(self, f, fieldnames=None, restkey=None, restval=None,
                 dialect="excel", *args, **kwds):
        self.restkey = restkey          # key under which overflow cells land
        self.restval = restval          # filler value for missing cells
        self._fieldnames = fieldnames   # list of keys for the dict
        self.reader = reader(f, dialect, *args, **kwds)
        self.dialect = dialect
        self.line_num = 0

    def __iter__(self):
        return self

    @property
    def fieldnames(self):
        # Lazily consume the first row as the header the first time the
        # field names are requested (unless they were supplied up front).
        if self._fieldnames is None:
            try:
                self._fieldnames = self.reader.next()
            except StopIteration:
                pass
        self.line_num = self.reader.line_num
        return self._fieldnames

    @fieldnames.setter
    def fieldnames(self, value):
        self._fieldnames = value

    def next(self):
        if self.line_num == 0:
            # Accessed purely for its lazy-header side effect.
            self.fieldnames
        row = self.reader.next()
        self.line_num = self.reader.line_num

        # Unlike the basic reader we skip blank rows entirely, since they
        # would otherwise yield a dict full of None values.
        while row == []:
            row = self.reader.next()
        result = dict(zip(self.fieldnames, row))
        n_fields = len(self.fieldnames)
        n_cells = len(row)
        if n_cells > n_fields:
            # Long row: stash the overflow under restkey.
            result[self.restkey] = row[n_fields:]
        elif n_cells < n_fields:
            # Short row: pad the missing keys with restval.
            for missing in self.fieldnames[n_cells:]:
                result[missing] = self.restval
        return result
121
122
class DictWriter:
    """Write dictionaries to *f* as CSV rows ordered by *fieldnames*."""

    def __init__(self, f, fieldnames, restval="", extrasaction="raise",
                 dialect="excel", *args, **kwds):
        self.fieldnames = fieldnames    # determines column order
        self.restval = restval          # substituted for missing keys
        if extrasaction.lower() not in ("raise", "ignore"):
            raise ValueError(
                "extrasaction (%s) must be 'raise' or 'ignore'" %
                extrasaction)
        self.extrasaction = extrasaction
        self.writer = writer(f, dialect, *args, **kwds)

    def writeheader(self):
        # The header row is simply a row whose values are the field names.
        self.writerow(dict(zip(self.fieldnames, self.fieldnames)))

    def _dict_to_list(self, rowdict):
        # Flatten rowdict into a list ordered by fieldnames; optionally
        # reject keys that are not declared field names.
        if self.extrasaction == "raise":
            unknown = [key for key in rowdict if key not in self.fieldnames]
            if unknown:
                raise ValueError("dict contains fields not in fieldnames: " +
                                 ", ".join(unknown))
        return [rowdict.get(field, self.restval) for field in self.fieldnames]

    def writerow(self, rowdict):
        return self.writer.writerow(self._dict_to_list(rowdict))

    def writerows(self, rowdicts):
        return self.writer.writerows(
            [self._dict_to_list(rowdict) for rowdict in rowdicts])
155
# Guard Sniffer's type checking against builds that exclude complex()
try:
    complex
except NameError:
    # Some minimal builds omit complex(); fall back to float so the
    # numeric probing in Sniffer.has_header (which casts cells through
    # [int, long, float, complex]) keeps working.
    complex = float
161
class Sniffer:
    '''
    "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
    Returns a Dialect object.
    '''
    def __init__(self):
        # in case there is more than one possible delimiter
        self.preferred = [',', '\t', ';', ' ', ':']


    def sniff(self, sample, delimiters=None):
        """
        Returns a dialect (or None) corresponding to the sample
        """

        # First try the quote-based heuristic; fall back to the
        # frequency-based one when no quoted fields are found.
        quotechar, doublequote, delimiter, skipinitialspace = \
                   self._guess_quote_and_delimiter(sample, delimiters)
        if not delimiter:
            delimiter, skipinitialspace = self._guess_delimiter(sample,
                                                                delimiters)

        if not delimiter:
            raise Error("Could not determine delimiter")

        class dialect(Dialect):
            _name = "sniffed"
            lineterminator = '\r\n'
            quoting = QUOTE_MINIMAL
            # escapechar = ''

        dialect.doublequote = doublequote
        dialect.delimiter = delimiter
        # _csv.reader won't accept a quotechar of ''
        dialect.quotechar = quotechar or '"'
        dialect.skipinitialspace = skipinitialspace

        return dialect


    def _guess_quote_and_delimiter(self, data, delimiters):
        """
        Looks for text enclosed between two identical quotes
        (the probable quotechar) which are preceded and followed
        by the same character (the probable delimiter).
        For example:
                         ,'some text',
        The quote with the most wins, same with the delimiter.
        If there is no quotechar the delimiter can't be determined
        this way.
        """

        # The four patterns cover: delim-quoted-delim, line-start-quoted-delim,
        # delim-quoted-line-end, and a fully quoted line.  (The third pattern
        # previously contained a stray '>' inside the delim group, which made
        # it require a literal '>' and silently broke end-of-line detection.)
        matches = []
        for restr in ('(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
                      '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)',   #  ".*?",
                      '(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)',   # ,".*?"
                      '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'):                            #  ".*?" (no delim, no space)
            regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
            matches = regexp.findall(data)
            if matches:
                break

        if not matches:
            # (quotechar, doublequote, delimiter, skipinitialspace)
            return ('', False, None, 0)
        # Tally candidate quote chars, delimiters and leading spaces over
        # all matches; the most frequent of each wins.
        quotes = {}
        delims = {}
        spaces = 0
        for m in matches:
            n = regexp.groupindex['quote'] - 1
            key = m[n]
            if key:
                quotes[key] = quotes.get(key, 0) + 1
            try:
                n = regexp.groupindex['delim'] - 1
                key = m[n]
            except KeyError:
                continue
            if key and (delimiters is None or key in delimiters):
                delims[key] = delims.get(key, 0) + 1
            try:
                n = regexp.groupindex['space'] - 1
            except KeyError:
                continue
            if m[n]:
                spaces += 1

        quotechar = reduce(lambda a, b, quotes = quotes:
                           (quotes[a] > quotes[b]) and a or b, quotes.keys())

        if delims:
            delim = reduce(lambda a, b, delims = delims:
                           (delims[a] > delims[b]) and a or b, delims.keys())
            # If every delimiter was followed by a space, assume
            # skipinitialspace semantics.
            skipinitialspace = delims[delim] == spaces
            if delim == '\n': # most likely a file with a single column
                delim = ''
        else:
            # there is *no* delimiter, it's a single column of quoted data
            delim = ''
            skipinitialspace = 0

        # if we see an extra quote between delimiters, we've got a
        # double quoted format
        dq_regexp = re.compile(
                               r"((%(delim)s)|^)\W*%(quote)s[^%(delim)s\n]*%(quote)s[^%(delim)s\n]*%(quote)s\W*((%(delim)s)|$)" % \
                               {'delim':re.escape(delim), 'quote':quotechar}, re.MULTILINE)



        if dq_regexp.search(data):
            doublequote = True
        else:
            doublequote = False

        return (quotechar, doublequote, delim, skipinitialspace)


    def _guess_delimiter(self, data, delimiters):
        """
        The delimiter /should/ occur the same number of times on
        each row. However, due to malformed data, it may not. We don't want
        an all or nothing approach, so we allow for small variations in this
        number.
          1) build a table of the frequency of each character on every line.
          2) build a table of frequencies of this frequency (meta-frequency?),
             e.g.  'x occurred 5 times in 10 rows, 6 times in 1000 rows,
             7 times in 2 rows'
          3) use the mode of the meta-frequency to determine the /expected/
             frequency for that character
          4) find out how often the character actually meets that goal
          5) the character that best meets its goal is the delimiter
        For performance reasons, the data is evaluated in chunks, so it can
        try and evaluate the smallest portion of the data possible, evaluating
        additional chunks as necessary.
        """

        data = filter(None, data.split('\n'))

        ascii = [chr(c) for c in range(127)] # 7-bit ASCII

        # build frequency tables
        chunkLength = min(10, len(data))
        iteration = 0
        charFrequency = {}
        modes = {}
        delims = {}
        start, end = 0, min(chunkLength, len(data))
        while start < len(data):
            iteration += 1
            for line in data[start:end]:
                for char in ascii:
                    metaFrequency = charFrequency.get(char, {})
                    # must count even if frequency is 0
                    freq = line.count(char)
                    # value is the mode
                    metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
                    charFrequency[char] = metaFrequency

            for char in charFrequency.keys():
                items = charFrequency[char].items()
                if len(items) == 1 and items[0][0] == 0:
                    continue
                # get the mode of the frequencies
                if len(items) > 1:
                    modes[char] = reduce(lambda a, b: a[1] > b[1] and a or b,
                                         items)
                    # adjust the mode - subtract the sum of all
                    # other frequencies
                    items.remove(modes[char])
                    modes[char] = (modes[char][0], modes[char][1]
                                   - reduce(lambda a, b: (0, a[1] + b[1]),
                                            items)[1])
                else:
                    modes[char] = items[0]

            # build a list of possible delimiters
            modeList = modes.items()
            total = float(chunkLength * iteration)
            # (rows of consistent data) / (number of rows) = 100%
            consistency = 1.0
            # minimum consistency threshold
            threshold = 0.9
            while len(delims) == 0 and consistency >= threshold:
                for k, v in modeList:
                    if v[0] > 0 and v[1] > 0:
                        if ((v[1]/total) >= consistency and
                            (delimiters is None or k in delimiters)):
                            delims[k] = v
                consistency -= 0.01

            if len(delims) == 1:
                delim = delims.keys()[0]
                skipinitialspace = (data[0].count(delim) ==
                                    data[0].count("%c " % delim))
                return (delim, skipinitialspace)

            # analyze another chunkLength lines
            start = end
            end += chunkLength

        if not delims:
            return ('', 0)

        # if there's more than one, fall back to a 'preferred' list
        if len(delims) > 1:
            for d in self.preferred:
                if d in delims.keys():
                    skipinitialspace = (data[0].count(d) ==
                                        data[0].count("%c " % d))
                    return (d, skipinitialspace)

        # nothing else indicates a preference, pick the character that
        # dominates(?)
        items = [(v,k) for (k,v) in delims.items()]
        items.sort()
        delim = items[-1][1]

        skipinitialspace = (data[0].count(delim) ==
                            data[0].count("%c " % delim))
        return (delim, skipinitialspace)


    def has_header(self, sample):
        # Creates a dictionary of types of data in each column. If any
        # column is of a single type (say, integers), *except* for the first
        # row, then the first row is presumed to be labels. If the type
        # can't be determined, it is assumed to be a string in which case
        # the length of the string is the determining factor: if all of the
        # rows except for the first are the same length, it's a header.
        # Finally, a 'vote' is taken at the end for each column, adding or
        # subtracting from the likelihood of the first row being a header.

        rdr = reader(StringIO(sample), self.sniff(sample))

        header = rdr.next() # assume first row is header

        columns = len(header)
        columnTypes = {}
        for i in range(columns): columnTypes[i] = None

        checked = 0
        for row in rdr:
            # arbitrary number of rows to check, to keep it sane
            if checked > 20:
                break
            checked += 1

            if len(row) != columns:
                continue # skip rows that have irregular number of columns

            for col in columnTypes.keys():

                for thisType in [int, long, float, complex]:
                    try:
                        thisType(row[col])
                        break
                    except (ValueError, OverflowError):
                        pass
                else:
                    # fallback to length of string
                    thisType = len(row[col])

                # treat longs as ints
                if thisType == long:
                    thisType = int

                if thisType != columnTypes[col]:
                    if columnTypes[col] is None: # add new column type
                        columnTypes[col] = thisType
                    else:
                        # type is inconsistent, remove column from
                        # consideration
                        del columnTypes[col]

        # finally, compare results against first row and "vote"
        # on whether it's a header
        hasHeader = 0
        for col, colType in columnTypes.items():
            if type(colType) == type(0): # it's a length
                if len(header[col]) != colType:
                    hasHeader += 1
                else:
                    hasHeader -= 1
            else: # attempt typecast
                try:
                    colType(header[col])
                except (ValueError, TypeError):
                    hasHeader += 1
                else:
                    hasHeader -= 1

        return hasHeader > 0
Note: See TracBrowser for help on using the repository browser.