1 |
|
---|
2 | """
|
---|
3 | csv.py - read/write/investigate CSV files
|
---|
4 | """
|
---|
5 |
|
---|
6 | import re
|
---|
7 | from functools import reduce
|
---|
8 | from _csv import Error, __version__, writer, reader, register_dialect, \
|
---|
9 | unregister_dialect, get_dialect, list_dialects, \
|
---|
10 | field_size_limit, \
|
---|
11 | QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
|
---|
12 | __doc__
|
---|
13 | from _csv import Dialect as _Dialect
|
---|
14 |
|
---|
15 | try:
|
---|
16 | from cStringIO import StringIO
|
---|
17 | except ImportError:
|
---|
18 | from StringIO import StringIO
|
---|
19 |
|
---|
# Public names re-exported by this module: the C-level primitives from
# _csv plus the pure-Python helper classes defined below.
__all__ = [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
            "Error", "Dialect", "__doc__", "excel", "excel_tab",
            "field_size_limit", "reader", "writer",
            "register_dialect", "get_dialect", "list_dialects", "Sniffer",
            "unregister_dialect", "__version__", "DictReader", "DictWriter" ]
|
---|
25 |
|
---|
class Dialect:
    """Describe an Excel dialect.

    This must be subclassed (see csv.excel).  Valid attributes are:
    delimiter, quotechar, escapechar, doublequote, skipinitialspace,
    lineterminator, quoting.

    """
    _name = ""          # registry name; "" for the abstract base class
    _valid = False      # set True once a subclass instance passes validation
    # placeholders -- concrete subclasses are expected to override these
    delimiter = None
    quotechar = None
    escapechar = None
    doublequote = None
    skipinitialspace = None
    lineterminator = None
    quoting = None

    def __init__(self):
        # Only concrete subclasses are validated; instantiating the base
        # class directly leaves _valid False.
        if self.__class__ != Dialect:
            self._valid = True
            self._validate()

    def _validate(self):
        """Raise csv.Error if the attributes do not form a usable dialect.

        The actual checking is delegated to the C implementation.
        """
        try:
            _Dialect(self)
        except TypeError as e:
            # "except ... as" is valid on Python 2.6+ and Python 3, unlike
            # the old comma form.  We re-raise as csv.Error for
            # compatibility with py2.3.
            raise Error(str(e))
|
---|
56 |
|
---|
class excel(Dialect):
    """Describe the usual properties of Excel-generated CSV files."""
    delimiter = ','           # fields are comma-separated
    quotechar = '"'           # fields are quoted with double quotes
    doublequote = True        # embedded quotechars are doubled, not escaped
    skipinitialspace = False  # whitespace after a delimiter is significant
    lineterminator = '\r\n'   # Excel writes DOS line endings
    quoting = QUOTE_MINIMAL   # quote only fields that require it
register_dialect("excel", excel)
|
---|
66 |
|
---|
class excel_tab(excel):
    """Describe the usual properties of Excel-generated TAB-delimited files."""
    # Identical to the "excel" dialect except for the field separator.
    delimiter = '\t'
register_dialect("excel-tab", excel_tab)
|
---|
71 |
|
---|
72 |
|
---|
class DictReader:
    """Iterate over CSV rows as dictionaries keyed by field name.

    Wraps a csv.reader and maps each parsed row onto the supplied (or
    lazily-read) fieldnames.  Surplus values in a row are collected in a
    list under ``restkey``; missing values are filled with ``restval``.
    """

    def __init__(self, f, fieldnames=None, restkey=None, restval=None,
                 dialect="excel", *args, **kwds):
        self._fieldnames = fieldnames   # list of keys for the dict
        self.restkey = restkey          # key to catch long rows
        self.restval = restval          # default value for short rows
        self.reader = reader(f, dialect, *args, **kwds)
        self.dialect = dialect
        self.line_num = 0

    def __iter__(self):
        return self

    @property
    def fieldnames(self):
        # Read the header row lazily the first time the names are needed;
        # a file that is already exhausted simply leaves them as None.
        if self._fieldnames is None:
            try:
                self._fieldnames = self.reader.next()
            except StopIteration:
                pass
        self.line_num = self.reader.line_num
        return self._fieldnames

    @fieldnames.setter
    def fieldnames(self, value):
        self._fieldnames = value

    def next(self):
        if self.line_num == 0:
            # Accessed purely for its side effect of consuming the header.
            self.fieldnames
        row = self.reader.next()
        self.line_num = self.reader.line_num

        # Unlike the basic reader, skip blank rows entirely: returning
        # them would just produce a dict full of None values.
        while not row:
            row = self.reader.next()

        result = dict(zip(self.fieldnames, row))
        num_fields = len(self.fieldnames)
        num_values = len(row)
        if num_fields < num_values:
            # Row was too long: stash the overflow under restkey.
            result[self.restkey] = row[num_fields:]
        elif num_values < num_fields:
            # Row was too short: pad the remaining keys with restval.
            for key in self.fieldnames[num_values:]:
                result[key] = self.restval
        return result
|
---|
121 |
|
---|
122 |
|
---|
class DictWriter:
    """Write dictionaries to a CSV file, one row per dict.

    Values are emitted in the order given by ``fieldnames``.  Keys missing
    from a row dict are written as ``restval``; keys that are not listed in
    ``fieldnames`` either raise ValueError or are silently dropped,
    depending on ``extrasaction`` ("raise" or "ignore").
    """

    def __init__(self, f, fieldnames, restval="", extrasaction="raise",
                 dialect="excel", *args, **kwds):
        self.fieldnames = fieldnames    # list of keys for the dict
        self.restval = restval          # for writing short dicts
        if extrasaction.lower() not in ("raise", "ignore"):
            # raise-with-instance form: valid on both Python 2.6+ and 3,
            # unlike the old "raise ValueError, ..." statement syntax.
            raise ValueError(
                "extrasaction (%s) must be 'raise' or 'ignore'" %
                extrasaction)
        self.extrasaction = extrasaction
        self.writer = writer(f, dialect, *args, **kwds)

    def _dict_to_list(self, rowdict):
        """Return rowdict's values as a list ordered by self.fieldnames.

        Raises ValueError for unexpected keys when extrasaction == "raise".
        """
        if self.extrasaction == "raise":
            wrong_fields = [k for k in rowdict if k not in self.fieldnames]
            if wrong_fields:
                raise ValueError("dict contains fields not in fieldnames: " +
                                 ", ".join(wrong_fields))
        return [rowdict.get(key, self.restval) for key in self.fieldnames]

    def writerow(self, rowdict):
        """Write a single row dict via the underlying writer."""
        return self.writer.writerow(self._dict_to_list(rowdict))

    def writerows(self, rowdicts):
        """Write a sequence of row dicts via the underlying writer."""
        return self.writer.writerows([self._dict_to_list(rowdict)
                                      for rowdict in rowdicts])
|
---|
151 |
|
---|
# Guard Sniffer's type checking against builds that exclude complex()
try:
    complex
except NameError:
    # Fall back to float so has_header()'s type-probe list still works.
    complex = float
|
---|
157 |
|
---|
class Sniffer:
    '''
    "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
    Returns a Dialect object.
    '''
    def __init__(self):
        # in case there is more than one possible delimiter
        self.preferred = [',', '\t', ';', ' ', ':']


    def sniff(self, sample, delimiters=None):
        """
        Returns a dialect (or None) corresponding to the sample
        """

        quotechar, delimiter, skipinitialspace = \
                   self._guess_quote_and_delimiter(sample, delimiters)
        if not delimiter:
            delimiter, skipinitialspace = self._guess_delimiter(sample,
                                                                delimiters)

        if not delimiter:
            # raise-with-instance form works on Python 2.6+ and Python 3,
            # unlike the old "raise Error, msg" statement syntax.
            raise Error("Could not determine delimiter")

        class dialect(Dialect):
            _name = "sniffed"
            lineterminator = '\r\n'
            quoting = QUOTE_MINIMAL
            # escapechar = ''
            doublequote = False

        dialect.delimiter = delimiter
        # _csv.reader won't accept a quotechar of ''
        dialect.quotechar = quotechar or '"'
        dialect.skipinitialspace = skipinitialspace

        return dialect


    def _guess_quote_and_delimiter(self, data, delimiters):
        """
        Looks for text enclosed between two identical quotes
        (the probable quotechar) which are preceded and followed
        by the same character (the probable delimiter).
        For example:
                         ,'some text',
        The quote with the most wins, same with the delimiter.
        If there is no quotechar the delimiter can't be determined
        this way.

        Returns (quotechar, delimiter, skipinitialspace).
        """

        matches = []
        # Raw strings so regex escapes like \w and \n are not subject to
        # string-escape interpretation.  NOTE: the third pattern used to
        # contain a stray '>' inside the delim group -- "(?P<delim>>" --
        # which made it match '>' + one character instead of a lone
        # delimiter, so the ,".*?" (quoted-last-field) form never matched.
        for restr in (r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
                      r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)',   #  ".*?",
                      r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)',   # ,".*?"
                      r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'):                            #  ".*?" (no delim, no space)
            regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
            matches = regexp.findall(data)
            if matches:
                break

        if not matches:
            return ('', None, 0) # (quotechar, delimiter, skipinitialspace)

        quotes = {}
        delims = {}
        spaces = 0
        for m in matches:
            n = regexp.groupindex['quote'] - 1
            key = m[n]
            if key:
                quotes[key] = quotes.get(key, 0) + 1
            try:
                n = regexp.groupindex['delim'] - 1
                key = m[n]
            except KeyError:
                continue
            if key and (delimiters is None or key in delimiters):
                delims[key] = delims.get(key, 0) + 1
            try:
                n = regexp.groupindex['space'] - 1
            except KeyError:
                continue
            if m[n]:
                spaces += 1

        # The most frequently seen quote character wins.  The conditional
        # expression replaces the fragile "and a or b" idiom (which would
        # misbehave if 'a' were ever falsy).
        quotechar = reduce(lambda a, b, quotes=quotes:
                           a if quotes[a] > quotes[b] else b, quotes.keys())

        if delims:
            delim = reduce(lambda a, b, delims=delims:
                           a if delims[a] > delims[b] else b, delims.keys())
            # Initial space is implied if every delimiter was followed by one.
            skipinitialspace = delims[delim] == spaces
            if delim == '\n': # most likely a file with a single column
                delim = ''
        else:
            # there is *no* delimiter, it's a single column of quoted data
            delim = ''
            skipinitialspace = 0

        return (quotechar, delim, skipinitialspace)


    def _guess_delimiter(self, data, delimiters):
        """
        The delimiter /should/ occur the same number of times on
        each row. However, due to malformed data, it may not. We don't want
        an all or nothing approach, so we allow for small variations in this
        number.
          1) build a table of the frequency of each character on every line.
          2) build a table of freqencies of this frequency (meta-frequency?),
             e.g.  'x occurred 5 times in 10 rows, 6 times in 1000 rows,
             7 times in 2 rows'
          3) use the mode of the meta-frequency to determine the /expected/
             frequency for that character
          4) find out how often the character actually meets that goal
          5) the character that best meets its goal is the delimiter
        For performance reasons, the data is evaluated in chunks, so it can
        try and evaluate the smallest portion of the data possible, evaluating
        additional chunks as necessary.

        Returns (delimiter, skipinitialspace).
        """

        # List comprehension rather than filter() so the result is a
        # sliceable list regardless of Python version.
        data = [line for line in data.split('\n') if line]

        ascii_chars = [chr(c) for c in range(127)] # 7-bit ASCII

        # build frequency tables
        chunkLength = min(10, len(data))
        iteration = 0
        charFrequency = {}
        modes = {}
        delims = {}
        start, end = 0, min(chunkLength, len(data))
        while start < len(data):
            iteration += 1
            for line in data[start:end]:
                for char in ascii_chars:
                    metaFrequency = charFrequency.get(char, {})
                    # must count even if frequency is 0
                    freq = line.count(char)
                    # value is the mode
                    metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
                    charFrequency[char] = metaFrequency

            for char in charFrequency.keys():
                # list() so we can remove() and index below (dict views are
                # not sequences on Python 3; on Python 2 this is a copy).
                items = list(charFrequency[char].items())
                if len(items) == 1 and items[0][0] == 0:
                    continue
                # get the mode of the frequencies
                if len(items) > 1:
                    modes[char] = reduce(lambda a, b: a if a[1] > b[1] else b,
                                         items)
                    # adjust the mode - subtract the sum of all
                    # other frequencies
                    items.remove(modes[char])
                    modes[char] = (modes[char][0], modes[char][1]
                                   - reduce(lambda a, b: (0, a[1] + b[1]),
                                            items)[1])
                else:
                    modes[char] = items[0]

            # build a list of possible delimiters
            modeList = list(modes.items())
            total = float(chunkLength * iteration)
            # (rows of consistent data) / (number of rows) = 100%
            consistency = 1.0
            # minimum consistency threshold
            threshold = 0.9
            while len(delims) == 0 and consistency >= threshold:
                for k, v in modeList:
                    if v[0] > 0 and v[1] > 0:
                        if ((v[1]/total) >= consistency and
                            (delimiters is None or k in delimiters)):
                            delims[k] = v
                consistency -= 0.01

            if len(delims) == 1:
                delim = list(delims.keys())[0]
                skipinitialspace = (data[0].count(delim) ==
                                    data[0].count("%c " % delim))
                return (delim, skipinitialspace)

            # analyze another chunkLength lines
            start = end
            end += chunkLength

        if not delims:
            return ('', 0)

        # if there's more than one, fall back to a 'preferred' list
        if len(delims) > 1:
            for d in self.preferred:
                if d in delims.keys():
                    skipinitialspace = (data[0].count(d) ==
                                        data[0].count("%c " % d))
                    return (d, skipinitialspace)

        # nothing else indicates a preference, pick the character that
        # dominates(?)
        items = [(v, k) for (k, v) in delims.items()]
        items.sort()
        delim = items[-1][1]

        skipinitialspace = (data[0].count(delim) ==
                            data[0].count("%c " % delim))
        return (delim, skipinitialspace)


    def has_header(self, sample):
        """Return True if the first row of *sample* looks like a header.

        Creates a dictionary of types of data in each column. If any
        column is of a single type (say, integers), *except* for the first
        row, then the first row is presumed to be labels. If the type
        can't be determined, it is assumed to be a string in which case
        the length of the string is the determining factor: if all of the
        rows except for the first are the same length, it's a header.
        Finally, a 'vote' is taken at the end for each column, adding or
        subtracting from the likelihood of the first row being a header.
        """

        rdr = reader(StringIO(sample), self.sniff(sample))

        header = next(rdr) # assume first row is header

        columns = len(header)
        columnTypes = {}
        for i in range(columns): columnTypes[i] = None

        checked = 0
        for row in rdr:
            # arbitrary number of rows to check, to keep it sane
            if checked > 20:
                break
            checked += 1

            if len(row) != columns:
                continue # skip rows that have irregular number of columns

            # list() so columns may be deleted while iterating
            for col in list(columnTypes.keys()):

                # long is not tried separately: on Python 2, int() already
                # promotes to long for large values, and the result was
                # normalized back to int anyway.
                for thisType in [int, float, complex]:
                    try:
                        thisType(row[col])
                        break
                    except (ValueError, OverflowError):
                        pass
                else:
                    # fallback to length of string
                    thisType = len(row[col])

                if thisType != columnTypes[col]:
                    if columnTypes[col] is None: # add new column type
                        columnTypes[col] = thisType
                    else:
                        # type is inconsistent, remove column from
                        # consideration
                        del columnTypes[col]

        # finally, compare results against first row and "vote"
        # on whether it's a header
        hasHeader = 0
        for col, colType in columnTypes.items():
            if type(colType) == type(0): # it's a length
                if len(header[col]) != colType:
                    hasHeader += 1
                else:
                    hasHeader -= 1
            else: # attempt typecast
                try:
                    colType(header[col])
                except (ValueError, TypeError):
                    hasHeader += 1
                else:
                    hasHeader -= 1

        return hasHeader > 0
|
---|