[2] | 1 |
|
---|
| 2 | """
|
---|
| 3 | csv.py - read/write/investigate CSV files
|
---|
| 4 | """
|
---|
| 5 |
|
---|
| 6 | import re
|
---|
| 7 | from functools import reduce
|
---|
| 8 | from _csv import Error, __version__, writer, reader, register_dialect, \
|
---|
| 9 | unregister_dialect, get_dialect, list_dialects, \
|
---|
| 10 | field_size_limit, \
|
---|
| 11 | QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
|
---|
| 12 | __doc__
|
---|
| 13 | from _csv import Dialect as _Dialect
|
---|
| 14 |
|
---|
| 15 | try:
|
---|
| 16 | from cStringIO import StringIO
|
---|
| 17 | except ImportError:
|
---|
| 18 | from StringIO import StringIO
|
---|
| 19 |
|
---|
| 20 | __all__ = [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
|
---|
| 21 | "Error", "Dialect", "__doc__", "excel", "excel_tab",
|
---|
| 22 | "field_size_limit", "reader", "writer",
|
---|
| 23 | "register_dialect", "get_dialect", "list_dialects", "Sniffer",
|
---|
| 24 | "unregister_dialect", "__version__", "DictReader", "DictWriter" ]
|
---|
| 25 |
|
---|
class Dialect:
    """Describe an Excel dialect.

    This must be subclassed (see csv.excel).  Valid attributes are:
    delimiter, quotechar, escapechar, doublequote, skipinitialspace,
    lineterminator, quoting.

    """
    _name = ""
    _valid = False
    # placeholders
    delimiter = None
    quotechar = None
    escapechar = None
    doublequote = None
    skipinitialspace = None
    lineterminator = None
    quoting = None

    def __init__(self):
        # The base class itself is deliberately unusable: only subclasses
        # (which override the placeholder attributes) are marked valid.
        if self.__class__ != Dialect:
            self._valid = True
        self._validate()

    def _validate(self):
        """Raise csv.Error if the attribute set is rejected by _csv."""
        try:
            # Let the C implementation perform the real attribute checks.
            _Dialect(self)
        except TypeError as e:
            # 'as' form works on Python 2.6+ and 3.x, unlike 'except X, e'.
            # We re-raise as Error for compatibility with py2.3 callers.
            raise Error(str(e))
|
---|
class excel(Dialect):
    """Describe the usual properties of Excel-generated CSV files."""
    delimiter = ','
    quotechar = '"'
    doublequote = True          # a '"' inside a field is written as '""'
    skipinitialspace = False
    lineterminator = '\r\n'     # Excel always uses DOS line endings
    quoting = QUOTE_MINIMAL     # quote only fields that require it
register_dialect("excel", excel)
|
---|
class excel_tab(excel):
    """Describe the usual properties of Excel-generated TAB-delimited files."""
    # Identical to the "excel" dialect except for the field separator.
    delimiter = '\t'
register_dialect("excel-tab", excel_tab)
| 72 |
|
---|
class DictReader:
    """Read CSV rows as dictionaries keyed by the field names.

    f           -- any iterable of CSV-formatted lines (e.g. a file object)
    fieldnames  -- sequence of keys; if omitted, the first row is used
    restkey     -- key under which extra values from long rows are stored
    restval     -- value used to pad short rows
    dialect     -- dialect name or object handed to the underlying reader
    Remaining positional/keyword args are passed through to reader().
    """
    def __init__(self, f, fieldnames=None, restkey=None, restval=None,
                 dialect="excel", *args, **kwds):
        self._fieldnames = fieldnames    # list of keys for the dict
        self.restkey = restkey           # key to catch long rows
        self.restval = restval           # default value for short rows
        self.reader = reader(f, dialect, *args, **kwds)
        self.dialect = dialect
        self.line_num = 0

    def __iter__(self):
        return self

    @property
    def fieldnames(self):
        # Lazily consume the header row the first time the names are needed.
        if self._fieldnames is None:
            try:
                # next() builtin (2.6+) works on both Python 2 and 3,
                # unlike the py2-only self.reader.next().
                self._fieldnames = next(self.reader)
            except StopIteration:
                pass
        self.line_num = self.reader.line_num
        return self._fieldnames

    @fieldnames.setter
    def fieldnames(self, value):
        self._fieldnames = value

    def next(self):
        """Return the next row as a dict, skipping blank rows."""
        if self.line_num == 0:
            # Used only for its side effect of consuming the header row.
            self.fieldnames
        row = next(self.reader)
        self.line_num = self.reader.line_num

        # unlike the basic reader, we prefer not to return blanks,
        # because we will typically wind up with a dict full of None
        # values
        while row == []:
            row = next(self.reader)
        d = dict(zip(self.fieldnames, row))
        lf = len(self.fieldnames)
        lr = len(row)
        if lf < lr:
            # Long row: stash the extra values under restkey.
            d[self.restkey] = row[lf:]
        elif lf > lr:
            # Short row: pad the missing keys with restval.
            for key in self.fieldnames[lr:]:
                d[key] = self.restval
        return d

    # Alias so instances also satisfy the Python 3 iterator protocol;
    # harmless extra attribute on Python 2.
    __next__ = next
|
---|
| 122 |
|
---|
class DictWriter:
    """Write dictionaries as CSV rows in a fixed field order.

    f            -- any object with a write() method
    fieldnames   -- sequence of keys determining column order
    restval      -- value written for keys missing from a row dict
    extrasaction -- "raise" (default) or "ignore" for unexpected keys
    dialect      -- dialect name or object handed to the underlying writer
    Remaining positional/keyword args are passed through to writer().
    """
    def __init__(self, f, fieldnames, restval="", extrasaction="raise",
                 dialect="excel", *args, **kwds):
        self.fieldnames = fieldnames    # list of keys for the dict
        self.restval = restval          # for writing short dicts
        if extrasaction.lower() not in ("raise", "ignore"):
            # Call form works on Python 2.6+ and 3.x, unlike the
            # py2-only 'raise ValueError, expr' statement syntax.
            raise ValueError(
                "extrasaction (%s) must be 'raise' or 'ignore'" %
                extrasaction)
        self.extrasaction = extrasaction
        self.writer = writer(f, dialect, *args, **kwds)

    def writeheader(self):
        """Write a header row containing the field names themselves."""
        header = dict(zip(self.fieldnames, self.fieldnames))
        self.writerow(header)

    def _dict_to_list(self, rowdict):
        # Convert a row dict to a list ordered by self.fieldnames,
        # policing unexpected keys according to extrasaction.
        if self.extrasaction == "raise":
            wrong_fields = [k for k in rowdict if k not in self.fieldnames]
            if wrong_fields:
                raise ValueError("dict contains fields not in fieldnames: " +
                                 ", ".join(wrong_fields))
        return [rowdict.get(key, self.restval) for key in self.fieldnames]

    def writerow(self, rowdict):
        return self.writer.writerow(self._dict_to_list(rowdict))

    def writerows(self, rowdicts):
        rows = []
        for rowdict in rowdicts:
            rows.append(self._dict_to_list(rowdict))
        return self.writer.writerows(rows)
|
---|
# Guard Sniffer's type checking against builds that exclude complex()
try:
    complex
except NameError:
    # Degrade gracefully: treat complex-looking values as floats instead.
    complex = float
|
---|
class Sniffer:
    '''
    "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
    Returns a Dialect object.
    '''
    def __init__(self):
        # in case there is more than one possible delimiter
        self.preferred = [',', '\t', ';', ' ', ':']


    def sniff(self, sample, delimiters=None):
        """
        Returns a dialect (or None) corresponding to the sample

        delimiters, if given, restricts the guess to that set of
        candidate delimiter characters.  Raises Error when no
        delimiter can be determined.
        """

        quotechar, doublequote, delimiter, skipinitialspace = \
                   self._guess_quote_and_delimiter(sample, delimiters)
        if not delimiter:
            # No quoted fields found; fall back to character statistics.
            delimiter, skipinitialspace = self._guess_delimiter(sample,
                                                                delimiters)

        if not delimiter:
            raise Error("Could not determine delimiter")

        class dialect(Dialect):
            _name = "sniffed"
            lineterminator = '\r\n'
            quoting = QUOTE_MINIMAL
            # escapechar = ''

        dialect.doublequote = doublequote
        dialect.delimiter = delimiter
        # _csv.reader won't accept a quotechar of ''
        dialect.quotechar = quotechar or '"'
        dialect.skipinitialspace = skipinitialspace

        return dialect


    def _guess_quote_and_delimiter(self, data, delimiters):
        """
        Looks for text enclosed between two identical quotes
        (the probable quotechar) which are preceded and followed
        by the same character (the probable delimiter).
        For example:
                         ,'some text',
        The quote with the most wins, same with the delimiter.
        If there is no quotechar the delimiter can't be determined
        this way.

        Returns a (quotechar, doublequote, delimiter, skipinitialspace)
        tuple; quotechar is '' and delimiter is None/'' when undetermined.
        """

        matches = []
        # Try progressively looser patterns: quoted field with a delimiter
        # on both sides, then only before, then only after, then a bare
        # quoted field occupying a whole line.
        # NOTE: the third pattern historically contained a stray '>' after
        # (?P<delim> which made it unmatchable; fixed here.
        for restr in (r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
                      r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)',   #  ".*?",
                      r'(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)',   # ,".*?"
                      r'(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'):                            #  ".*?" (no delim, no space)
            regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
            matches = regexp.findall(data)
            if matches:
                break

        if not matches:
            # (quotechar, doublequote, delimiter, skipinitialspace)
            return ('', False, None, 0)

        # Tally how often each quote character and each delimiter occurs.
        quotes = {}
        delims = {}
        spaces = 0
        for m in matches:
            n = regexp.groupindex['quote'] - 1
            key = m[n]
            if key:
                quotes[key] = quotes.get(key, 0) + 1
            try:
                n = regexp.groupindex['delim'] - 1
                key = m[n]
            except KeyError:
                continue
            if key and (delimiters is None or key in delimiters):
                delims[key] = delims.get(key, 0) + 1
            try:
                n = regexp.groupindex['space'] - 1
            except KeyError:
                continue
            if m[n]:
                spaces += 1

        # The most frequently seen quote character wins.
        quotechar = reduce(lambda a, b, quotes = quotes:
                           (quotes[a] > quotes[b]) and a or b, quotes.keys())

        if delims:
            # The most frequently seen delimiter wins.
            delim = reduce(lambda a, b, delims = delims:
                           (delims[a] > delims[b]) and a or b, delims.keys())
            skipinitialspace = delims[delim] == spaces
            if delim == '\n': # most likely a file with a single column
                delim = ''
        else:
            # there is *no* delimiter, it's a single column of quoted data
            delim = ''
            skipinitialspace = 0

        # if we see an extra quote between delimiters, we've got a
        # double quoted format
        dq_regexp = re.compile(
                               r"((%(delim)s)|^)\W*%(quote)s[^%(delim)s\n]*%(quote)s[^%(delim)s\n]*%(quote)s\W*((%(delim)s)|$)" % \
                               {'delim':re.escape(delim), 'quote':quotechar}, re.MULTILINE)

        if dq_regexp.search(data):
            doublequote = True
        else:
            doublequote = False

        return (quotechar, doublequote, delim, skipinitialspace)


    def _guess_delimiter(self, data, delimiters):
        """
        The delimiter /should/ occur the same number of times on
        each row. However, due to malformed data, it may not. We don't want
        an all or nothing approach, so we allow for small variations in this
        number.
          1) build a table of the frequency of each character on every line.
          2) build a table of frequencies of this frequency (meta-frequency?),
             e.g.  'x occurred 5 times in 10 rows, 6 times in 1000 rows,
             7 times in 2 rows'
          3) use the mode of the meta-frequency to determine the /expected/
             frequency for that character
          4) find out how often the character actually meets that goal
          5) the character that best meets its goal is the delimiter
        For performance reasons, the data is evaluated in chunks, so it can
        try and evaluate the smallest portion of the data possible, evaluating
        additional chunks as necessary.
        """

        # Drop empty lines; this comprehension behaves identically on
        # Python 2 and 3 (filter() returns an iterator on 3.x).
        data = [line for line in data.split('\n') if line]

        # Candidate delimiter characters (renamed from 'ascii' to avoid
        # shadowing the builtin).
        ascii_chars = [chr(c) for c in range(127)] # 7-bit ASCII

        # build frequency tables
        chunkLength = min(10, len(data))
        iteration = 0
        charFrequency = {}
        modes = {}
        delims = {}
        start, end = 0, min(chunkLength, len(data))
        while start < len(data):
            iteration += 1
            for line in data[start:end]:
                for char in ascii_chars:
                    metaFrequency = charFrequency.get(char, {})
                    # must count even if frequency is 0
                    freq = line.count(char)
                    # value is the mode
                    metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
                    charFrequency[char] = metaFrequency

            for char in charFrequency:
                # list() so .remove() below works on Python 3 too
                items = list(charFrequency[char].items())
                if len(items) == 1 and items[0][0] == 0:
                    continue
                # get the mode of the frequencies
                if len(items) > 1:
                    modes[char] = reduce(lambda a, b: a[1] > b[1] and a or b,
                                         items)
                    # adjust the mode - subtract the sum of all
                    # other frequencies
                    items.remove(modes[char])
                    modes[char] = (modes[char][0], modes[char][1]
                                   - reduce(lambda a, b: (0, a[1] + b[1]),
                                            items)[1])
                else:
                    modes[char] = items[0]

            # build a list of possible delimiters
            modeList = list(modes.items())
            total = float(chunkLength * iteration)
            # (rows of consistent data) / (number of rows) = 100%
            consistency = 1.0
            # minimum consistency threshold
            threshold = 0.9
            while len(delims) == 0 and consistency >= threshold:
                for k, v in modeList:
                    if v[0] > 0 and v[1] > 0:
                        if ((v[1]/total) >= consistency and
                            (delimiters is None or k in delimiters)):
                            delims[k] = v
                consistency -= 0.01

            if len(delims) == 1:
                # next(iter(...)) works on both 2.6+ and 3.x, unlike
                # the py2-only delims.keys()[0].
                delim = next(iter(delims))
                skipinitialspace = (data[0].count(delim) ==
                                    data[0].count("%c " % delim))
                return (delim, skipinitialspace)

            # analyze another chunkLength lines
            start = end
            end += chunkLength

        if not delims:
            return ('', 0)

        # if there's more than one, fall back to a 'preferred' list
        if len(delims) > 1:
            for d in self.preferred:
                if d in delims.keys():
                    skipinitialspace = (data[0].count(d) ==
                                        data[0].count("%c " % d))
                    return (d, skipinitialspace)

        # nothing else indicates a preference, pick the character that
        # dominates(?)
        items = [(v,k) for (k,v) in delims.items()]
        items.sort()
        delim = items[-1][1]

        skipinitialspace = (data[0].count(delim) ==
                            data[0].count("%c " % delim))
        return (delim, skipinitialspace)


    def has_header(self, sample):
        # Creates a dictionary of types of data in each column. If any
        # column is of a single type (say, integers), *except* for the first
        # row, then the first row is presumed to be labels. If the type
        # can't be determined, it is assumed to be a string in which case
        # the length of the string is the determining factor: if all of the
        # rows except for the first are the same length, it's a header.
        # Finally, a 'vote' is taken at the end for each column, adding or
        # subtracting from the likelihood of the first row being a header.

        rdr = reader(StringIO(sample), self.sniff(sample))

        header = next(rdr) # assume first row is header

        columns = len(header)
        columnTypes = {}
        for i in range(columns): columnTypes[i] = None

        checked = 0
        for row in rdr:
            # arbitrary number of rows to check, to keep it sane
            if checked > 20:
                break
            checked += 1

            if len(row) != columns:
                continue # skip rows that have irregular number of columns

            # iterate over a copy: entries may be deleted below
            for col in list(columnTypes.keys()):

                # int() auto-promotes to long on Python 2, so probing
                # int, float and complex covers all numeric forms; a
                # separate long probe (and long -> int normalization)
                # is redundant.
                for thisType in [int, float, complex]:
                    try:
                        thisType(row[col])
                        break
                    except (ValueError, OverflowError):
                        pass
                else:
                    # fallback to length of string
                    thisType = len(row[col])

                if thisType != columnTypes[col]:
                    if columnTypes[col] is None: # add new column type
                        columnTypes[col] = thisType
                    else:
                        # type is inconsistent, remove column from
                        # consideration
                        del columnTypes[col]

        # finally, compare results against first row and "vote"
        # on whether it's a header
        hasHeader = 0
        for col, colType in columnTypes.items():
            if type(colType) == type(0): # it's a length
                if len(header[col]) != colType:
                    hasHeader += 1
                else:
                    hasHeader -= 1
            else: # attempt typecast
                try:
                    colType(header[col])
                except (ValueError, TypeError):
                    hasHeader += 1
                else:
                    hasHeader -= 1

        return hasHeader > 0