Context Navigation

← Previous Revision
Next Revision →
Normal
Revision Log

textwrap.py

Last change on this file was 391, checked in by dmik, 11 years ago
python: Merge vendor 2.7.6 to trunk.
Property svn:eol-style set to `native`
File size: 16.6 KB

Rev	Line
"""Text wrapping and filling.
"""

# Copyright (C) 1999-2001 Gregory P. Ward.
# Copyright (C) 2002, 2003 Python Software Foundation.
# Written by Greg Ward <gward@python.net>

__revision__ = "$Id$"

import string, re

# Bind the unicode type to a private name so the rest of the module can
# test against it with isinstance() without caring whether it exists.
try:
    _unicode = unicode
except NameError:
    # If Python is built without Unicode support, the unicode type
    # will not exist.  Fake one: nothing is ever an instance of this
    # placeholder, so every isinstance(text, _unicode) check is False.
    class _unicode(object):
        pass

# Do the right thing with boolean values for all known Python versions
# (so this module can be copied to projects that don't depend on Python
# 2.3, e.g. Optik and Docutils) by uncommenting the block of code below.
#try:
#    True, False
#except NameError:
#    (True, False) = (1, 0)

__all__ = ['TextWrapper', 'wrap', 'fill', 'dedent']

# Hardcode the recognized whitespace characters to the US-ASCII
# whitespace characters.  The main reason for doing this is that in
# ISO-8859-1, 0xa0 is non-breaking whitespace, so in certain locales
# that character winds up in string.whitespace.  Respecting
# string.whitespace in those cases would 1) make textwrap treat 0xa0 the
# same as any other whitespace char, which is clearly wrong (it's a
# *non-breaking* space), 2) possibly cause problems with Unicode,
# since 0xa0 is not in range(128).
_whitespace = '\t\n\x0b\x0c\r '

	40	class TextWrapper:
	41	"""
	42	Object for wrapping/filling text. The public interface consists of
	43	the wrap() and fill() methods; the other methods are just there for
	44	subclasses to override in order to tweak the default behaviour.
	45	If you want to completely replace the main wrapping algorithm,
	46	you'll probably have to override _wrap_chunks().
	47
	48	Several instance attributes control various aspects of wrapping:
	49	width (default: 70)
	50	the maximum width of wrapped lines (unless break_long_words
	51	is false)
	52	initial_indent (default: "")
	53	string that will be prepended to the first line of wrapped
	54	output. Counts towards the line's width.
	55	subsequent_indent (default: "")
	56	string that will be prepended to all lines save the first
	57	of wrapped output; also counts towards each line's width.
	58	expand_tabs (default: true)
	59	Expand tabs in input text to spaces before further processing.
	60	Each tab will become 1 .. 8 spaces, depending on its position in
	61	its line. If false, each tab is treated as a single character.
	62	replace_whitespace (default: true)
	63	Replace all whitespace characters in the input text by spaces
	64	after tab expansion. Note that if expand_tabs is false and
	65	replace_whitespace is true, every tab will be converted to a
	66	single space!
	67	fix_sentence_endings (default: false)
	68	Ensure that sentence-ending punctuation is always followed
	69	by two spaces. Off by default because the algorithm is
	70	(unavoidably) imperfect.
	71	break_long_words (default: true)
	72	Break words longer than 'width'. If false, those words will not
	73	be broken, and some lines might be longer than 'width'.
	74	break_on_hyphens (default: true)
	75	Allow breaking hyphenated words. If true, wrapping will occur
	76	preferably on whitespaces and right after hyphens part of
	77	compound words.
	78	drop_whitespace (default: true)
	79	Drop leading and trailing whitespace from lines.
	80	"""
	81
	82	whitespace_trans = string.maketrans(_whitespace, ' ' * len(_whitespace))
	83
	84	unicode_whitespace_trans = {}
	85	uspace = ord(u' ')
	86	for x in map(ord, _whitespace):
	87	unicode_whitespace_trans[x] = uspace
	88
	89	# This funky little regex is just the trick for splitting
	90	# text up into word-wrappable chunks. E.g.
	91	# "Hello there -- you goof-ball, use the -b option!"
	92	# splits into
	93	# Hello/ /there/ /--/ /you/ /goof-/ball,/ /use/ /the/ /-b/ /option!
	94	# (after stripping out empty strings).
	95	wordsep_re = re.compile(
	96	r'(\s+\|' # any whitespace
	97	r'[^\s\w]*\w+[^0-9\W]-(?=\w+[^0-9\W])\|' # hyphenated words
	98	r'(?<=[\w\!\"\'\&\.\,\?])-{2,}(?=\w))') # em-dash
	99
	100	# This less funky little regex just split on recognized spaces. E.g.
	101	# "Hello there -- you goof-ball, use the -b option!"
	102	# splits into
	103	# Hello/ /there/ /--/ /you/ /goof-ball,/ /use/ /the/ /-b/ /option!/
	104	wordsep_simple_re = re.compile(r'(\s+)')
	105
	106	# XXX this is not locale- or charset-aware -- string.lowercase
	107	# is US-ASCII only (and therefore English-only)
	108	sentence_end_re = re.compile(r'[%s]' # lowercase letter
	109	r'[\.\!\?]' # sentence-ending punct.
	110	r'[\"\']?' # optional end-of-quote
	111	r'\Z' # end of chunk
	112	% string.lowercase)
	113
	114
	115	def __init__(self,
	116	width=70,
	117	initial_indent="",
	118	subsequent_indent="",
	119	expand_tabs=True,
	120	replace_whitespace=True,
	121	fix_sentence_endings=False,
	122	break_long_words=True,
	123	drop_whitespace=True,
	124	break_on_hyphens=True):
	125	self.width = width
	126	self.initial_indent = initial_indent
	127	self.subsequent_indent = subsequent_indent
	128	self.expand_tabs = expand_tabs
	129	self.replace_whitespace = replace_whitespace
	130	self.fix_sentence_endings = fix_sentence_endings
	131	self.break_long_words = break_long_words
	132	self.drop_whitespace = drop_whitespace
	133	self.break_on_hyphens = break_on_hyphens
	134
	135	# recompile the regexes for Unicode mode -- done in this clumsy way for
	136	# backwards compatibility because it's rather common to monkey-patch
	137	# the TextWrapper class' wordsep_re attribute.
	138	self.wordsep_re_uni = re.compile(self.wordsep_re.pattern, re.U)
	139	self.wordsep_simple_re_uni = re.compile(
	140	self.wordsep_simple_re.pattern, re.U)
	141
	142
	143	# -- Private methods -----------------------------------------------
	144	# (possibly useful for subclasses to override)
	145
	146	def _munge_whitespace(self, text):
	147	"""_munge_whitespace(text : string) -> string
	148
	149	Munge whitespace in text: expand tabs and convert all other
	150	whitespace characters to spaces. Eg. " foo\tbar\n\nbaz"
	151	becomes " foo bar baz".
	152	"""
	153	if self.expand_tabs:
	154	text = text.expandtabs()
	155	if self.replace_whitespace:
	156	if isinstance(text, str):
	157	text = text.translate(self.whitespace_trans)
[391]	158	elif isinstance(text, _unicode):
[2]	159	text = text.translate(self.unicode_whitespace_trans)
	160	return text
	161
	162
	163	def _split(self, text):
	164	"""_split(text : string) -> [string]
	165
	166	Split the text to wrap into indivisible chunks. Chunks are
[391]	167	not quite the same as words; see _wrap_chunks() for full
[2]	168	details. As an example, the text
	169	Look, goof-ball -- use the -b option!
	170	breaks into the following chunks:
	171	'Look,', ' ', 'goof-', 'ball', ' ', '--', ' ',
	172	'use', ' ', 'the', ' ', '-b', ' ', 'option!'
	173	if break_on_hyphens is True, or in:
	174	'Look,', ' ', 'goof-ball', ' ', '--', ' ',
	175	'use', ' ', 'the', ' ', '-b', ' ', option!'
	176	otherwise.
	177	"""
[391]	178	if isinstance(text, _unicode):
[2]	179	if self.break_on_hyphens:
	180	pat = self.wordsep_re_uni
	181	else:
	182	pat = self.wordsep_simple_re_uni
	183	else:
	184	if self.break_on_hyphens:
	185	pat = self.wordsep_re
	186	else:
	187	pat = self.wordsep_simple_re
	188	chunks = pat.split(text)
	189	chunks = filter(None, chunks) # remove empty chunks
	190	return chunks
	191
	192	def _fix_sentence_endings(self, chunks):
	193	"""_fix_sentence_endings(chunks : [string])
	194
	195	Correct for sentence endings buried in 'chunks'. Eg. when the
	196	original text contains "... foo.\nBar ...", munge_whitespace()
	197	and split() will convert that to [..., "foo.", " ", "Bar", ...]
	198	which has one too few spaces; this method simply changes the one
	199	space to two.
	200	"""
	201	i = 0
[391]	202	patsearch = self.sentence_end_re.search
[2]	203	while i < len(chunks)-1:
[391]	204	if chunks[i+1] == " " and patsearch(chunks[i]):
[2]	205	chunks[i+1] = " "
	206	i += 2
	207	else:
	208	i += 1
	209
	210	def _handle_long_word(self, reversed_chunks, cur_line, cur_len, width):
	211	"""_handle_long_word(chunks : [string],
	212	cur_line : [string],
	213	cur_len : int, width : int)
	214
	215	Handle a chunk of text (most likely a word, not whitespace) that
	216	is too long to fit in any line.
	217	"""
	218	# Figure out when indent is larger than the specified width, and make
	219	# sure at least one character is stripped off on every pass
	220	if width < 1:
	221	space_left = 1
	222	else:
	223	space_left = width - cur_len
	224
	225	# If we're allowed to break long words, then do so: put as much
	226	# of the next chunk onto the current line as will fit.
	227	if self.break_long_words:
	228	cur_line.append(reversed_chunks[-1][:space_left])
	229	reversed_chunks[-1] = reversed_chunks[-1][space_left:]
	230
	231	# Otherwise, we have to preserve the long word intact. Only add
	232	# it to the current line if there's nothing already there --
	233	# that minimizes how much we violate the width constraint.
	234	elif not cur_line:
	235	cur_line.append(reversed_chunks.pop())
	236
	237	# If we're not allowed to break long words, and there's already
	238	# text on the current line, do nothing. Next time through the
	239	# main loop of _wrap_chunks(), we'll wind up here again, but
	240	# cur_len will be zero, so the next line will be entirely
	241	# devoted to the long word that we can't handle right now.
	242
	243	def _wrap_chunks(self, chunks):
	244	"""_wrap_chunks(chunks : [string]) -> [string]
	245
	246	Wrap a sequence of text chunks and return a list of lines of
	247	length 'self.width' or less. (If 'break_long_words' is false,
	248	some lines may be longer than this.) Chunks correspond roughly
	249	to words and the whitespace between them: each chunk is
	250	indivisible (modulo 'break_long_words'), but a line break can
	251	come between any two chunks. Chunks should not have internal
	252	whitespace; ie. a chunk is either all whitespace or a "word".
	253	Whitespace chunks will be removed from the beginning and end of
	254	lines, but apart from that whitespace is preserved.
	255	"""
	256	lines = []
	257	if self.width <= 0:
	258	raise ValueError("invalid width %r (must be > 0)" % self.width)
	259
	260	# Arrange in reverse order so items can be efficiently popped
	261	# from a stack of chucks.
	262	chunks.reverse()
	263
	264	while chunks:
	265
	266	# Start the list of chunks that will make up the current line.
	267	# cur_len is just the length of all the chunks in cur_line.
	268	cur_line = []
	269	cur_len = 0
	270
	271	# Figure out which static string will prefix this line.
	272	if lines:
	273	indent = self.subsequent_indent
	274	else:
	275	indent = self.initial_indent
	276
	277	# Maximum width for this line.
	278	width = self.width - len(indent)
	279
	280	# First chunk on line is whitespace -- drop it, unless this
	281	# is the very beginning of the text (ie. no lines started yet).
	282	if self.drop_whitespace and chunks[-1].strip() == '' and lines:
	283	del chunks[-1]
	284
	285	while chunks:
	286	l = len(chunks[-1])
	287
	288	# Can at least squeeze this chunk onto the current line.
	289	if cur_len + l <= width:
	290	cur_line.append(chunks.pop())
	291	cur_len += l
	292
	293	# Nope, this line is full.
	294	else:
	295	break
	296
	297	# The current line is full, and the next chunk is too big to
	298	# fit on any line (not just this one).
	299	if chunks and len(chunks[-1]) > width:
	300	self._handle_long_word(chunks, cur_line, cur_len, width)
	301
	302	# If the last chunk on this line is all whitespace, drop it.
	303	if self.drop_whitespace and cur_line and cur_line[-1].strip() == '':
	304	del cur_line[-1]
	305
	306	# Convert current line back to a string and store it in list
	307	# of all lines (return value).
	308	if cur_line:
	309	lines.append(indent + ''.join(cur_line))
	310
	311	return lines
	312
	313
	314	# -- Public interface ----------------------------------------------
	315
	316	def wrap(self, text):
	317	"""wrap(text : string) -> [string]
	318
	319	Reformat the single paragraph in 'text' so it fits in lines of
	320	no more than 'self.width' columns, and return a list of wrapped
	321	lines. Tabs in 'text' are expanded with string.expandtabs(),
	322	and all other whitespace characters (including newline) are
	323	converted to space.
	324	"""
	325	text = self._munge_whitespace(text)
	326	chunks = self._split(text)
	327	if self.fix_sentence_endings:
	328	self._fix_sentence_endings(chunks)
	329	return self._wrap_chunks(chunks)
	330
	331	def fill(self, text):
	332	"""fill(text : string) -> string
	333
	334	Reformat the single paragraph in 'text' to fit in lines of no
	335	more than 'self.width' columns, and return a new string
	336	containing the entire wrapped paragraph.
	337	"""
	338	return "\n".join(self.wrap(text))
	339
	340
	341	# -- Convenience interface ---------------------------------------------
	342
	343	def wrap(text, width=70, **kwargs):
	344	"""Wrap a single paragraph of text, returning a list of wrapped lines.
	345
	346	Reformat the single paragraph in 'text' so it fits in lines of no
	347	more than 'width' columns, and return a list of wrapped lines. By
	348	default, tabs in 'text' are expanded with string.expandtabs(), and
	349	all other whitespace characters (including newline) are converted to
	350	space. See TextWrapper class for available keyword args to customize
	351	wrapping behaviour.
	352	"""
	353	w = TextWrapper(width=width, **kwargs)
	354	return w.wrap(text)
	355
	356	def fill(text, width=70, **kwargs):
	357	"""Fill a single paragraph of text, returning a new string.
	358
	359	Reformat the single paragraph in 'text' to fit in lines of no more
	360	than 'width' columns, and return a new string containing the entire
	361	wrapped paragraph. As with wrap(), tabs are expanded and other
	362	whitespace characters converted to space. See TextWrapper class for
	363	available keyword args to customize wrapping behaviour.
	364	"""
	365	w = TextWrapper(width=width, **kwargs)
	366	return w.fill(text)
	367
	368
	369	# -- Loosely related functionality -------------------------------------
	370
	371	_whitespace_only_re = re.compile('^[ \t]+$', re.MULTILINE)
	372	_leading_whitespace_re = re.compile('(^[ \t]*)(?:[^ \t\n])', re.MULTILINE)
	373
	374	def dedent(text):
	375	"""Remove any common leading whitespace from every line in `text`.
	376
	377	This can be used to make triple-quoted strings line up with the left
	378	edge of the display, while still presenting them in the source code
	379	in indented form.
	380
	381	Note that tabs and spaces are both treated as whitespace, but they
	382	are not equal: the lines " hello" and "\thello" are
	383	considered to have no common leading whitespace. (This behaviour is
	384	new in Python 2.5; older versions of this module incorrectly
	385	expanded tabs before searching for common leading whitespace.)
	386	"""
	387	# Look for the longest leading string of spaces and tabs common to
	388	# all lines.
	389	margin = None
	390	text = _whitespace_only_re.sub('', text)
	391	indents = _leading_whitespace_re.findall(text)
	392	for indent in indents:
	393	if margin is None:
	394	margin = indent
	395
	396	# Current line more deeply indented than previous winner:
	397	# no change (previous winner is still on top).
	398	elif indent.startswith(margin):
	399	pass
	400
	401	# Current line consistent with and no deeper than previous winner:
	402	# it's the new winner.
	403	elif margin.startswith(indent):
	404	margin = indent
	405
	406	# Current line and previous winner have no common whitespace:
	407	# there is no margin.
	408	else:
	409	margin = ""
	410	break
	411
	412	# sanity check (testing/debugging only)
	413	if 0 and margin:
	414	for line in text.split("\n"):
	415	assert not line or line.startswith(margin), \
	416	"line = %r, margin = %r" % (line, margin)
	417
	418	if margin:
	419	text = re.sub(r'(?m)^' + margin, '', text)
	420	return text
	421
	422	if __name__ == "__main__":
	423	#print dedent("\tfoo\n\tbar")
	424	#print dedent(" \thello there\n \t how are you?")
	425	print dedent("Hello there.\n This is indented.")

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: python/trunk/Lib/textwrap.py

Download in other formats: