Context Navigation

← Previous Revision
Next Revision →
Normal
Revision Log

pygettext.py

Last change on this file was 391, checked in by dmik, 11 years ago
python: Merge vendor 2.7.6 to trunk.
Property svn:eol-style set to `native`
File size: 21.6 KB

Rev	Line
[2]	1	#! /usr/bin/env python
	2	# -*- coding: iso-8859-1 -*-
	3	# Originally written by Barry Warsaw <barry@zope.com>
	4	#
	5	# Minimally patched to make it even more xgettext compatible
	6	# by Peter Funk <pf@artcom-gmbh.de>
	7	#
	8	# 2002-11-22 Jürgen Hermann <jh@web.de>
	9	# Added checks that _() only contains string literals, and
	10	# command line args are resolved to module lists, i.e. you
	11	# can now pass a filename, a module or package name, or a
	12	# directory (including globbing chars, important for Win32).
	13	# Made docstring fit in 80 chars wide displays using pydoc.
	14	#
	15
# for selftesting
# Bind the translation marker `_`: use fintl.gettext when the optional
# `fintl` module can be imported, otherwise fall back to an identity
# function so that _("text") simply returns "text".
try:
    import fintl
    _ = fintl.gettext
except ImportError:
    _ = lambda s: s
	22
	23	__doc__ = _("""pygettext -- Python equivalent of xgettext(1)
	24
	25	Many systems (Solaris, Linux, Gnu) provide extensive tools that ease the
	26	internationalization of C programs. Most of these tools are independent of
	27	the programming language and can be used from within Python programs.
	28	Martin von Loewis' work[1] helps considerably in this regard.
	29
	30	There's one problem though; xgettext is the program that scans source code
	31	looking for message strings, but it groks only C (or C++). Python
	32	introduces a few wrinkles, such as dual quoting characters, triple quoted
	33	strings, and raw strings. xgettext understands none of this.
	34
	35	Enter pygettext, which uses Python's standard tokenize module to scan
	36	Python source code, generating .pot files identical to what GNU xgettext[2]
	37	generates for C and C++ code. From there, the standard GNU tools can be
	38	used.
	39
	40	A word about marking Python strings as candidates for translation. GNU
	41	xgettext recognizes the following keywords: gettext, dgettext, dcgettext,
	42	and gettext_noop. But those can be a lot of text to include all over your
	43	code. C and C++ have a trick: they use the C preprocessor. Most
	44	internationalized C source includes a #define for gettext() to _() so that
	45	what has to be written in the source is much less. Thus these are both
	46	translatable strings:
	47
	48	gettext("Translatable String")
	49	_("Translatable String")
	50
	51	Python of course has no preprocessor so this doesn't work so well. Thus,
	52	pygettext searches only for _() by default, but see the -k/--keyword flag
	53	below for how to augment this.
	54
	55	[1] http://www.python.org/workshops/1997-10/proceedings/loewis.html
	56	[2] http://www.gnu.org/software/gettext/gettext.html
	57
	58	NOTE: pygettext attempts to be option and feature compatible with GNU
	59	xgettext wherever possible. However some options are still missing or are
	60	not fully implemented. Also, xgettext's use of command line switches with
	61	option arguments is broken, and in these cases, pygettext just defines
	62	additional switches.
	63
	64	Usage: pygettext [options] inputfile ...
	65
	66	Options:
	67
	68	-a
	69	--extract-all
	70	Extract all strings.
	71
	72	-d name
	73	--default-domain=name
	74	Rename the default output file from messages.pot to name.pot.
	75
	76	-E
	77	--escape
	78	Replace non-ASCII characters with octal escape sequences.
	79
	80	-D
	81	--docstrings
	82	Extract module, class, method, and function docstrings. These do
	83	not need to be wrapped in _() markers, and in fact cannot be for
	84	Python to consider them docstrings. (See also the -X option).
	85
	86	-h
	87	--help
	88	Print this help message and exit.
	89
	90	-k word
	91	--keyword=word
	92	Keywords to look for in addition to the default set, which are:
	93	%(DEFAULTKEYWORDS)s
	94
	95	You can have multiple -k flags on the command line.
	96
	97	-K
	98	--no-default-keywords
	99	Disable the default set of keywords (see above). Any keywords
	100	explicitly added with the -k/--keyword option are still recognized.
	101
	102	--no-location
	103	Do not write filename/lineno location comments.
	104
	105	-n
	106	--add-location
	107	Write filename/lineno location comments indicating where each
	108	extracted string is found in the source. These lines appear before
	109	each msgid. The style of comments is controlled by the -S/--style
	110	option. This is the default.
	111
	112	-o filename
	113	--output=filename
	114	Rename the default output file from messages.pot to filename. If
	115	filename is `-' then the output is sent to standard out.
	116
	117	-p dir
	118	--output-dir=dir
	119	Output files will be placed in directory dir.
	120
	121	-S stylename
	122	--style stylename
	123	Specify which style to use for location comments. Two styles are
	124	supported:
	125
	126	Solaris # File: filename, line: line-number
	127	GNU #: filename:line
	128
	129	The style name is case insensitive. GNU style is the default.
	130
	131	-v
	132	--verbose
	133	Print the names of the files being processed.
	134
	135	-V
	136	--version
	137	Print the version of pygettext and exit.
	138
	139	-w columns
	140	--width=columns
	141	Set width of output to columns.
	142
	143	-x filename
	144	--exclude-file=filename
	145	Specify a file that contains a list of strings that are not to be
	146	extracted from the input files. Each string to be excluded must
	147	appear on a line by itself in the file.
	148
	149	-X filename
	150	--no-docstrings=filename
	151	Specify a file that contains a list of files (one per line) that
	152	should not have their docstrings extracted. This is only useful in
	153	conjunction with the -D option above.
	154
	155	If `inputfile' is -, standard input is read.
	156	""")
	157
	158	import os
	159	import imp
	160	import sys
	161	import glob
	162	import time
	163	import getopt
	164	import token
	165	import tokenize
	166	import operator
	167
# Version string; substituted into the "Generated-By" line of pot_header and
# printed by the -V/--version option.
__version__ = '1.5'

# Keywords that mark translatable strings.  main() replaces this list with an
# empty one when -K/--no-default-keywords is given.
default_keywords = ['_']
# Comma-separated form of the defaults; interpolated into the help text via
# the %(DEFAULTKEYWORDS)s placeholder when usage() formats __doc__.
DEFAULTKEYWORDS = ', '.join(default_keywords)

# Separator used when joining string fragments.
EMPTYSTRING = ''




# The normal pot-file header. msgmerge and Emacs's po-mode work better if it's
# there.
# The template takes two substitutions: %(time)s and %(version)s.
pot_header = _('''\
# SOME DESCRIPTIVE TITLE.
# Copyright (C) YEAR ORGANIZATION
# FIRST AUTHOR <EMAIL@ADDRESS>, YEAR.
#
msgid ""
msgstr ""
"Project-Id-Version: PACKAGE VERSION\\n"
"POT-Creation-Date: %(time)s\\n"
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\\n"
"Language-Team: LANGUAGE <LL@li.org>\\n"
"MIME-Version: 1.0\\n"
"Content-Type: text/plain; charset=CHARSET\\n"
"Content-Transfer-Encoding: ENCODING\\n"
"Generated-By: pygettext.py %(version)s\\n"

''')
	198
	199
	200
	201	def usage(code, msg=''):
	202	print >> sys.stderr, __doc__ % globals()
	203	if msg:
	204	print >> sys.stderr, msg
	205	sys.exit(code)
	206
	207
	208
	209
	210	escapes = []
[391]	211
[2]	212	def make_escapes(pass_iso8859):
	213	global escapes
	214	escapes = [chr(i) for i in range(256)]
	215	if pass_iso8859:
	216	# Allow iso-8859 characters to pass through so that e.g. 'msgid
	217	# "Höhe"' would result not result in 'msgid "H\366he"'. Otherwise we
	218	# escape any character outside the 32..126 range.
[391]	219	mod = 128
	220	else:
	221	mod = 256
[2]	222	for i in range(mod):
	223	if not(32 <= i <= 126):
	224	escapes[i] = "\\%03o" % i
	225	escapes[ord('\\')] = '\\\\'
	226	escapes[ord('\t')] = '\\t'
	227	escapes[ord('\r')] = '\\r'
	228	escapes[ord('\n')] = '\\n'
	229	escapes[ord('\"')] = '\\"'
	230
	231
	232	def escape(s):
	233	global escapes
	234	s = list(s)
	235	for i in range(len(s)):
	236	s[i] = escapes[ord(s[i])]
	237	return EMPTYSTRING.join(s)
	238
	239
	240	def safe_eval(s):
	241	# unwrap quotes, safely
	242	return eval(s, {'__builtins__':{}}, {})
	243
	244
	245	def normalize(s):
	246	# This converts the various Python string types into a format that is
	247	# appropriate for .po files, namely much closer to C style.
	248	lines = s.split('\n')
	249	if len(lines) == 1:
	250	s = '"' + escape(s) + '"'
	251	else:
	252	if not lines[-1]:
	253	del lines[-1]
	254	lines[-1] = lines[-1] + '\n'
	255	for i in range(len(lines)):
	256	lines[i] = escape(lines[i])
	257	lineterm = '\\n"\n"'
	258	s = '""\n"' + lineterm.join(lines) + '"'
	259	return s
	260
	261
	262
	263	def containsAny(str, set):
	264	"""Check whether 'str' contains ANY of the chars in 'set'"""
	265	return 1 in [c in str for c in set]
	266
	267
	268	def _visit_pyfiles(list, dirname, names):
	269	"""Helper for getFilesForName()."""
	270	# get extension for python source files
	271	if not globals().has_key('_py_ext'):
	272	global _py_ext
	273	_py_ext = [triple[0] for triple in imp.get_suffixes()
	274	if triple[2] == imp.PY_SOURCE][0]
	275
	276	# don't recurse into CVS directories
	277	if 'CVS' in names:
	278	names.remove('CVS')
	279
	280	# add all *.py files to list
	281	list.extend(
	282	[os.path.join(dirname, file) for file in names
	283	if os.path.splitext(file)[1] == _py_ext]
	284	)
	285
	286
	287	def _get_modpkg_path(dotted_name, pathlist=None):
	288	"""Get the filesystem path for a module or a package.
	289
	290	Return the file system path to a file for a module, and to a directory for
	291	a package. Return None if the name is not found, or is a builtin or
	292	extension module.
	293	"""
	294	# split off top-most name
	295	parts = dotted_name.split('.', 1)
	296
	297	if len(parts) > 1:
	298	# we have a dotted path, import top-level package
	299	try:
	300	file, pathname, description = imp.find_module(parts[0], pathlist)
	301	if file: file.close()
	302	except ImportError:
	303	return None
	304
	305	# check if it's indeed a package
	306	if description[2] == imp.PKG_DIRECTORY:
	307	# recursively handle the remaining name parts
	308	pathname = _get_modpkg_path(parts[1], [pathname])
	309	else:
	310	pathname = None
	311	else:
	312	# plain name
	313	try:
	314	file, pathname, description = imp.find_module(
	315	dotted_name, pathlist)
	316	if file:
	317	file.close()
	318	if description[2] not in [imp.PY_SOURCE, imp.PKG_DIRECTORY]:
	319	pathname = None
	320	except ImportError:
	321	pathname = None
	322
	323	return pathname
	324
	325
	326	def getFilesForName(name):
	327	"""Get a list of module files for a filename, a module or package name,
	328	or a directory.
	329	"""
	330	if not os.path.exists(name):
	331	# check for glob chars
	332	if containsAny(name, "*?[]"):
	333	files = glob.glob(name)
	334	list = []
	335	for file in files:
	336	list.extend(getFilesForName(file))
	337	return list
	338
	339	# try to find module or package
	340	name = _get_modpkg_path(name)
	341	if not name:
	342	return []
	343
	344	if os.path.isdir(name):
	345	# find all python files in directory
	346	list = []
	347	os.path.walk(name, _visit_pyfiles, list)
	348	return list
	349	elif os.path.exists(name):
	350	# a single file
	351	return [name]
	352
	353	return []
	354
	355
	356
	357	class TokenEater:
	358	def __init__(self, options):
	359	self.__options = options
	360	self.__messages = {}
	361	self.__state = self.__waiting
	362	self.__data = []
	363	self.__lineno = -1
	364	self.__freshmodule = 1
	365	self.__curfile = None
	366
	367	def __call__(self, ttype, tstring, stup, etup, line):
	368	# dispatch
	369	## import token
	370	## print >> sys.stderr, 'ttype:', token.tok_name[ttype], \
	371	## 'tstring:', tstring
	372	self.__state(ttype, tstring, stup[0])
	373
	374	def __waiting(self, ttype, tstring, lineno):
	375	opts = self.__options
	376	# Do docstring extractions, if enabled
	377	if opts.docstrings and not opts.nodocstrings.get(self.__curfile):
	378	# module docstring?
	379	if self.__freshmodule:
	380	if ttype == tokenize.STRING:
	381	self.__addentry(safe_eval(tstring), lineno, isdocstring=1)
	382	self.__freshmodule = 0
	383	elif ttype not in (tokenize.COMMENT, tokenize.NL):
	384	self.__freshmodule = 0
	385	return
	386	# class docstring?
	387	if ttype == tokenize.NAME and tstring in ('class', 'def'):
	388	self.__state = self.__suiteseen
	389	return
	390	if ttype == tokenize.NAME and tstring in opts.keywords:
	391	self.__state = self.__keywordseen
	392
	393	def __suiteseen(self, ttype, tstring, lineno):
	394	# ignore anything until we see the colon
	395	if ttype == tokenize.OP and tstring == ':':
	396	self.__state = self.__suitedocstring
	397
	398	def __suitedocstring(self, ttype, tstring, lineno):
	399	# ignore any intervening noise
	400	if ttype == tokenize.STRING:
	401	self.__addentry(safe_eval(tstring), lineno, isdocstring=1)
	402	self.__state = self.__waiting
	403	elif ttype not in (tokenize.NEWLINE, tokenize.INDENT,
	404	tokenize.COMMENT):
	405	# there was no class docstring
	406	self.__state = self.__waiting
	407
	408	def __keywordseen(self, ttype, tstring, lineno):
	409	if ttype == tokenize.OP and tstring == '(':
	410	self.__data = []
	411	self.__lineno = lineno
	412	self.__state = self.__openseen
	413	else:
	414	self.__state = self.__waiting
	415
	416	def __openseen(self, ttype, tstring, lineno):
	417	if ttype == tokenize.OP and tstring == ')':
	418	# We've seen the last of the translatable strings. Record the
	419	# line number of the first line of the strings and update the list
	420	# of messages seen. Reset state for the next batch. If there
	421	# were no strings inside _(), then just ignore this entry.
	422	if self.__data:
	423	self.__addentry(EMPTYSTRING.join(self.__data))
	424	self.__state = self.__waiting
	425	elif ttype == tokenize.STRING:
	426	self.__data.append(safe_eval(tstring))
	427	elif ttype not in [tokenize.COMMENT, token.INDENT, token.DEDENT,
	428	token.NEWLINE, tokenize.NL]:
	429	# warn if we see anything else than STRING or whitespace
	430	print >> sys.stderr, _(
	431	'*** %(file)s:%(lineno)s: Seen unexpected token "%(token)s"'
	432	) % {
	433	'token': tstring,
	434	'file': self.__curfile,
	435	'lineno': self.__lineno
	436	}
	437	self.__state = self.__waiting
	438
	439	def __addentry(self, msg, lineno=None, isdocstring=0):
	440	if lineno is None:
	441	lineno = self.__lineno
	442	if not msg in self.__options.toexclude:
	443	entry = (self.__curfile, lineno)
	444	self.__messages.setdefault(msg, {})[entry] = isdocstring
	445
	446	def set_filename(self, filename):
	447	self.__curfile = filename
	448	self.__freshmodule = 1
	449
	450	def write(self, fp):
	451	options = self.__options
	452	timestamp = time.strftime('%Y-%m-%d %H:%M+%Z')
	453	# The time stamp in the header doesn't have the same format as that
	454	# generated by xgettext...
	455	print >> fp, pot_header % {'time': timestamp, 'version': __version__}
	456	# Sort the entries. First sort each particular entry's keys, then
	457	# sort all the entries by their first item.
	458	reverse = {}
	459	for k, v in self.__messages.items():
	460	keys = v.keys()
	461	keys.sort()
	462	reverse.setdefault(tuple(keys), []).append((k, v))
	463	rkeys = reverse.keys()
	464	rkeys.sort()
	465	for rkey in rkeys:
	466	rentries = reverse[rkey]
	467	rentries.sort()
	468	for k, v in rentries:
	469	isdocstring = 0
	470	# If the entry was gleaned out of a docstring, then add a
	471	# comment stating so. This is to aid translators who may wish
	472	# to skip translating some unimportant docstrings.
	473	if reduce(operator.__add__, v.values()):
	474	isdocstring = 1
	475	# k is the message string, v is a dictionary-set of (filename,
	476	# lineno) tuples. We want to sort the entries in v first by
	477	# file name and then by line number.
	478	v = v.keys()
	479	v.sort()
	480	if not options.writelocations:
	481	pass
	482	# location comments are different b/w Solaris and GNU:
	483	elif options.locationstyle == options.SOLARIS:
	484	for filename, lineno in v:
	485	d = {'filename': filename, 'lineno': lineno}
	486	print >>fp, _(
	487	'# File: %(filename)s, line: %(lineno)d') % d
	488	elif options.locationstyle == options.GNU:
	489	# fit as many locations on one line, as long as the
	490	# resulting line length doesn't exceeds 'options.width'
	491	locline = '#:'
	492	for filename, lineno in v:
	493	d = {'filename': filename, 'lineno': lineno}
	494	s = _(' %(filename)s:%(lineno)d') % d
	495	if len(locline) + len(s) <= options.width:
	496	locline = locline + s
	497	else:
	498	print >> fp, locline
	499	locline = "#:" + s
	500	if len(locline) > 2:
	501	print >> fp, locline
	502	if isdocstring:
	503	print >> fp, '#, docstring'
	504	print >> fp, 'msgid', normalize(k)
	505	print >> fp, 'msgstr ""\n'
	506
	507
	508
	509
	510	def main():
	511	global default_keywords
	512	try:
	513	opts, args = getopt.getopt(
	514	sys.argv[1:],
	515	'ad:DEhk:Kno:p:S:Vvw:x:X:',
	516	['extract-all', 'default-domain=', 'escape', 'help',
	517	'keyword=', 'no-default-keywords',
	518	'add-location', 'no-location', 'output=', 'output-dir=',
	519	'style=', 'verbose', 'version', 'width=', 'exclude-file=',
	520	'docstrings', 'no-docstrings',
	521	])
	522	except getopt.error, msg:
	523	usage(1, msg)
	524
	525	# for holding option values
	526	class Options:
	527	# constants
	528	GNU = 1
	529	SOLARIS = 2
	530	# defaults
	531	extractall = 0 # FIXME: currently this option has no effect at all.
	532	escape = 0
	533	keywords = []
	534	outpath = ''
	535	outfile = 'messages.pot'
	536	writelocations = 1
	537	locationstyle = GNU
	538	verbose = 0
	539	width = 78
	540	excludefilename = ''
	541	docstrings = 0
	542	nodocstrings = {}
	543
	544	options = Options()
	545	locations = {'gnu' : options.GNU,
	546	'solaris' : options.SOLARIS,
	547	}
	548
	549	# parse options
	550	for opt, arg in opts:
	551	if opt in ('-h', '--help'):
	552	usage(0)
	553	elif opt in ('-a', '--extract-all'):
	554	options.extractall = 1
	555	elif opt in ('-d', '--default-domain'):
	556	options.outfile = arg + '.pot'
	557	elif opt in ('-E', '--escape'):
	558	options.escape = 1
	559	elif opt in ('-D', '--docstrings'):
	560	options.docstrings = 1
	561	elif opt in ('-k', '--keyword'):
	562	options.keywords.append(arg)
	563	elif opt in ('-K', '--no-default-keywords'):
	564	default_keywords = []
	565	elif opt in ('-n', '--add-location'):
	566	options.writelocations = 1
	567	elif opt in ('--no-location',):
	568	options.writelocations = 0
	569	elif opt in ('-S', '--style'):
	570	options.locationstyle = locations.get(arg.lower())
	571	if options.locationstyle is None:
	572	usage(1, _('Invalid value for --style: %s') % arg)
	573	elif opt in ('-o', '--output'):
	574	options.outfile = arg
	575	elif opt in ('-p', '--output-dir'):
	576	options.outpath = arg
	577	elif opt in ('-v', '--verbose'):
	578	options.verbose = 1
	579	elif opt in ('-V', '--version'):
	580	print _('pygettext.py (xgettext for Python) %s') % __version__
	581	sys.exit(0)
	582	elif opt in ('-w', '--width'):
	583	try:
	584	options.width = int(arg)
	585	except ValueError:
	586	usage(1, _('--width argument must be an integer: %s') % arg)
	587	elif opt in ('-x', '--exclude-file'):
	588	options.excludefilename = arg
	589	elif opt in ('-X', '--no-docstrings'):
	590	fp = open(arg)
	591	try:
	592	while 1:
	593	line = fp.readline()
	594	if not line:
[391]	595	break
[2]	596	options.nodocstrings[line[:-1]] = 1
	597	finally:
	598	fp.close()
	599
	600	# calculate escapes
	601	make_escapes(not options.escape)
	602
	603	# calculate all keywords
	604	options.keywords.extend(default_keywords)
	605
	606	# initialize list of strings to exclude
	607	if options.excludefilename:
	608	try:
	609	fp = open(options.excludefilename)
	610	options.toexclude = fp.readlines()
	611	fp.close()
	612	except IOError:
	613	print >> sys.stderr, _(
	614	"Can't read --exclude-file: %s") % options.excludefilename
	615	sys.exit(1)
	616	else:
	617	options.toexclude = []
	618
	619	# resolve args to module lists
	620	expanded = []
	621	for arg in args:
	622	if arg == '-':
	623	expanded.append(arg)
	624	else:
	625	expanded.extend(getFilesForName(arg))
	626	args = expanded
	627
	628	# slurp through all the files
	629	eater = TokenEater(options)
	630	for filename in args:
	631	if filename == '-':
	632	if options.verbose:
	633	print _('Reading standard input')
	634	fp = sys.stdin
	635	closep = 0
	636	else:
	637	if options.verbose:
	638	print _('Working on %s') % filename
	639	fp = open(filename)
	640	closep = 1
	641	try:
	642	eater.set_filename(filename)
	643	try:
	644	tokenize.tokenize(fp.readline, eater)
	645	except tokenize.TokenError, e:
	646	print >> sys.stderr, '%s: %s, line %d, column %d' % (
	647	e[0], filename, e[1][0], e[1][1])
	648	finally:
	649	if closep:
	650	fp.close()
	651
	652	# write the output
	653	if options.outfile == '-':
	654	fp = sys.stdout
	655	closep = 0
	656	else:
	657	if options.outpath:
	658	options.outfile = os.path.join(options.outpath, options.outfile)
	659	fp = open(options.outfile, 'w')
	660	closep = 1
	661	try:
	662	eater.write(fp)
	663	finally:
	664	if closep:
	665	fp.close()
	666
	667
	668
# Script entry point.  The _() calls after main() are test strings kept as
# literal source text -- presumably so that running pygettext over its own
# source has something to extract; leave their wording untouched.
if __name__ == '__main__':
    main()
    # some more test strings
    _(u'a unicode string')
    # this one creates a warning
    _('*** Seen unexpected token "%(token)s"') % {'token': 'test'}
    _('more' 'than' 'one' 'string')

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: python/trunk/Tools/i18n/pygettext.py

Download in other formats: