Context Navigation

gettext.py@ 403

Last change on this file since 403 was 391, checked in by dmik, 11 years ago
python: Merge vendor 2.7.6 to trunk.
Property svn:eol-style set to `native`
File size: 19.6 KB

Line
1	"""Internationalization and localization support.
2
3	This module provides internationalization (I18N) and localization (L10N)
4	support for your Python programs by providing an interface to the GNU gettext
5	message catalog library.
6
7	I18N refers to the operation by which a program is made aware of multiple
8	languages. L10N refers to the adaptation of your program, once
9	internationalized, to the local language and cultural habits.
10
11	"""
12
13	# This module represents the integration of work, contributions, feedback, and
14	# suggestions from the following people:
15	#
16	# Martin von Loewis, who wrote the initial implementation of the underlying
17	# C-based libintlmodule (later renamed _gettext), along with a skeletal
18	# gettext.py implementation.
19	#
20	# Peter Funk, who wrote fintl.py, a fairly complete wrapper around intlmodule,
21	# which also included a pure-Python implementation to read .mo files if
22	# intlmodule wasn't available.
23	#
24	# James Henstridge, who also wrote a gettext.py module, which has some
25	# interesting, but currently unsupported experimental features: the notion of
26	# a Catalog class and instances, and the ability to add to a catalog file via
27	# a Python API.
28	#
29	# Barry Warsaw integrated these modules, wrote the .install() API and code,
30	# and conformed all C and Python code to Python's coding standards.
31	#
32	# Francois Pinard and Marc-Andre Lemburg also contributed valuably to this
33	# module.
34	#
35	# J. David Ibanez implemented plural forms. Bruno Haible fixed some bugs.
36	#
37	# TODO:
38	# - Lazy loading of .mo files. Currently the entire catalog is loaded into
39	# memory, but that's probably bad for large translated programs. Instead,
40	# the lexical sort of original strings in GNU .mo files should be exploited
41	# to do binary searches and lazy initializations. Or you might want to use
42	# the undocumented double-hash algorithm for .mo files with hash tables, but
43	# you'll need to study the GNU gettext code to do this.
44	#
45	# - Support Solaris .mo file formats. Unfortunately, we've been unable to
46	# find this format documented anywhere.
47
48
49	import locale, copy, os, re, struct, sys
50	from errno import ENOENT
51
52
# Names exported by ``from gettext import *``.
__all__ = [
    'NullTranslations', 'GNUTranslations', 'Catalog',
    'find', 'translation', 'install', 'textdomain', 'bindtextdomain',
    'dgettext', 'dngettext', 'gettext', 'ngettext',
]

# Locale directory used when a caller does not supply one.
_default_localedir = os.path.join(sys.prefix, 'share', 'locale')
59
60
61	def test(condition, true, false):
62	"""
63	Implements the C expression:
64
65	condition ? true : false
66
67	Required to correctly interpret plural forms.
68	"""
69	if condition:
70	return true
71	else:
72	return false
73
74
def c2py(plural):
    """Translate a C plural-forms expression into a Python callable.

    ``plural`` is the expression text used in PO files for plural forms, for
    example ``n != 1`` or ``n > 1 ? 1 : 0``.  The return value is a function
    of one argument ``n`` that evaluates the equivalent Python expression
    and converts the result with int().

    Raises ValueError when the expression cannot be tokenized, when it
    contains any identifier other than ``n``, or when a closing parenthesis
    has no matching opening one.
    """
    # Security check, allow only the "n" identifier: the expression is
    # handed to eval() at the end of this function.
    try:
        from cStringIO import StringIO
    except ImportError:
        try:
            from StringIO import StringIO
        except ImportError:
            from io import StringIO
    import token, tokenize
    tokens = tokenize.generate_tokens(StringIO(plural).readline)
    try:
        danger = [x for x in tokens if x[0] == token.NAME and x[1] != 'n']
    except tokenize.TokenError:
        raise ValueError(
            'plural forms expression error, maybe unbalanced parenthesis')
    if danger:
        raise ValueError('plural forms expression could be dangerous')

    # Replace some C operators by their Python equivalents
    plural = plural.replace('&&', ' and ')
    plural = plural.replace('||', ' or ')

    # "!x" becomes " not x"; the [^=] guard leaves "!=" alone.
    expr = re.compile(r'\!([^=])')
    plural = expr.sub(' not \\1', plural)

    # Regular expression and replacement function used to transform
    # "a?b:c" to "test(a,b,c)".  The first two groups are non-greedy so the
    # leftmost "?" and the first ":" after it are paired; the remainder is
    # processed recursively to handle chained conditionals.
    expr = re.compile(r'(.*?)\?(.*?):(.*)')
    def repl(x):
        return "test(%s, %s, %s)" % (x.group(1), x.group(2),
                                     expr.sub(repl, x.group(3)))

    # Code to transform the plural expression, taking care of parentheses:
    # each "(" opens a new fragment, each ")" rewrites the finished fragment
    # and appends it, parenthesized, to the enclosing one.
    stack = ['']
    for c in plural:
        if c == '(':
            stack.append('')
        elif c == ')':
            if len(stack) == 1:
                # Defensive guard: unbalanced parentheses are expected to be
                # rejected by the tokenizer check at the beginning.
                raise ValueError('unbalanced parenthesis in plural form')
            s = expr.sub(repl, stack.pop())
            stack[-1] += '(%s)' % s
        else:
            stack[-1] += c
    plural = expr.sub(repl, stack.pop())

    # NOTE(review): eval() runs on caller-supplied text; the identifier
    # check above is the only filtering done in this function.  The
    # generated code looks up the module-level test() helper by name.
    return eval('lambda n: int(%s)' % plural)
127
128
129
def _expand_lang(locale):
    """Return the lookup names derived from ``locale``, fullest name first.

    The name is normalized with locale.normalize() and then split into a
    language, a territory (``_...``), a codeset (``....``) and a modifier
    (``@...``).  The result lists every combination of the components that
    are present, starting with all of them and ending with the bare
    language.
    """
    from locale import normalize
    locale = normalize(locale)
    COMPONENT_CODESET = 1 << 0
    COMPONENT_TERRITORY = 1 << 1
    COMPONENT_MODIFIER = 1 << 2
    # split up the locale into its base components
    mask = 0
    pos = locale.find('@')
    if pos >= 0:
        modifier = locale[pos:]
        locale = locale[:pos]
        mask |= COMPONENT_MODIFIER
    else:
        modifier = ''
    pos = locale.find('.')
    if pos >= 0:
        codeset = locale[pos:]
        locale = locale[:pos]
        mask |= COMPONENT_CODESET
    else:
        codeset = ''

    # use new os2 code to get current codepage
    if pos == -1 and os.name == "os2":
        import _locale
        # NOTE(review): unlike the branch above, this value is used exactly
        # as returned, with no leading '.' added here -- confirm what
        # _locale._getdefaultlocale() supplies on this platform.
        codeset = _locale._getdefaultlocale()[1]
        mask |= COMPONENT_CODESET

    pos = locale.find('_')
    if pos >= 0:
        territory = locale[pos:]
        locale = locale[:pos]
        mask |= COMPONENT_TERRITORY
    else:
        territory = ''
    language = locale
    ret = []
    for i in range(mask+1):
        if not (i & ~mask):  # if all components for this combo exist ...
            val = language
            if i & COMPONENT_TERRITORY: val += territory
            if i & COMPONENT_CODESET:   val += codeset
            if i & COMPONENT_MODIFIER:  val += modifier
            ret.append(val)
    # Built from least to most specific; callers want the opposite order.
    ret.reverse()
    return ret
177
178
179
class NullTranslations:
    """Translation object that leaves messages untranslated.

    Every lookup is handed to the fallback translation when one has been
    attached with add_fallback(); otherwise the message itself (or the msgid
    selected by ``n == 1``) is returned.
    """

    def __init__(self, fp=None):
        """Set up empty state and parse ``fp`` when one is supplied."""
        self._fallback = None
        self._output_charset = None
        self._charset = None
        self._info = {}
        if fp is not None:
            self._parse(fp)

    def _parse(self, fp):
        """Do nothing; subclasses read their catalog from ``fp`` here."""
        pass

    def add_fallback(self, fallback):
        """Attach ``fallback`` at the end of the fallback chain."""
        if not self._fallback:
            self._fallback = fallback
        else:
            self._fallback.add_fallback(fallback)

    def gettext(self, message):
        """Return the fallback's gettext() result, or ``message`` itself."""
        if not self._fallback:
            return message
        return self._fallback.gettext(message)

    def lgettext(self, message):
        """Return the fallback's lgettext() result, or ``message`` itself."""
        if not self._fallback:
            return message
        return self._fallback.lgettext(message)

    def ngettext(self, msgid1, msgid2, n):
        """Return the fallback's ngettext() result, else msgid1 when
        ``n == 1`` and msgid2 otherwise."""
        if self._fallback:
            return self._fallback.ngettext(msgid1, msgid2, n)
        return msgid1 if n == 1 else msgid2

    def lngettext(self, msgid1, msgid2, n):
        """Return the fallback's lngettext() result, else msgid1 when
        ``n == 1`` and msgid2 otherwise."""
        if self._fallback:
            return self._fallback.lngettext(msgid1, msgid2, n)
        return msgid1 if n == 1 else msgid2

    def ugettext(self, message):
        """Return the fallback's ugettext() result, or ``message`` converted
        with unicode()."""
        if not self._fallback:
            return unicode(message)
        return self._fallback.ugettext(message)

    def ungettext(self, msgid1, msgid2, n):
        """Return the fallback's ungettext() result, else the msgid selected
        by ``n == 1`` converted with unicode()."""
        if self._fallback:
            return self._fallback.ungettext(msgid1, msgid2, n)
        return unicode(msgid1 if n == 1 else msgid2)

    def info(self):
        """Return the ``_info`` dictionary."""
        return self._info

    def charset(self):
        """Return the ``_charset`` attribute."""
        return self._charset

    def output_charset(self):
        """Return the charset last given to set_output_charset(), if any."""
        return self._output_charset

    def set_output_charset(self, charset):
        """Record ``charset`` as the output charset."""
        self._output_charset = charset

    def install(self, unicode=False, names=None):
        """Bind ``_`` in the builtins namespace to this object's lookup.

        ``_`` becomes ugettext when ``unicode`` is true, gettext otherwise.
        When ``names`` supports ``in``, each of "gettext", "ngettext",
        "lgettext" and "lngettext" that it contains is installed as well.
        """
        import __builtin__
        namespace = __builtin__.__dict__
        namespace['_'] = self.ugettext if unicode else self.gettext
        if not hasattr(names, "__contains__"):
            return
        if "gettext" in names:
            namespace['gettext'] = namespace['_']
        if "ngettext" in names:
            namespace['ngettext'] = (self.ungettext if unicode
                                     else self.ngettext)
        if "lgettext" in names:
            namespace['lgettext'] = self.lgettext
        if "lngettext" in names:
            namespace['lngettext'] = self.lngettext
262
263
class GNUTranslations(NullTranslations):
    """Translations read from a GNU gettext .mo catalog.

    Singular messages are stored in ``self._catalog`` under their msgid;
    plural messages are stored under ``(msgid1, index)`` keys, where
    ``index`` is the value computed by ``self.plural``.
    """

    # Magic number of .mo files
    LE_MAGIC = 0x950412de
    BE_MAGIC = 0xde120495

    def _parse(self, fp):
        """Override this method to support alternative .mo formats.

        Reads all of ``fp``, fills ``self._catalog`` with its messages, and
        sets ``self._info``, ``self._charset`` and ``self.plural`` from the
        metadata entry (the entry whose msgid is empty).

        Raises IOError when the magic number is not recognized or when a
        string-table entry points outside the data that was read.
        """
        unpack = struct.unpack
        filename = getattr(fp, 'name', '')
        # Parse the .mo file header, which consists of 5 little endian 32
        # bit words.
        self._catalog = catalog = {}
        self.plural = lambda n: int(n != 1)  # germanic plural by default
        buf = fp.read()
        buflen = len(buf)
        # Are we big endian or little endian?
        magic = unpack('<I', buf[:4])[0]
        if magic == self.LE_MAGIC:
            version, msgcount, masteridx, transidx = unpack('<4I', buf[4:20])
            ii = '<II'
        elif magic == self.BE_MAGIC:
            version, msgcount, masteridx, transidx = unpack('>4I', buf[4:20])
            ii = '>II'
        else:
            raise IOError(0, 'Bad magic number', filename)
        # Now put all messages from the .mo file buffer into the catalog
        # dictionary.
        for i in xrange(0, msgcount):
            mlen, moff = unpack(ii, buf[masteridx:masteridx+8])
            mend = moff + mlen
            tlen, toff = unpack(ii, buf[transidx:transidx+8])
            tend = toff + tlen
            if mend < buflen and tend < buflen:
                msg = buf[moff:mend]
                tmsg = buf[toff:tend]
            else:
                raise IOError(0, 'File is corrupt', filename)
            # See if we're looking at GNU .mo conventions for metadata
            if mlen == 0:
                # Catalog description
                lastk = None
                for item in tmsg.splitlines():
                    item = item.strip()
                    if not item:
                        continue
                    # Reset for every line: a continuation line has no key
                    # of its own and must not re-trigger the handling of
                    # the previous header (whose ``v`` may by then have
                    # been replaced by a list).
                    k = v = None
                    if ':' in item:
                        k, v = item.split(':', 1)
                        k = k.strip().lower()
                        v = v.strip()
                        self._info[k] = v
                        lastk = k
                    elif lastk:
                        self._info[lastk] += '\n' + item
                    if k == 'content-type':
                        self._charset = v.split('charset=')[1]
                    elif k == 'plural-forms':
                        v = v.split(';')
                        plural = v[1].split('plural=')[1]
                        self.plural = c2py(plural)
            # Note: we unconditionally convert both msgids and msgstrs to
            # Unicode using the character encoding specified in the charset
            # parameter of the Content-Type header.  The gettext documentation
            # strongly encourages msgids to be us-ascii, but some applications
            # require alternative encodings (e.g. Zope's ZCML and ZPT).  For
            # traditional gettext applications, the msgid conversion will
            # cause no problems since us-ascii should always be a subset of
            # the charset encoding.  We may want to fall back to 8-bit msgids
            # if the Unicode conversion fails.
            if '\x00' in msg:
                # Plural forms
                msgid1, msgid2 = msg.split('\x00')
                tmsg = tmsg.split('\x00')
                if self._charset:
                    msgid1 = unicode(msgid1, self._charset)
                    tmsg = [unicode(x, self._charset) for x in tmsg]
                for index, form in enumerate(tmsg):
                    catalog[(msgid1, index)] = form
            else:
                if self._charset:
                    msg = unicode(msg, self._charset)
                    tmsg = unicode(tmsg, self._charset)
                catalog[msg] = tmsg
            # advance to next entry in the seek tables
            masteridx += 8
            transidx += 8

    def gettext(self, message):
        """Return the translation of ``message``.

        The result is encoded with the output charset when one is set,
        otherwise with the catalog charset when known.  A message missing
        from the catalog goes to the fallback, or is returned unchanged.
        """
        missing = object()
        tmsg = self._catalog.get(message, missing)
        if tmsg is missing:
            if self._fallback:
                return self._fallback.gettext(message)
            return message
        # Encode the Unicode tmsg back to an 8-bit string, if possible
        if self._output_charset:
            return tmsg.encode(self._output_charset)
        elif self._charset:
            return tmsg.encode(self._charset)
        return tmsg

    def lgettext(self, message):
        """Like gettext(), but encode with the output charset when set and
        with locale.getpreferredencoding() otherwise."""
        missing = object()
        tmsg = self._catalog.get(message, missing)
        if tmsg is missing:
            if self._fallback:
                return self._fallback.lgettext(message)
            return message
        if self._output_charset:
            return tmsg.encode(self._output_charset)
        return tmsg.encode(locale.getpreferredencoding())

    def ngettext(self, msgid1, msgid2, n):
        """Return the plural-aware translation for count ``n``.

        The form is looked up under ``(msgid1, self.plural(n))`` and encoded
        as in gettext().  When the key is missing, the fallback is asked;
        without one, msgid1 is returned for ``n == 1`` and msgid2 otherwise.
        """
        try:
            tmsg = self._catalog[(msgid1, self.plural(n))]
        except KeyError:
            if self._fallback:
                return self._fallback.ngettext(msgid1, msgid2, n)
            if n == 1:
                return msgid1
            else:
                return msgid2
        if self._output_charset:
            return tmsg.encode(self._output_charset)
        elif self._charset:
            return tmsg.encode(self._charset)
        return tmsg

    def lngettext(self, msgid1, msgid2, n):
        """Like ngettext(), but encode with the output charset when set and
        with locale.getpreferredencoding() otherwise."""
        try:
            tmsg = self._catalog[(msgid1, self.plural(n))]
        except KeyError:
            if self._fallback:
                return self._fallback.lngettext(msgid1, msgid2, n)
            if n == 1:
                return msgid1
            else:
                return msgid2
        if self._output_charset:
            return tmsg.encode(self._output_charset)
        return tmsg.encode(locale.getpreferredencoding())

    def ugettext(self, message):
        """Return the catalog entry for ``message`` without encoding it.

        A missing message goes to the fallback, or is returned converted
        with unicode().
        """
        missing = object()
        tmsg = self._catalog.get(message, missing)
        if tmsg is missing:
            if self._fallback:
                return self._fallback.ugettext(message)
            return unicode(message)
        return tmsg

    def ungettext(self, msgid1, msgid2, n):
        """Return the plural-aware catalog entry without encoding it.

        A missing key goes to the fallback; without one, the msgid selected
        by ``n == 1`` is returned converted with unicode().
        """
        try:
            tmsg = self._catalog[(msgid1, self.plural(n))]
        except KeyError:
            if self._fallback:
                return self._fallback.ungettext(msgid1, msgid2, n)
            if n == 1:
                tmsg = unicode(msgid1)
            else:
                tmsg = unicode(msgid2)
        return tmsg
425
426
def find(domain, localedir=None, languages=None, all=0):
    """Locate a .mo file using the gettext strategy.

    ``localedir`` defaults to the module's default locale directory.  When
    ``languages`` is None it is taken from the first non-empty variable
    among LANGUAGE, LC_ALL, LC_MESSAGES and LANG (split on ':'), with 'C'
    appended if absent.  Each language is expanded with _expand_lang() and
    the candidates are probed in order as
    ``<localedir>/<lang>/LC_MESSAGES/<domain>.mo``; probing stops at 'C'.

    Returns the first existing path, or None when there is none.  With
    ``all`` true, returns the list of every existing path instead.
    """
    # Fill in the arguments the caller left out.
    if localedir is None:
        localedir = _default_localedir
    if languages is None:
        languages = []
        for name in ('LANGUAGE', 'LC_ALL', 'LC_MESSAGES', 'LANG'):
            setting = os.environ.get(name)
            if setting:
                languages = setting.split(':')
                break
        if 'C' not in languages:
            languages.append('C')

    # Normalize and expand the languages, dropping duplicates but keeping
    # the order of first appearance.
    candidates = []
    for requested in languages:
        for expanded in _expand_lang(requested):
            if expanded not in candidates:
                candidates.append(expanded)

    # Probe the candidates in order.
    found = []
    for lang in candidates:
        if lang == 'C':
            break
        path = os.path.join(localedir, lang, 'LC_MESSAGES', '%s.mo' % domain)
        if not os.path.exists(path):
            continue
        if not all:
            return path
        found.append(path)
    if all:
        return found
    return None
462
463
464
# Cache of parsed catalogs, keyed by (class, absolute .mo file path).
_translations = {}

def translation(domain, localedir=None, languages=None,
                class_=None, fallback=False, codeset=None):
    """Return a translation object for ``domain``.

    All catalogs reported by find() are loaded with ``class_`` (default
    GNUTranslations); the first becomes the result and each later one is
    added to it as a fallback.  When ``codeset`` is true it is set as the
    output charset of every object in the chain.

    When no catalog is found, a NullTranslations instance is returned if
    ``fallback`` is true; otherwise IOError(ENOENT, ...) is raised.
    """
    if class_ is None:
        class_ = GNUTranslations
    mofiles = find(domain, localedir, languages, all=1)
    if not mofiles:
        if not fallback:
            raise IOError(ENOENT, 'No translation file found for domain', domain)
        return NullTranslations()
    head = None
    for path in mofiles:
        # Open, read and parse each .mo file only once per class.
        cache_key = (class_, os.path.abspath(path))
        cached = _translations.get(cache_key)
        if cached is None:
            with open(path, 'rb') as fp:
                cached = _translations.setdefault(cache_key, class_(fp))
        # Work on a shallow copy so fallbacks and the output charset can be
        # set per call; all other instance data stays shared with the
        # cached object.
        current = copy.copy(cached)
        if codeset:
            current.set_output_charset(codeset)
        if head is None:
            head = current
        else:
            head.add_fallback(current)
    return head
497
498
def install(domain, localedir=None, unicode=False, codeset=None, names=None):
    """Look up the translation for ``domain`` and install it.

    The translation is obtained with ``fallback=True`` and the given
    ``codeset``; its install() method is then called with ``unicode`` and
    ``names``.
    """
    translation(domain, localedir, fallback=True,
                codeset=codeset).install(unicode, names)
502
503
504
# Locale directory registered for each domain by bindtextdomain().
_localedirs = {}
# Output codeset registered for each domain by bind_textdomain_codeset().
_localecodesets = {}
# Current global domain; 'messages' is used for compatibility with GNU
# gettext.
_current_domain = 'messages'
511
512
def textdomain(domain=None):
    """Return the current global domain, first replacing it with ``domain``
    when one is given."""
    global _current_domain
    if domain is None:
        return _current_domain
    _current_domain = domain
    return domain
518
519
def bindtextdomain(domain, localedir=None):
    """Return the locale directory bound to ``domain``.

    A ``localedir`` other than None is recorded for the domain first.  A
    domain with no recorded directory yields the default locale directory.
    """
    if localedir is not None:
        _localedirs[domain] = localedir
    try:
        return _localedirs[domain]
    except KeyError:
        return _default_localedir
525
526
def bind_textdomain_codeset(domain, codeset=None):
    """Return the codeset bound to ``domain``, or None when there is none.

    A ``codeset`` other than None is recorded for the domain first.
    """
    if codeset is not None:
        _localecodesets[domain] = codeset
    try:
        return _localecodesets[domain]
    except KeyError:
        return None
532
533
def dgettext(domain, message):
    """Return the gettext() translation of ``message`` from ``domain``.

    The locale directory and codeset bound to the domain are used.  When
    translation() raises IOError, ``message`` is returned unchanged.
    """
    localedir = _localedirs.get(domain, None)
    codeset = _localecodesets.get(domain)
    try:
        catalog = translation(domain, localedir, codeset=codeset)
    except IOError:
        return message
    return catalog.gettext(message)
541
def ldgettext(domain, message):
    """Return the lgettext() translation of ``message`` from ``domain``.

    The locale directory and codeset bound to the domain are used.  When
    translation() raises IOError, ``message`` is returned unchanged.
    """
    localedir = _localedirs.get(domain, None)
    codeset = _localecodesets.get(domain)
    try:
        catalog = translation(domain, localedir, codeset=codeset)
    except IOError:
        return message
    return catalog.lgettext(message)
549
def dngettext(domain, msgid1, msgid2, n):
    """Return the ngettext() translation for count ``n`` from ``domain``.

    The locale directory and codeset bound to the domain are used.  When
    translation() raises IOError, msgid1 is returned for ``n == 1`` and
    msgid2 otherwise.
    """
    localedir = _localedirs.get(domain, None)
    codeset = _localecodesets.get(domain)
    try:
        catalog = translation(domain, localedir, codeset=codeset)
    except IOError:
        return msgid1 if n == 1 else msgid2
    return catalog.ngettext(msgid1, msgid2, n)
560
def ldngettext(domain, msgid1, msgid2, n):
    """Return the lngettext() translation for count ``n`` from ``domain``.

    The locale directory and codeset bound to the domain are used.  When
    translation() raises IOError, msgid1 is returned for ``n == 1`` and
    msgid2 otherwise.
    """
    localedir = _localedirs.get(domain, None)
    codeset = _localecodesets.get(domain)
    try:
        catalog = translation(domain, localedir, codeset=codeset)
    except IOError:
        return msgid1 if n == 1 else msgid2
    return catalog.lngettext(msgid1, msgid2, n)
571
def gettext(message):
    """Return dgettext() of ``message`` in the current global domain."""
    domain = _current_domain
    return dgettext(domain, message)
574
def lgettext(message):
    """Return ldgettext() of ``message`` in the current global domain."""
    domain = _current_domain
    return ldgettext(domain, message)
577
def ngettext(msgid1, msgid2, n):
    """Return dngettext() for count ``n`` in the current global domain."""
    domain = _current_domain
    return dngettext(domain, msgid1, msgid2, n)
580
def lngettext(msgid1, msgid2, n):
    """Return ldngettext() for count ``n`` in the current global domain."""
    domain = _current_domain
    return ldngettext(domain, msgid1, msgid2, n)
583
584	# dcgettext() has been deemed unnecessary and is not implemented.
585
586	# James Henstridge's Catalog constructor from GNOME gettext. Documented usage
587	# was:
588	#
589	# import gettext
590	# cat = gettext.Catalog(PACKAGE, localedir=LOCALEDIR)
591	# _ = cat.gettext
592	# print _('Hello World')
593
# The resulting catalog object currently doesn't support access through a
# dictionary API, which was supported (but apparently unused) in GNOME
# gettext.
597
598	Catalog = translation

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: python/trunk/Lib/gettext.py@ 403

Download in other formats: