1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 """This module contains all the common features for languages.
23
24 Supported features
25 ==================
26 - language code (km, af)
27 - language name (Khmer, Afrikaans)
28 - Plurals
29 - Number of plurals (nplurals)
30 - Plural equation
31 - pofilter tests to ignore
32
33 Segmentation
34 ------------
35 - characters
36 - words
37 - sentences
38
39 TODOs and Ideas for possible features
40 =====================================
41 - Language-Team information
42 - Segmentation
43 - phrases
44
45 Punctuation
46 -----------
47 - End of sentence
48 - Start of sentence
49 - Middle of sentence
50 - Quotes
51 - single
52 - double
53
54 - Valid characters
55 - Accelerator characters
56 - Special characters
57 - Direction (rtl or ltr)
58 """
59
60 from translate.lang import data
61 import re
62
64 """This class is the common parent class for all language classes."""
65
66 code = ""
67 """The ISO 639 language code, possibly with a country specifier or other
68 modifier.
69
70 Examples::
71 km
72 pt_BR
73 sr_YU@Latn
74 """
75
76 fullname = ""
77 """The full (English) name of this language.
78
79 Dialect codes should have the form of
80 - Khmer
81 - Portuguese (Brazil)
82 - TODO: sr_YU@Latn?
83 """
84
85 nplurals = 0
86 """The number of plural forms of this language.
87
88 0 is not a valid value - it must be overridden.
89 Any positive integer is valid (it should probably be between 1 and 6)
90 @see: L{data}
91 """
92
93 pluralequation = "0"
94 """The plural equation for selection of plural forms.
95
96 This is used for PO files to fill into the header.
97 @see: U{Gettext manual<http://www.gnu.org/software/gettext/manual/html_node/gettext_150.html#Plural-forms>}
98 @see: L{data}
99 """
100
101
102
103 listseperator = u", "
104 """This string is used to separate lists of textual elements. Most
105 languages probably can stick with the default comma, but Arabic and some
106 Asian languages might want to override this."""
107
108 commonpunc = u".,;:!?-@#$%^*_()[]{}/\\'`\"<>"
109 """These punctuation marks are common in English and most languages that
110 use Latin script."""
111
112 quotes = u"‘’‛“”„‟′″‴‵‶‷‹›«»"
113 """These are different quotation marks used by various languages."""
114
115 invertedpunc = u"¿¡"
116 """Inverted punctuation sometimes used at the beginning of sentences in
117 Spanish, Asturian, Galician, and Catalan."""
118
119 rtlpunc = u"،؟؛÷"
120 """These punctuation marks are used by Arabic and Persian, for example."""
121
122 CJKpunc = u"。、,;!?「」『』【】"
123 """These punctuation marks are used in certain circumstances with CJK
124 languages."""
125
126 indicpunc = u"।॥॰"
127 """These punctuation marks are used by several Indic languages."""
128
129 ethiopicpunc = u"።፤፣"
130 """These punctuation marks are used by several Ethiopic languages."""
131
132 miscpunc = u"…±°¹²³·©®×£¥€"
133 """The middle dot (·) is used by Greek and Georgian."""
134
135 punctuation = u"".join([commonpunc, quotes, invertedpunc, rtlpunc, CJKpunc,\
136 indicpunc, ethiopicpunc, miscpunc])
137 """We include many types of punctuation here, simply since this is only
138 meant to determine if something is punctuation. Hopefully we catch some
139 languages which might not be represented with modules. Most languages won't
140 need to override this."""
141
142 sentenceend = u".!?…։؟।。!?።"
143 """These marks can indicate a sentence end. Once again we try to account
144 for many languages. Most languages won't need to override this."""
145
146
147
148
149
150 sentencere = re.compile(r"""(?s) #make . also match newlines
151 .*? #anything, but match non-greedy
152 [%s] #the puntuation for sentence ending
153 \s+ #the spacing after the puntuation
154 (?=[^a-z\d])#lookahead that next part starts with caps
155 """ % sentenceend, re.VERBOSE)
156
157 puncdict = {}
158 """A dictionary of punctuation transformation rules that can be used by
159 punctranslate()."""
160
161 ignoretests = []
162 """List of pofilter tests for this language that must be ignored."""
163
164 checker = None
165 """A language specific checker (see filters.checks).
166
167 This doesn't need to be supplied, but will be used if it exists."""
168
169 _languages = {}
170
171 validaccel = None
172 """Characters that can be used as accelerators (access keys) i.e. Alt+X
173 where X is the accelerator. These can include combining diacritics as
174 long as they are accessible from the user's keyboard in a single keystroke,
175 but normally they would be at least precomposed characters. All characters,
176 lower and upper, are included in the list."""
177
178 validdoublewords = []
179 """Some languages allow double words in certain cases. This is a list
180 of such words."""
181
203
205 memo[id(self)] = self
206 return self
207
def __repr__(self):
    """Return a short description of this object without its address.

    Leaving out the memory address keeps the text stable, so it can be
    stored and compared with a representation produced later.

    @return: C{"<class 'translate.lang.common.Common'>"}, with the
        language code appended in parentheses when C{self.code} is set
    """
    if self.code:
        suffix = "(%s)" % self.code
    else:
        suffix = ""
    return "<class 'translate.lang.common.Common%s'>" % suffix
215
240 punctranslate = classmethod(punctranslate)
241
def length_difference(cls, length):
    """Estimate how much longer a translation is likely to be than an
    English source string of the given length.

    The expansion factor is looked up in C{data.expansion_factors} for
    C{cls.code} and then for each code returned by repeatedly calling
    C{data.simplercode}. If no non-zero factor is found, 0.1 is used.

    @param length: length of the English source string
    @return: estimated number of extra characters, as an int; never less
        than the constant part, which is at least 5
    """
    expansion_factor = 0
    code = cls.code
    while code:
        # Look up the code currently being tried. Using cls.code here would
        # repeat the same failed lookup and never consult the simpler codes.
        expansion_factor = data.expansion_factors.get(code, 0)
        if expansion_factor:
            break
        code = data.simplercode(code)
    else:
        # Reached only when the loop ended without finding a factor.
        expansion_factor = 0.1
    constant = max(5, int(40 * expansion_factor))

    return constant + int(expansion_factor * length)
length_difference = classmethod(length_difference)
260
def alter_length(cls, text):
    """Lengthen or shorten the given string to approximate the length of a
    translation, with English assumed as the source language.

    The text is split on blank lines (C{"\\n\\n"}) and each part is handled
    on its own. Parts of nine characters or fewer are left untouched. For
    longer parts, C{cls.length_difference} decides the change: a positive
    value prepends that many leading characters (newlines removed) to the
    part, otherwise that many leading characters are dropped.

    @param text: the (unicode) source string
    @return: the altered string, with parts re-joined by blank lines
    """
    def alter_it(part):
        size = len(part)
        if size <= 9:
            # Short strings are not worth altering.
            return part
        extra = cls.length_difference(size)
        if extra > 0:
            return part[:extra].replace(u'\n', u'') + part
        # extra <= 0: drop -extra leading characters (none when extra is 0).
        return part[-extra:]

    return u"\n\n".join([alter_it(part) for part in text.split(u"\n\n")])
alter_length = classmethod(alter_length)
280
def character_iter(cls, text):
    """Return an iterator over the characters in text.

    Characters found in C{cls.punctuation} are not yielded, and a
    whitespace character that directly follows another whitespace
    character is skipped, so a run of whitespace yields only its first
    character.

    @param text: the string to iterate over
    """
    # Start with a non-space placeholder so leading whitespace is yielded.
    previous = 'A'
    for char in text:
        if char.isspace() and previous.isspace():
            continue
        # Punctuation also updates 'previous', so whitespace after a
        # punctuation mark is still yielded.
        previous = char
        if char not in cls.punctuation:
            yield char
character_iter = classmethod(character_iter)
292
296 characters = classmethod(characters)
297
def word_iter(cls, text):
    """Return an iterator over the words in text.

    The text is split on whitespace and each piece has leading and
    trailing characters from C{cls.punctuation} stripped. Pieces that
    consist only of punctuation are left out.

    @param text: the string to split into words
    """
    for piece in text.split():
        word = piece.strip(cls.punctuation)
        if word:
            yield word
word_iter = classmethod(word_iter)
306
def words(cls, text):
    """Return a list of the words in text, as produced by
    C{cls.word_iter}.

    @param text: the string to split into words
    """
    return list(cls.word_iter(text))
words = classmethod(words)
311
313 """Returns an iterator over the sentences in text."""
314 lastmatch = 0
315 text = text or ""
316 for item in cls.sentencere.finditer(text):
317 lastmatch = item.end()
318 sentence = item.group()
319 if strip:
320 sentence = sentence.strip()
321 if sentence:
322 yield sentence
323 remainder = text[lastmatch:]
324 if strip:
325 remainder = remainder.strip()
326 if remainder:
327 yield remainder
328 sentence_iter = classmethod(sentence_iter)
329
331 """Returns a list of sentences in text."""
332 return [s for s in cls.sentence_iter(text, strip=strip)]
333 sentences = classmethod(sentences)
334
def capsstart(cls, text):
    """Determine whether the text starts with a capital letter.

    Leading whitespace is removed first, then any leading characters
    from C{cls.punctuation}; the first remaining character is tested.

    @param text: the string to examine
    @return: True if the first remaining character is upper case, False
        otherwise (including when nothing remains)
    """
    stripped = text.lstrip().lstrip(cls.punctuation)
    # bool() keeps the result a real boolean for empty input, rather than
    # returning the empty string itself.
    return bool(stripped) and stripped[0].isupper()
capsstart = classmethod(capsstart)
340