Package pytils :: Module translit
[hide private]

Source Code for Module pytils.translit

  1  # -*- coding: utf-8 -*- 
  2  # -*- test-case-name: pytils.test.test_translit -*- 
  3  # pytils - simple processing for russian strings 
  4  # Copyright (C) 2006-2007  Yury Yurevich 
  5  # 
  6  # http://www.pyobject.ru/projects/pytils/ 
  7  # 
  8  # This program is free software; you can redistribute it and/or 
  9  # modify it under the terms of the GNU General Public License 
 10  # as published by the Free Software Foundation, version 2 
 11  # of the License. 
 12  # 
 13  # This program is distributed in the hope that it will be useful, 
 14  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 15  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 16  # GNU General Public License for more details. 
 17  """ 
 18  Simple transliteration 
 19  """ 
 20   
 21  __id__ = __revision__ = "$Id: translit.py 102 2007-07-12 12:33:36Z the.pythy $" 
 22  __url__ = "$URL: https://pythy.googlecode.com/svn/tags/pytils/0_2_2/pytils/translit.py $" 
 23   
 24  import re 
 25  from pytils import utils 
 26   
 27  TRANSTABLE = ( 
 28          (u"'", u"'"), 
 29          (u'"', u'"'), 
 30          (u"‘", u"'"), 
 31          (u"’", u"'"), 
 32          (u"«", u'"'), 
 33          (u"»", u'"'), 
 34          (u"–", u"-"), 
 35          (u"…", u"..."), 
 36          (u"№", u"#"), 
 37          ## верхний регистр 
 38          # трехбуквенные замены 
 39          (u"Щ", u"Sch"), 
 40          # при замене русский->английский будет первая замена, 
 41          # т.е. Sch 
 42          # а вот если английский->русский, то вариант SCH и Sch -- 
 43          # оба пройдут 
 44          (u"Щ", u"SCH"), 
 45          # двухбуквенные замены 
 46          (u"Ё", u"Yo"), 
 47          (u"Ё", u"YO"), 
 48          (u"Ж", u"Zh"), 
 49          (u"Ж", u"ZH"), 
 50          (u"Ц", u"Ts"), 
 51          (u"Ц", u"TS"), 
 52          (u"Ч", u"Ch"), 
 53          (u"Ч", u"CH"), 
 54          (u"Ш", u"Sh"), 
 55          (u"Ш", u"SH"), 
 56          (u"Ы", u"Yi"), 
 57          (u"Ы", u"YI"), 
 58          (u"Ю", u"Yu"), 
 59          (u"Ю", u"YU"), 
 60          (u"Я", u"Ya"), 
 61          (u"Я", u"YA"), 
 62          # однобуквенные замены 
 63          (u"А", u"A"), 
 64          (u"Б", u"B"), 
 65          (u"В", u"V"), 
 66          (u"Г", u"G"), 
 67          (u"Д", u"D"), 
 68          (u"Е", u"E"), 
 69          (u"З", u"Z"), 
 70          (u"И", u"I"), 
 71          (u"Й", u"J"), 
 72          (u"К", u"K"), 
 73          (u"Л", u"L"), 
 74          (u"М", u"M"), 
 75          (u"Н", u"N"), 
 76          (u"О", u"O"), 
 77          (u"П", u"P"), 
 78          (u"Р", u"R"), 
 79          (u"С", u"S"), 
 80          (u"Т", u"T"), 
 81          (u"У", u"U"), 
 82          (u"Ф", u"F"), 
 83          (u"Х", u"H"), 
 84          (u"Э", u"E"), 
 85          (u"Ъ", u"`"), 
 86          (u"Ь", u"'"), 
 87          ## нижний регистр 
 88          # трехбуквенные замены 
 89          (u"щ", u"sch"), 
 90          # двухбуквенные замены 
 91          (u"ё", u"yo"), 
 92          (u"ж", u"zh"), 
 93          (u"ц", u"ts"), 
 94          (u"ч", u"ch"), 
 95          (u"ш", u"sh"), 
 96          (u"ы", u"yi"), 
 97          (u"ю", u"yu"), 
 98          (u"я", u"ya"), 
 99          # однобуквенные замены 
100          (u"а", u"a"), 
101          (u"б", u"b"), 
102          (u"в", u"v"), 
103          (u"г", u"g"), 
104          (u"д", u"d"), 
105          (u"е", u"e"), 
106          (u"з", u"z"), 
107          (u"и", u"i"), 
108          (u"й", u"j"), 
109          (u"к", u"k"), 
110          (u"л", u"l"), 
111          (u"м", u"m"), 
112          (u"н", u"n"), 
113          (u"о", u"o"), 
114          (u"п", u"p"), 
115          (u"р", u"r"), 
116          (u"с", u"s"), 
117          (u"т", u"t"), 
118          (u"у", u"u"), 
119          (u"ф", u"f"), 
120          (u"х", u"h"), 
121          (u"э", u"e"), 
122          (u"ъ", u"`"), 
123          (u"ь", u"'"), 
124          # для полноты английского алфавит (в slugify) 
125          # дополняем английскими буквами, которых 
126          # не в парах 
127          (u"c", u"c"), 
128          (u"q", u"q"), 
129          (u"y", u"y"), 
130          (u"x", u"x"), 
131          (u"w", u"w"), 
132          (u"1", u"1"), 
133          (u"2", u"2"), 
134          (u"3", u"3"), 
135          (u"4", u"4"), 
136          (u"5", u"5"), 
137          (u"6", u"6"), 
138          (u"7", u"7"), 
139          (u"8", u"8"), 
140          (u"9", u"9"), 
141          (u"0", u"0"), 
142          )  #: Translation table 
143   
# Split the (russian, latin) pairs of TRANSTABLE into two parallel lists;
# entries keep the table order and duplicates are preserved.
#: Russian alphabet that we can translate /
#: English alphabet that we can detransliterate
RU_ALPHABET, EN_ALPHABET = map(list, zip(*TRANSTABLE))
#: Alphabet that we can (de)transliterate
ALPHABET = RU_ALPHABET + EN_ALPHABET
147   
148   
def translify(in_string):
    """
    Transliterate russian text into latin characters

    Every pair of L{TRANSTABLE} is applied in table order, so the
    multi-letter replacements listed first win over single-letter ones.

    @param in_string: input string
    @type in_string: C{unicode}

    @return: transliterated string
    @rtype: C{str}

    @raise TypeError: when in_string is not C{unicode}
    @raise ValueError: when string doesn't transliterate completely
    """
    # NOTE(review): check_type receives the argument *name*, so it presumably
    # looks the value up in this frame -- keep the call directly in this
    # function and keep the parameter name unchanged.
    utils.check_type('in_string', unicode)

    result = in_string
    for cyrillic, latin in TRANSTABLE:
        result = result.replace(cyrillic, latin)

    # Converting to a byte string fails with UnicodeEncodeError when a
    # character that the table could not replace is still present.
    try:
        return str(result)
    except UnicodeEncodeError:
        raise ValueError("Unicode string doesn't transliterate completely, "
                         "is it russian?")
175 176
def detranslify(in_string):
    """
    Detranslify: turn transliterated (latin) text back into russian

    Every pair of L{TRANSTABLE} is applied in table order, replacing the
    latin side of the pair with its russian side.

    @param in_string: input string
    @type in_string: C{basestring}

    @return: detransliterated string
    @rtype: C{unicode}

    @raise TypeError: when in_string neither C{str}, no C{unicode}
    @raise ValueError: if in_string is C{str}, but it isn't ascii
    """
    # NOTE(review): check_type receives the argument *name*, so it presumably
    # looks the value up in this frame -- keep the call directly in this
    # function and keep the parameter name unchanged.
    utils.check_type('in_string', basestring)

    # work on a unicode copy; a non-ascii byte string cannot be converted
    try:
        result = unicode(in_string)
    except UnicodeDecodeError:
        raise ValueError("We expects when in_string is str type,"
                         "it is an ascii, but now it isn't. Use unicode "
                         "in this case.")

    for cyrillic, latin in TRANSTABLE:
        result = result.replace(latin, cyrillic)
    return result
204 205
def slugify(in_string):
    """
    Prepare string for slug (i.e. URL or file/dir name)

    The input is lower-cased, C{&} / C{&amp;} become the word "and",
    runs of whitespace and hyphens collapse into a single hyphen, symbols
    unknown to L{ALPHABET} are dropped and the rest is transliterated
    with L{translify}.

    @param in_string: input string
    @type in_string: C{basestring}

    @return: slug-string
    @rtype: C{str}

    @raise TypeError: when in_string isn't C{unicode} or C{str}
    @raise ValueError: if in_string is C{str}, but it isn't ascii
    """
    # NOTE(review): check_type receives the argument *name*, so it presumably
    # looks the value up in this frame -- keep the call directly in this
    # function and keep the parameter name unchanged.
    utils.check_type('in_string', basestring)
    try:
        u_in_string = unicode(in_string).lower()
    except UnicodeDecodeError:
        raise ValueError("We expects when in_string is str type," + \
                         "it is an ascii, but now it isn't. Use unicode " + \
                         "in this case.")
    # convert & (and its HTML entity) to "and"; the surrounding spaces are
    # turned into hyphens by the next step
    u_in_string = re.sub(r'&amp;|&', ' and ', u_in_string)
    # collapse every run of whitespace/hyphens into a single hyphen
    u_in_string = re.sub(r'[-\s]+', '-', u_in_string)
    # remove symbols that are not in the alphabet
    u_in_string = u''.join(symb for symb in u_in_string if symb in ALPHABET)
    # translify it
    out_string = translify(u_in_string)
    # remove what is left of non-word symbols (quotes, '#', '`', dots)
    return re.sub(r'[^\w\s-]', '', out_string).strip().lower()
236 237
def dirify(in_string):
    """
    Alias for L{slugify}

    @param in_string: input string
    @type in_string: C{basestring}

    @return: slug-string
    @rtype: C{str}

    @raise TypeError: when in_string isn't C{unicode} or C{str}
    @raise ValueError: if in_string is C{str}, but it isn't ascii
    """
    # the result must be returned: without it the alias always yields None
    return slugify(in_string)
243