# Custom transliterations for Latin characters with accents, etc.
# Keys are code points (the value of ``ord(ch)``), values are the ASCII
# replacement text. baseNormalize consults this table before falling back to
# Unicode decomposition, so an entry here overrides the decomposed form.
mapping = {
    138: 's', 140: 'O', 142: 'z', 154: 's', 156: 'o', 158: 'z', 159: 'Y',
    # 195 is U+00C3 (capital A with tilde): it maps to an uppercase 'A' like
    # its neighbours 192-197, so that case is preserved.
    192: 'A', 193: 'A', 194: 'A', 195: 'A', 196: 'A', 197: 'A', 198: 'E',
    199: 'C', 200: 'E', 201: 'E', 202: 'E', 203: 'E', 204: 'I', 205: 'I',
    206: 'I', 207: 'I', 208: 'D', 209: 'N', 210: 'O', 211: 'O', 212: 'O',
    213: 'O', 214: 'O', 215: 'x', 216: 'O', 217: 'U', 218: 'U', 219: 'U',
    220: 'U', 221: 'Y', 223: 's', 224: 'a', 225: 'a', 226: 'a', 227: 'a',
    228: 'a', 229: 'a', 230: 'e', 231: 'c', 232: 'e', 233: 'e', 234: 'e',
    235: 'e', 236: 'i', 237: 'i', 238: 'i', 239: 'i', 240: 'd', 241: 'n',
    242: 'o', 243: 'o', 244: 'o', 245: 'o', 246: 'o', 248: 'o', 249: 'u',
    250: 'u', 251: 'u', 252: 'u', 253: 'y', 255: 'y',
    # 305 is U+0131, the dotless small i.
    305: 'i',
}
|---|
| | 19 | |
|---|
# string.whitespace has a non-standard implementation on OpenBSD
# (see http://dev.plone.org/plone/ticket/4704 for details), so only the
# characters inside the 7-bit ASCII range are kept.
whitespace = ''.join(filter(lambda char: ord(char) < 128, string.whitespace))

# Every character that baseNormalize passes through untouched.
allowed = ''.join((
    string.ascii_letters,
    string.digits,
    string.punctuation,
    whitespace,
))
|---|
| | 24 | |
|---|
| | 25 | # Define and compile static regexes |
|---|
# Register the dotless small i (code point 305, U+0131) as a plain 'i'.
mapping[305] = 'i'
|---|
def cropName(base, maxLength=MAX_LENGTH):
    """Shorten ``base`` so that it is at most ``maxLength`` characters long.

    The cut is made at a '-' where possible, so the result ends on a whole
    dash-separated segment: the right-most dash at or before ``maxLength``
    is used and everything from that dash onwards is dropped. When no such
    dash exists, or the only one is the very first character, the name is
    cut hard at ``maxLength``.

    A name that already fits is returned unchanged.
    """
    if len(base) <= maxLength:
        return base

    # Step backwards from dash to dash until one lies within the limit;
    # rfind returns -1 once there are no dashes left to try.
    index = len(base)
    while index > maxLength:
        index = base.rfind('-', 0, index)

    if index > 0:
        return base[:index]

    # index is -1 (no dash at all) or 0 (the name starts with the only
    # usable dash). Cutting at position 0 would leave nothing, and leaving
    # the name alone would exceed maxLength, so crop hard instead.
    return base[:maxLength]
|---|
| | 47 | |
|---|
def baseNormalize(text):
    """
    Normalize unicode characters to the base ASCII letters.

    Output is an ASCII encoded string (or char) with only ASCII letters,
    digits, punctuation and whitespace characters. Case is preserved.

    Leading and trailing whitespace is stripped first. Characters found in
    ``allowed`` are kept as they are. Any other character is replaced by the
    first of these that applies:

    1. its entry in ``mapping``;
    2. the allowed characters of its NFKD normalization, when the character
       has a Unicode decomposition (this may be several characters, or none);
    3. its code point written as lowercase hex digits.

    Input that is not a string is returned as its ``repr``.

      >>> baseNormalize(123)
      '123'

      >>> baseNormalize(u'\u0fff')
      'fff'

      >>> baseNormalize(u"foo\N{LATIN CAPITAL LETTER I WITH CARON}")
      'fooI'
    """
    if not isinstance(text, basestring):
        # This most surely ends up in something the user does not expect
        # to see. But at least it does not break.
        return repr(text)

    # Collect the output fragments and join once at the end instead of
    # growing a string character by character.
    pieces = []
    for ch in text.strip():
        if ch in allowed:
            # ASCII chars, digits etc. stay untouched
            pieces.append(ch)
            continue

        ordinal = ord(ch)
        # Single lookup; every value in mapping is a non-empty string, so
        # None reliably means "no custom mapping".
        replacement = mapping.get(ordinal)
        if replacement is not None:
            pieces.append(replacement)
        elif decomposition(ch):
            normalized = normalize('NFKD', ch).strip()
            # The decomposed form may hold several characters, including
            # non-letter ones such as combining marks. Keep only the
            # allowed ones.
            pieces.extend([c for c in normalized if c in allowed])
        else:
            # hex string instead of unknown char
            pieces.append("%x" % ordinal)

    return u''.join(pieces).encode('ascii')
|---|
| | 90 | |
|---|