Changeset 1319
- Timestamp:
- 05/23/08 17:57:05 (6 months ago)
- Files:
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
pleiades.normalizer/trunk/pleiades/normalizer/base.py
r1315 r1319 29 29 MULTIPLE_DASHES_REGEX = re.compile(r"\-+") 30 30 EXTRA_DASHES_REGEX = re.compile(r"(^\-+)|(\-+$)") 31 RIVER_REGEX = re.compile(r" fl\.$") 32 ISLAND_REGEX = re.compile(r" Ins\.$", re.I) 33 ISLAND_GROUP_REGEX = re.compile(r" Inss\.$", re.I) 31 34 MAX_LENGTH = 120 35 32 36 33 37 def cropName(base, maxLength=MAX_LENGTH): … … 101 105 """Returns a normalized text. text has to be a unicode string. 102 106 """ 107 suffix = None 108 for re in [RIVER_REGEX, ISLAND_REGEX, ISLAND_GROUP_REGEX]: 109 m = re.search(label) 110 if m: 111 suffix = m.group() 112 label = re.sub('', label) 113 break 114 103 115 for text in label.split('/'): 116 if suffix is not None: 117 text = text + suffix 104 118 text = baseNormalize(text) 105 119 base = text.lower() pleiades.normalizer/trunk/pleiades/normalizer/tests/batlas.txt
r1315 r1319 3 3 4 4 >>> from pleiades.normalizer import normalizer 5 6 Simple tests 5 7 6 8 >>> list(normalizer.normalizeN(u'Tetrapyrgia')) … … 18 20 >>> list(normalizer.normalizeN(u'Kalaba(n)tia')) 19 21 ['kalabantia'] 20 22 23 Multiple names 24 21 25 >>> list(normalizer.normalizeN(u'Tripolis ad Maeandrum/Apollonia ad Maeandrum/Antoniopolis')) 22 26 ['tripolis-ad-maeandrum', 'apollonia-ad-maeandrum', 'antoniopolis'] 23 27 28 Multiply-named rivers, islands, island groups 29 30 >>> list(normalizer.normalizeN(u'Foo/Bar fl.')) 31 ['foo-fl', 'bar-fl'] 32 33 >>> list(normalizer.normalizeN(u'Foo/Bar Ins.')) 34 ['foo-ins', 'bar-ins'] 35 36 >>> list(normalizer.normalizeN(u'Foo/Bar Inss.')) 37 ['foo-inss', 'bar-inss'] 38 39 Non-ASCII characters 40 24 41 >>> list(normalizer.normalizeN(unicode('Ağva', 'utf-8'))) 25 42 ['agva'] … … 27 44 >>> list(normalizer.normalizeN(unicode('Çaykenarı', 'utf-8'))) 28 45 ['caykenari'] 29 46 47 Non-ASCII punctuation 48 30 49 >>> list(normalizer.normalizeN(unicode('“Iotana”', 'utf-8'))) 31 50 ['iotana'] 32
