root/BADataMunger/trunk/bidmaker.py

Revision 1427, 4.9 kB (checked in by thomase, 4 months ago)

sticking a fork in the aspect of BADataMunger that is the creation of batlas ids, or so I hope

Line 
1 import re
2 import logging
3
4 from pleiades.normalizer import normalizer
5
# Matches the section sign (U+00A7) and everything after it on the line; used
# to cut the so-called "diamond" variants off a directory name string.
DIAMOND_STOP_REGEX = re.compile(u"\u00A7.+")

# Matches the ", T." and ", Mon." tags that are removed from labels.
SILENT_POSTFIX_REGEX = re.compile(r', (T|Mon)\.')

# Character class of punctuation to drop: ? * [ ] the curly single quotes
# U+2018/U+2019, and the comma.  The backslashes are doubled so the literal
# contains no invalid escape sequences while the \u escapes are still resolved
# by the string literal on both Python 2 and Python 3.
ITIN_IGNORE_REGEX = re.compile(u'[?*\\[\\]\u2018\u2019,]')

# Match the abbreviation that ends a river, island or island-group name
# (" fl.", " Ins.", " Inss."), including the space that precedes it.
RIVER_REGEX = re.compile(r' fl\.$')
ISLAND_REGEX = re.compile(r' Ins\.$')
ISLAND_GROUP_REGEX = re.compile(r' Inss\.$')

# Module-level list that starts out empty.  Nothing in the code visible here
# reads or writes it -- presumably callers collect generated IDs in it; verify
# before removing.
ALLIDS = []
14
class BAtlasIDMaker:
    """Build labels and identifier strings for Barrington Atlas features.

    The label builders turn a directory entry into the text used for a
    feature; makeID() turns such a label plus a map number and grid square
    into a hyphen-separated identifier.
    """

    # Directory types that are deliberately given no label at all.
    _SUPPRESSED_TYPES = ('road', 'coastal-change', 'unlabeled')

    # Directory types whose entries may or may not carry a name.
    _OPTIONALLY_NAMED_TYPES = ('canal', 'canal-group', 'earthworks')

    # Directory types that are labelled by their directory name string.
    _NAMED_TYPES = ('name', 'numbered', 'lettered')

    # More-or-less normal features that have no name of their own; their label
    # is assembled from the type and the location description.
    _UNNAMED_FEATURE_TYPES = (
        'aqueduct', 'aqueduct-group', 'bath', 'bridge', 'bridge-group',
        'causeway', 'cemetery', 'centuriation', 'centuriation-group',
        'church-group', 'dam', 'dam-group', 'dike', 'dike-group', 'feature',
        'feature-group', 'fort', 'fort-group', 'levee', 'lighthouse',
        'lighthouse-group', 'mine', 'mine-group', 'mole', 'monastery',
        'monument', 'monument-group', 'pass', 'quarry', 'quarry-group',
        'reservoir', 'road-station', 'salt-pans', 'spring', 'tumulus',
        'tunnel', 'villa', 'villa-group', 'wall', 'wall-group', 'waterwheel',
        'well',
    )

    def makeID(self, phrase, mapnum, gridsq, phrasepostfix='', phraseprefix=''):
        """Make an ID from a properly prepared string.

        phrase, mapnum and gridsq are each passed through
        normalizer.normalizeN() and the result joined with '-' (normalizeN is
        assumed to return a sequence of strings -- confirm against
        pleiades.normalizer).  phrasepostfix, when non-empty, is placed right
        after the normalized phrase, and phraseprefix, when non-empty, at the
        very front; both are used verbatim, without normalization.

        Returns the non-empty pieces joined with '-'.
        """
        pieces = [
            u'-'.join(normalizer.normalizeN(raw))
            for raw in (phrase, mapnum, gridsq)
        ]
        # Order matters: the postfix index refers to the list before the
        # prefix shifts everything one slot to the right.
        if phrasepostfix:
            pieces.insert(1, phrasepostfix)
        if phraseprefix:
            pieces.insert(0, phraseprefix)
        return u'-'.join(piece for piece in pieces if piece)

    def _stripSilentParts(self, phrase):
        """Remove the diamond variants and the silent ', T.'/', Mon.' tags."""
        phrase = DIAMOND_STOP_REGEX.sub('', phrase)
        return SILENT_POSTFIX_REGEX.sub('', phrase)

    def buildLabel(self, origstring, placenames, dirtype, locdesc=u'', itin=u''):
        """Dir file doesn't exactly preserve the label, so we have to create it.

        origstring -- the directory name string for the entry
        placenames -- the names attached to the entry; only its emptiness
                      is examined
        dirtype    -- the directory type of the entry
        locdesc    -- location description, used for entries without a name
        itin       -- accepted for backward compatibility but not used: the
                      only code that consumed it applied to 'road' entries,
                      and those are suppressed before it could ever run

        Returns the label with surrounding whitespace stripped, or u'' for
        the suppressed types.  Unrecognized types are logged with a warning
        and get the stripped origstring back.
        """
        if dirtype == 'false':
            logging.debug("false origstring: %s", origstring)

        phrase = origstring.strip()

        if dirtype in self._SUPPRESSED_TYPES:
            # suppress certain types because they're not helpful
            return u''

        if dirtype in self._OPTIONALLY_NAMED_TYPES:
            # might be named or unnamed
            if phrase:
                phrase = self._stripSilentParts(phrase)
            else:
                phrase = "%s %s" % (dirtype, locdesc.strip())
        elif placenames or dirtype in self._NAMED_TYPES:
            # the most basic case: features with names:
            #     use the batlas directory namestring
            #     strip off the so-called "diamonds": variants in the dir but
            #     not on the map
            #     strip out suffix tags for particular subtypes that are not
            #     used in map labels
            phrase = self._stripSilentParts(phrase)
        elif dirtype in self._UNNAMED_FEATURE_TYPES:
            # second most common case: more-or-less normal features without
            # names
            phrase = "%s %s %s" % (dirtype, phrase, locdesc)
        else:
            # there are various missing classes
            logging.warning(
                "bidmaker.buildLabel did nothing for origstring='%s', "
                "dirtype='%s', locdesc='%s'",
                origstring, dirtype, locdesc)

        label = phrase.strip()
        if dirtype == 'false':
            logging.debug("returning label: '%s'", label)
        return label

    def buildAltLabel(self, origstring, name, placetypes):
        """We allow IDs that incorporate only one name in a multi-name label,
        so we have to generate these according to a sensible model.

        Returns u'' when the stripped name is the same as origstring with its
        diamond variants removed (no separate alternate label in that case).
        Otherwise returns the name with the silent ', T.'/', Mon.' tags
        removed and a type suffix added by addSmartSuffix().
        """
        phrase = name.strip()
        # NOTE(review): the comparison removes the diamond variants from
        # origstring but leaves any ', T.'/', Mon.' tag in place -- confirm
        # that is intended.
        if phrase == DIAMOND_STOP_REGEX.sub('', origstring.strip()).strip():
            return u''
        phrase = SILENT_POSTFIX_REGEX.sub('', phrase)
        return self.addSmartSuffix(phrase, placetypes).strip()

    def addSmartSuffix(self, name, placetypes):
        """Append the abbreviation for the place type when name lacks it.

        placetypes is walked in order.  For the first 'island',
        'island-group' or 'river' entry whose abbreviation ('Ins.', 'Inss.',
        'fl.') does not already end the name, that abbreviation is appended
        after a space and the result returned.  Other place types are
        ignored; if nothing needs adding, name is returned unchanged.
        """
        suffix_rules = {
            'island': (ISLAND_REGEX, u'Ins.'),
            'island-group': (ISLAND_GROUP_REGEX, u'Inss.'),
            'river': (RIVER_REGEX, u'fl.'),
        }

        for placetype in placetypes:
            rule = suffix_rules.get(placetype)
            if rule is None:
                continue
            already_present, suffix = rule
            if not already_present.search(name):
                # Only one suffix is ever added.
                return u"%s %s" % (name, suffix)
        return name
124        
125
Note: See TracBrowser for help on using the browser.