root/BADataMunger/trunk/gismixer.py

Revision 1177, 13.3 kB (checked in by thomase, 1 year ago)

provide a more flexible range of suppression options when evaluating whether to add gis data that didn't match anything in the directory

  • Property svn:eol-style set to native
Line 
1 import logging
2
3 import lxml.etree as etree
4
5 from etreehelps import getalltext
6 from texthelps import normalizetext
7 import tableparser
8
9 ESRI = "http://www.esri.com/schemas/ArcGIS/9.0"
10
class gisMixer:
    """Combine GIS records from an ArcGIS workspace XML export with directory places.

    The constructor parses the XML file, remembering the field names, the
    geometry type of the SHAPE field and every record's values.  ``mixall``
    then tries to pair each directory place with one or more GIS records
    (copying shape, approximation flag and type onto the place) and finally
    deals with the GIS records nothing in the directory matched.

    Attributes set by ``__init__``:
        parent       -- the object passed in; ``addin`` reads its ``map_number``
        fieldnames   -- normalized field names, in column order
        gridi, labeli, disambiguatori, shapei, typei, approxi
                     -- column positions of the fields this class relies on
        geotype      -- normalized geometry type text of the SHAPE field
        fieldvalues  -- one list of values per record (SHAPE parsed to coords)
        fieldmatches -- parallel list of booleans: has this record been matched?
    """

    # Lower-cased GIS 'Type' values whose cleaned form differs from the raw
    # value; anything not listed here passes through get_type() unchanged.
    _TYPE_ALIASES = {
        'road station': 'station',
        'water wheel': 'wheel',
        'mountain pass': 'pass',
        'inland water': 'water-inland',
        'open water': 'water-open',
        'salt marsh': 'salt-marsh',
    }

    def __init__(self, parent, source):
        """Read the GIS XML file at path ``source`` and index its contents.

        Raises ValueError (from ``list.index``) if any of the fields
        GridSquare, Label, disambiguator, SHAPE, Type or Approximate is
        missing, and IndexError if the SHAPE field has no GeometryType.
        """
        self.parent = parent

        # read in the source file; try/finally releases the handle even if
        # the read itself fails
        f = open(source)
        try:
            gis_text = f.read()
        finally:
            f.close()
        tree = etree.XML(gis_text)

        # get the field names
        e_fields = tree.xpath("//WorkspaceData/descendant::Fields/FieldArray/Field")
        self.fieldnames = [normalizetext(getalltext(e_field.xpath("Name")[0])) for e_field in e_fields]
        self.gridi = self.fieldnames.index('GridSquare')
        self.labeli = self.fieldnames.index('Label')
        self.disambiguatori = self.fieldnames.index('disambiguator')
        self.shapei = self.fieldnames.index('SHAPE')
        self.typei = self.fieldnames.index('Type')
        self.approxi = self.fieldnames.index('Approximate')

        # get the geometry type (XPath positions are 1-based, hence the + 1)
        e_geofielddef = tree.xpath("//WorkspaceData/descendant::Fields/FieldArray/Field[%s]/GeometryDef/GeometryType" % (self.shapei + 1))
        self.geotype = normalizetext(getalltext(e_geofielddef[0]))
        logging.info("geotype=%s" % self.geotype)

        # get the data and store it for later use
        self.fieldvalues = []
        for e_record in tree.xpath("//WorkspaceData/descendant::Records/Record"):
            record = []
            for j, e_value in enumerate(e_record.xpath("Values/Value")):
                if j == self.shapei:
                    record.append(self.parseshape(e_value))
                else:
                    record.append(normalizetext(getalltext(e_value)))
            self.fieldvalues.append(record)

        # create a match flag list, one item for each set of fielddata
        self.fieldmatches = [False] * len(self.fieldvalues)

        logging.info("gisMixer got %s records from %s" % (len(self.fieldvalues), source))
        logging.debug("FIELD NAMES ARE:")
        for name in self.fieldnames:
            logging.debug("%s" % name)
        # a file without records is legal; only dump a sample when one exists
        if self.fieldvalues:
            logging.debug("FIRST RECORD CONTENT:")
            for item in self.fieldvalues[0]:
                logging.debug("%s" % item)

    def parseshape(self, e_shape):
        """Return the coordinates of a shape element as a list of (x, y) tuples.

        Only 'esriGeometryPoint' is supported: the X and Y child texts are
        converted to floats rounded to 4 decimal places.  For any other
        geometry type a warning is logged and an empty list is returned.

        Raises ValueError if a coordinate is not a number and TypeError if
        the X or Y child is absent.
        """
        coords = []
        if self.geotype == 'esriGeometryPoint':
            # float() rather than eval(): coordinate text comes from a data
            # file and must never be executed as Python code
            x = float(e_shape.findtext('X'))
            y = float(e_shape.findtext('Y'))
            coords.append((round(x, 4), round(y, 4)))
        else:
            logging.warning("gismixer.gisMixer.parseshape() has no support for geotype = %s" % self.geotype)
        return coords

    def mixall(self, config, places):
        """Match every place in ``places`` against the GIS data, then handle leftovers.

        Places whose ``dirtype`` is 'unlocated' or 'false' are simply flagged
        as matched; all others go through ``mixin``.  Afterwards each GIS
        record that is still unmatched is checked against the ``<suppress>``
        directives in ``config``; records that no directive covers are logged
        and appended to ``places`` as new anonymous places (see ``addin``).
        """
        total = len(self.fieldvalues)
        logging.info("BEGIN attempting to match and combine %s directory places with %s map/gis places"
            % (len(places), total))

        for place in places:
            if place.dirtype in ('unlocated', 'false'):
                place.matched = True
            else:
                self.mixin(config, place)

        logging.info("DONE attempting to match and combine %s directory places with %s map/gis places"
            % (len(places), total))

        logging.info("BEGIN attempting to deal with map/gis places for which there is no match in the directory")

        unmatched = [value for i, value in enumerate(self.fieldvalues) if not self.fieldmatches[i]]

        if not unmatched:
            logging.info("all GIS places (%s) were matched" % total)
        else:
            logging.info("%s of the %s GIS places were matched" % (total - len(unmatched), total))
            lbl_suppressions, full_suppressions = self._load_suppressions(config)

            for i, value in enumerate(unmatched):
                lblkey = (value[self.gridi], value[self.labeli])
                hits = lbl_suppressions.count(lblkey)
                if hits == 1:
                    logging.info("suppressing item %s based on a grid-and-label suppression directive:" % i)
                    self._log_record(logging.info, value)
                    continue
                if hits > 1:
                    # ambiguous directives: warn only, the record is not added
                    logging.warning("multiple suppression directives match this item (%s, %s)" % lblkey)
                    continue

                fullkey = lblkey + (value[self.typei].lower(), value[self.disambiguatori])
                hits = full_suppressions.count(fullkey)
                if hits == 1:
                    logging.info("suppressing item %s based on a full suppression directive:" % i)
                    self._log_record(logging.info, value)
                elif hits > 1:
                    logging.warning("multiple suppression directives match this item (%s, %s, %s, %s)" % fullkey)
                else:
                    logging.warning("UNMATCHED map/gis place %s was NOT suppressed:" % i)
                    self._log_record(logging.warning, value)
                    places.append(self.addin(value, i))

        logging.info("DONE attempting to deal with map/gis places for which there is no match in the directory")

    def _suppression_text(self, snode, tag):
        """Return the normalized text of the first ``tag`` child of ``snode``, or ''."""
        nodes = snode.xpath(tag)
        if not nodes:
            return ''
        return normalizetext(getalltext(nodes[0]))

    def _load_suppressions(self, config):
        """Read the ``<suppress>`` directives from ``config``.

        Returns ``(lbl_suppressions, full_suppressions)``: directives that
        give a label but neither type nor disambiguator become
        ``(grid, label)`` tuples; all others become
        ``(grid, label, type, disambiguator)`` tuples.
        """
        lbl_suppressions = []
        full_suppressions = []
        for snode in config.xpath("//suppress"):
            supgrid = self._suppression_text(snode, "grid")
            suplabel = self._suppression_text(snode, "label")
            suptype = self._suppression_text(snode, "type")
            supdisamb = self._suppression_text(snode, "disambiguator")
            if len(suplabel) > 0 and suptype == '' and supdisamb == '':
                lbl_suppressions.append((supgrid, suplabel))
            else:
                full_suppressions.append((supgrid, suplabel, suptype, supdisamb))
        return lbl_suppressions, full_suppressions

    def _log_record(self, log, value):
        """Emit one ``name = value`` line per field of a record via ``log``."""
        for j, item in enumerate(value):
            log("    %s = %s" % (self.fieldnames[j], item))

    def addin(self, gisdata, seq):
        """Build a new ``tableparser.Place`` from an unmatched GIS record.

        ``seq`` is stored as the place's ``anonsequence``; ``tablei`` and
        ``rowi`` are set to -1 and the place is flagged as matched.
        """
        p = tableparser.Place()
        p.anonsequence = seq
        p.map_number = self.parent.map_number
        p.tablei = -1
        p.rowi = -1
        p.matched = True
        p.shapes.append(gisdata[self.shapei])
        p.approximates.append(gisdata[self.approxi])
        p.types.append(self.get_type(gisdata[self.typei].lower()))
        p.grid = gisdata[self.gridi]
        return p

    def mixin(self, config, place):
        """Copy shape, approximation and type from matching GIS records onto ``place``.

        When ``get_records`` finds nothing, a warning is logged and the place
        is left untouched; otherwise the place is flagged as matched.
        """
        records = self.get_records(config, place)
        if not records:
            logging.warning("failed to match %s: '%s' (%s - %s), rowi=%s" % (
                place.dirtype,
                place.namestring.encode('ascii', 'backslashreplace'),
                place.locdesc.encode('ascii', 'backslashreplace'),
                place.itinraw.encode('ascii', 'backslashreplace'),
                place.rowi))
            return

        if place.matched:
            logging.warning("place claims it's already matched")
        place.matched = True

        # mix the data
        for record in records:
            place.shapes.append(record[self.shapei])
            place.approximates.append(record[self.approxi])
            place.types.append(self.get_type(record[self.typei].lower()))

    def get_type(self, rawtype):
        """Map a lower-cased GIS type to its cleaned name (unknown types pass through)."""
        return self._TYPE_ALIASES.get(rawtype, rawtype)

    def get_records(self, config, p):
        """Find the not-yet-matched GIS record(s) corresponding to place ``p``.

        Candidates are narrowed by grid square, then by type (skipped when the
        place's type is 'name' or 'numbered'), then by cleaned label and, when
        ``config`` holds a ``<disambiguator>`` for this table type and row, by
        disambiguator.  Several surviving records are accepted only when
        ``config`` holds a ``<multiple>`` element stating exactly that count.

        Returns the list of matching records, flagging each one as matched,
        or None when there is no acceptable match.
        """
        # Carry each record's index along so the right entry in fieldmatches
        # is flagged even when two records hold identical values.
        candidates = [(i, value) for i, value in enumerate(self.fieldvalues) if not self.fieldmatches[i]]
        label = self.clean_label(p.namestring)
        gridsquare = p.grid
        soughttype = p.dirtype.lower()

        # best effort: no (or unreadable) directive means no disambiguator
        try:
            e_dis = config.xpath("//disambiguator[@tabletype='%s' and @rowi='%s']" % (soughttype, p.rowi))[0]
            disambiguator = normalizetext(getalltext(e_dis))
        except Exception:
            disambiguator = 0

        logging.info("seeking a %s in gridsquare %s with disambiguator %s" % (soughttype, gridsquare, disambiguator))

        gridmatches = [c for c in candidates if c[1][self.gridi] == gridsquare]
        if gridmatches and soughttype != 'name' and soughttype != 'numbered':
            typematches = [c for c in gridmatches if c[1][self.typei].lower() == soughttype]
        else:
            typematches = gridmatches

        if not gridmatches:
            logging.warning("gisMixer found 0 gridmatches")
        elif not typematches:
            logging.warning("gisMixer found 0 typematches in grid")

        labelmatches = [c for c in typematches if self.clean_label(c[1][self.labeli]) == label]

        if not labelmatches:
            logging.warning("gisMaker could not find any data for label = '%s' in grid '%s'" % (label, gridsquare))
            matches = labelmatches
        elif disambiguator != 0:
            matches = [c for c in labelmatches if c[1][self.disambiguatori] == disambiguator]
            if not matches:
                logging.warning("gisMaker could not find any data for label = '%s' and disambiguator = '%s' in grid '%s'" % (label, disambiguator, gridsquare))
        else:
            matches = labelmatches

        if not matches:
            return None

        if len(matches) > 1:
            # More than one hit is only acceptable when the config announces
            # exactly that many.  int() rather than eval(): the text comes
            # from a config file and must never be executed.
            try:
                multe = config.xpath("//multiple[@tabletype='%s' and @rowi='%s']" % (soughttype, p.rowi))[0]
                mult = int(normalizetext(getalltext(multe)))
            except Exception:
                mult = 0
            if len(matches) != mult:
                logging.warning("gisMaker found %s apparent matches for label = '%s', in grid '%s', with disambiguator = '%s' and mult = '%s'" % (len(matches), label, gridsquare, disambiguator, mult))
                return None

        for i, _value in matches:
            self.fieldmatches[i] = True
        return [value for _i, value in matches]

    def clean_label(self, raw_label):
        """Normalize a place label so directory and GIS labels can be compared.

        Removes the space after a slash, cuts the label off one character
        before the first section sign (U+00A7) when that sign is not the very
        first character, drops the ", Mon." and ", T." suffixes, turns curly
        single quotes into apostrophes and strips surrounding whitespace.
        """
        clabel = raw_label.replace("/ ", "/")
        diamondi = clabel.find(u'\xa7')
        if diamondi > 0:
            # NOTE(review): also drops the character just before the sign --
            # presumably a separating space; confirm against the data
            clabel = clabel[:diamondi-1]
        for suffix in (", Mon.", ", T."):
            clabel = clabel.replace(suffix, "")
        for curly in (u"\u2018", u"\u2019"):
            clabel = clabel.replace(curly, u"'")
        return clabel.strip()
288        
289        
290                
Note: See TracBrowser for help on using the browser.