Changeset 825
- Timestamp:
- 06/12/07 14:10:41 (2 years ago)
- Files:
-
- BADataMunger/trunk/nscleanup.xsl (modified) (1 diff)
- BADataMunger/trunk/pipeline.py (modified) (4 diffs)
- BADataMunger/trunk/tablegroker.py (modified) (2 diffs)
- BADataMunger/trunk/tableparser.py (modified) (18 diffs)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
BADataMunger/trunk/nscleanup.xsl
r816 r825 1 1 <?xml version="1.0" encoding="UTF-8"?> 2 <xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0" 3 xmlns:xhtml="http://www.w3.org/1999/xhtml" 4 xmlns:mods="http://www.loc.gov/mods/v3" 5 xmlns:tei="http://www.tei-c.org/ns/1.0"> 2 <xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0" xmlns:xhtml="http://www.w3.org/1999/xhtml" 3 xmlns:mods="http://www.loc.gov/mods/v3" xmlns:tei="http://www.tei-c.org/ns/1.0" 4 xmlns:awmcgaz="http://www.unc.edu/awmc/gazetteer/schemata/ns/0.3" 5 xmlns:adlgaz="http://www.alexandria.ucsb.edu/gazetteer/ContentStandard/version3.2/" 6 xmlns:georss="http://www.georss.org/georss" xmlns:dc="http://purl.org/dc/elements/1.1/"> 6 7 7 <xsl:template match="/"><xsl:apply-templates/></xsl:template> 8 8 <xsl:template match="/"> 9 <xsl:apply-templates /> 10 </xsl:template> 11 9 12 <xsl:template match="xhtml:*"> 10 13 <xsl:element name="{local-name()}" namespace="http://www.w3.org/1999/xhtml"> 11 <xsl:copy-of select="@*" />12 <xsl:apply-templates />14 <xsl:copy-of select="@*" /> 15 <xsl:apply-templates /> 13 16 </xsl:element> 14 17 </xsl:template> 15 18 16 19 <xsl:template match="mods:*"> 17 20 <xsl:element name="{local-name()}" namespace="http://www.loc.gov/mods/v3"> 18 <xsl:copy-of select="@*" />19 <xsl:apply-templates />21 <xsl:copy-of select="@*" /> 22 <xsl:apply-templates /> 20 23 </xsl:element> 21 24 </xsl:template> 22 25 23 26 <xsl:template match="tei:*"> 24 <xsl:element name=" {local-name()}" namespace="http://www.tei-c.org/ns/1.0">25 <xsl:copy-of select="@*" />26 <xsl:apply-templates />27 <xsl:element name="tei:{local-name()}" namespace="http://www.tei-c.org/ns/1.0"> 28 <xsl:copy-of select="@*" /> 29 <xsl:apply-templates /> 27 30 </xsl:element> 28 31 </xsl:template> 29 32 33 <xsl:template match="awmcgaz:geoEntity"> 34 <awmcgaz:geoEntity xmlns:awmcgaz="http://www.unc.edu/awmc/gazetteer/schemata/ns/0.3" 35 xmlns:adlgaz="http://www.alexandria.ucsb.edu/gazetteer/ContentStandard/version3.2/" 36 xmlns:georss="http://www.georss.org/georss" xmlns:tei="http://www.tei-c.org/ns/1.0" 37 xmlns:dc="http://purl.org/dc/elements/1.1/"> 38 <xsl:copy-of select="@*" /> 39 <xsl:apply-templates /> 40 </awmcgaz:geoEntity> 41 </xsl:template> 42 43 <xsl:template match="awmcgaz:*"> 44 <xsl:element name="awmcgaz:{local-name()}" namespace="http://www.unc.edu/awmc/gazetteer/schemata/ns/0.3"> 45 <xsl:copy-of select="@*" /> 46 <xsl:apply-templates /> 47 </xsl:element> 48 </xsl:template> 49 50 <xsl:template match="adlgaz:*"> 51 <xsl:element name="adlgaz:{local-name()}" 52 namespace="http://www.alexandria.ucsb.edu/gazetteer/ContentStandard/version3.2/"> 53 <xsl:copy-of select="@*" /> 54 <xsl:apply-templates /> 55 </xsl:element> 56 </xsl:template> 57 58 <xsl:template match="georss:*"> 59 <xsl:element name="georss:{local-name()}" namespace="http://www.georss.org/georss"> 60 <xsl:copy-of select="@*" /> 61 <xsl:apply-templates /> 62 </xsl:element> 63 </xsl:template> 64 65 <xsl:template match="dc:*"> 66 <xsl:element name="dc:{local-name()}" namespace="http://purl.org/dc/elements/1.1/"> 67 <xsl:copy-of select="@*" /> 68 <xsl:apply-templates /> 69 </xsl:element> 70 </xsl:template> 71 30 72 </xsl:stylesheet> BADataMunger/trunk/pipeline.py
r796 r825 9 9 import tablegroker 10 10 import tableparser 11 import placesaver 12 from etreehelps import getalltext 13 from texthelps import normalizetext 11 14 12 15 … … 17 20 Use like: 18 21 19 import pipeline20 p = pipeline.Pipe(r'C:\TomDocs\awmcwork\pleiadesact\svnbox\BADataMunger\wordhtml\BATL038_.htm')21 p.cycle()22 import pipeline 23 p = pipeline.Pipe(r'C:\TomDocs\awmcwork\pleiadesact\svnbox\BADataMunger\wordhtml\BATL065_.htm', r'C:\TomDocs\awmcwork\pleiadesact\svnbox\BADataMunger\wordhtml\BATL065_config.xml') 24 p.cycle() 22 25 23 26 24 27 """ 25 28 26 def __init__(self, source ):29 def __init__(self, source, source_config=None): 27 30 """Create a pipe object to manage the conversions and open the source 28 31 file 29 32 """ 33 30 34 31 35 self.data = {} … … 41 45 self['wordhtml'] = f.read() 42 46 f.close() 47 if source_config: 48 self['config_path'] = normpath(normcase(source_config)) 49 if not isfile(self['config_path']): 50 raise Error, "No file found at path: %s" % self['config_path'] 51 else: 52 f = open(self['config_path']) 53 config_text = f.read() 54 f.close() 55 self['config'] = etree.XML(config_text) 56 try: 57 self.map_number = self['config'].xpath("//map_number")[0].text 58 except: 59 self.map_number = 'XYZ' 60 self.creators = [] 61 self.contributors = [] 62 self.rights = '' 63 for c in self['config'].xpath("//creator"): 64 self.creators.append(normalizetext(getalltext(c))) 65 for c in self['config'].xpath("//contributor"): 66 self.contributors.append(normalizetext(getalltext(c))) 67 try: 68 self.rights = self['config'].xpath("//rights")[0].text 69 except: 70 pass 71 43 72 44 73 def __getitem__(self, key): return self.data[key] … … 54 83 bibliosaver.save_biblio_mods(self) 55 84 self['dirtables'] = tablegroker.grok(self['cleanxml']) 56 self['places'] = tableparser.parse(self['dirtables']) 85 self['places'] = tableparser.parse(self, self['dirtables'], self.map_number) 86 placesaver.save_places_tei(self) 87 placesaver.save_places_frank(self) 57 88 58 89 def save(self, itemkey, encoding='utf-8'): BADataMunger/trunk/tablegroker.py
r795 r825 6 6 """Find the tables in source and return a dictionary of them, keyed by their titles.""" 7 7 tables={} 8 8 9 9 10 # find listing div … … 15 16 16 17 # iterate through tables and get their titles 17 for t able in dirlistdiv.xpath("descendant::*[local-name()='table']"):18 for ti, table in enumerate(dirlistdiv.xpath("descendant::*[local-name()='table']")): 18 19 text = normalizetext(getalltext(table.xpath("preceding-sibling::*[local-name()='p'][1]")[0])) 19 tables[text]= table20 tables[text]=(ti,table) 20 21 return tables BADataMunger/trunk/tableparser.py
r814 r825 1 1 import operator 2 2 import re 3 4 import lxml.etree as etree 3 5 4 6 import xpermutations … … 10 12 "Aqueducts":"aqueduct", 11 13 "Aqueduct":"aqueduct", 14 "Bridge":"bridge", 15 "Bridges":"bridge", 16 "False Toponyms":"false", 17 "False Toponym":"false", 12 18 "Name":"name", 13 19 "Names":"name", … … 16 22 "Road":"road", 17 23 "Unlocated Toponyms":"unlocated", 18 "Unlocated Toponym":"unlocated" 24 "Unlocated Toponym":"unlocated", 25 "Walls":"wall", 26 "Wall":"wall" 19 27 } 20 28 21 29 periodcodes = ['A','C','H','R','L'] 22 30 23 def parse( tabledict):31 def parse(context, tabledict, map_number): 24 32 """Parse a dictionary of etree tables and extract all the place information. 25 33 Return the info in a list.""" … … 27 35 places = [] 28 36 for key in tabledict.keys(): 29 print ">>>> key = '%s'" % key37 # print ">>>> key = '%s'" % key 30 38 if key.startswith('Abbreviation'): 31 39 pass 32 40 else: 33 41 tabletype = tabletypes[key] 34 places += parsetable( tabletype, tabledict[key])42 places += parsetable(context, tabletype, tabledict[key], map_number) 35 43 36 44 return places 37 45 38 def parsetable( tabletype, table):46 def parsetable(context, tabletype, table, map_number): 39 47 places = [] 40 for r ow in table.xpath("*[local-name()='tr']"):48 for ri, row in enumerate(table[1].xpath("descendant::*[local-name()='tr']")): 41 49 rowtext = normalizetext(getalltext(row)) 42 if rowtext.find('See Map') != -1: 50 if len(row.xpath("ancestor::*[local-name()='thead']")) > 0: 51 # ignore stuff in the table header 52 pass 53 elif rowtext.find('See Map') != -1: 43 54 # ignore external cross references 44 55 pass … … 50 61 pass 51 62 else: 52 print "%s: %s" % (tabletype, rowtext.encode('ascii', 'backslashreplace'))63 # print "%s: %s" % (tabletype, rowtext.encode('ascii', 'backslashreplace')) 53 64 p = Place() 54 parser = getattr(p, "add_%s" % tabletype) 55 parser(row.xpath("*[local-name()='td']")) 56 print p 65 p.map_number = map_number 66 p.tablei = table[0] 67 p.rowi = ri 68 rowcells = row.xpath("*[local-name()='td']") 69 configs = pparser_config = context['config'].xpath('//%s' % tabletype) 70 if len(configs) == 1: 71 p.parse_special(rowcells, configs[0]) 72 elif len(configs) == 0: 73 pparser = getattr(p, "add_%s" % tabletype) 74 pparser(rowcells) 75 else: 76 raise Error, "multiple config elements found for '%' table in file '%s'" % (tabletype, context['config_path']) 77 #print p 57 78 #if tabletype == 'unlocated': 58 79 # for pname in p.placenames: … … 105 126 self.locdesc=u'' 106 127 self.itinerary=[] 128 self.itinraw = u'' 107 129 self.placenames=[] 130 self.namestring = u'' 108 131 self.number='' 109 132 self.periods=[] 110 133 self.references=[] 111 134 self.type='' 135 112 136 113 137 def __str__(self): … … 118 142 result += 'locdesc: %s\n' % self.locdesc.encode('ascii','backslashreplace') 119 143 if len(self.itinerary) > 0: 144 result += 'raw itinerary: %s' % self.itinraw.encode('ascii', 'backslashreplace') 120 145 result += 'itinerary:' 121 146 for itin in self.itinerary: … … 142 167 143 168 169 def parse_special(self, cells, config): 170 # this batch of cells requires special parsing because the format is idiosyncratic 171 # print "SPECIAL PARSING: %s" % etree.tostring(config) 172 directives = config.xpath("./*") 173 for d in directives: 174 self.parse_special_directive(d, cells) 175 176 def parse_special_directive(self, d, cells): 177 try: 178 celli = int(d.attrib['cell']) 179 except: 180 pass 181 if d.tag == 'type': 182 self.type = d.text 183 elif d.tag == 'choose': 184 185 whens = d.xpath("./when") 186 otherwise = d.xpath("./otherwise") 187 matched = False 188 for i, when in enumerate(whens): 189 dcelli = int(when.attrib['cell']) 190 dval = when.attrib['value'] 191 if normalizetext(cells[dcelli].text) == dval: 192 193 matched = True 194 for subd in when.xpath("./*"): 195 self.parse_special_directive(subd, cells) 196 break 197 if matched: 198 pass 199 elif otherwise: 200 for subd in otherwise.xpath("./*"): 201 self.parse_special_directive(subd, cells) 202 elif d.tag == 'itinerary': 203 self.itinerary = self.parse_itinerary(cells[celli]) 204 elif d.tag == 'placenames': 205 206 # placenames should be coordinated with periods, if possible 207 try: 208 context = d.xpath("ancestor::when | ancestor::otherwise")[0] 209 except: 210 context = d.xpath("ancestor::*")[1] 211 if context.tag == 'when' or context.tag == 'otherwise': 212 try: 213 pcelli = context.xpath("./descendant::periods")[0].attrib['cell'] 214 except: 215 pass 216 else: 217 try: 218 pcelli = context.xpath("./descendant::periods[not(ancestor::when) and not(ancestor::otherwise)]")[0].attrib['cell'] 219 except: 220 pass 221 if pcelli: 222 #print "processing placenames from celli = %s (%s) with periods from pcelli = %s (%s)" 223 self.placenames = self.parse_placenames(cells[celli], cells[pcelli]) 224 else: 225 self.placenames = self.parse_placenames(cells[celli]) 226 elif d.tag == 'itinraw': 227 self.itinraw = normalizetext(getalltext(cells[celli])) 228 elif d.tag == 'periods': 229 self.periods = self.parse_periods(cells[celli]) 230 elif d.tag == 'references': 231 self.references = self.parse_references(cells[celli]) 232 else: 233 print "Untrapped tag = '%s'" % d.tag 234 235 144 236 def add_aqueduct(self, cells): 145 237 # grid | locdesc | periods | references … … 150 242 self.references = self.parse_references(cells[3]) 151 243 244 def add_bridge(self, cells): 245 # grid | locdesc | periods | references 246 self.type = 'bridge' 247 self.grid = normalizetext(getalltext(cells[0])) 248 self.locdesc = normalizetext(getalltext(cells[1])) 249 self.periods = self.parse_periods(cells[2]) 250 self.references = self.parse_references(cells[3]) 251 152 252 def add_name(self, cells): 153 253 # grid | placename(s) | periods | locdesc | references … … 157 257 self.locdesc = normalizetext(getalltext(cells[3])) 158 258 self.placenames = self.parse_placenames(cells[1], cells[2]) 259 self.namestring = normalizetext(getalltext(cells[1])) 159 260 self.references = self.parse_references(cells[4]) 160 pass 261 262 263 def add_false(self, cells): 264 # name | references | comment 265 self.type = 'false' 266 self.placenames = self.parse_placenames(cells[0]) 267 self.namestring = normalizetext(getalltext(cells[0])) 268 self.references = self.parse_references(cells[1]) 269 self.comment = normalizetext(getalltext(cells[2])) 161 270 162 271 def add_numbered(self, cells): … … 164 273 self.type = 'numbered' 165 274 self.number = normalizetext(getalltext(cells[0])) 275 self.namestring = normalizetext(getalltext(cells[0])) 166 276 self.grid = normalizetext(getalltext(cells[1])) 167 277 self.locdesc = normalizetext(getalltext(cells[2])) 168 278 self.periods = self.parse_periods(cells[3]) 169 279 self.references = self.parse_references(cells[4]) 170 pass280 171 281 172 282 def add_road(self, cells): 173 283 # itinerary | period | reference 174 284 self.type = 'road' 285 self.itinraw = normalizetext(getalltext(cells[0])) 175 286 self.itinerary = self.parse_itinerary(cells[0]) 176 287 self.periods = self.parse_periods(cells[1]) 177 288 self.references = self.parse_references(cells[2]) 178 pass289 179 290 180 291 def add_unlocated(self, cells): … … 184 295 self.locdesc = normalizetext(getalltext(cells[2])) 185 296 self.placenames = self.parse_placenames(cells[0], cells[1]) 186 self.references = self.parse_references(cells[2]) 187 pass 297 self.namestring = normalizetext(getalltext(cells[0])) 298 self.references = self.parse_references(cells[3]) 299 300 301 def add_wall(self, cells): 302 # grid | locdesc | periods | references 303 self.type = 'wall' 304 self.grid = normalizetext(getalltext(cells[0])) 305 self.locdesc = normalizetext(getalltext(cells[1])) 306 self.periods = self.parse_periods(cells[2]) 307 self.references = self.parse_references(cells[3]) 188 308 189 309 def parse_periods(self, periodcell): … … 216 336 return [normalizetext(place) for place in places] 217 337 218 def parse_placenames(self, namecell, periodcell ):338 def parse_placenames(self, namecell, periodcell=None): 219 339 """We can have multiple placenames in a single cell, and there are two types of delimiters. 220 340 Parse this mess to get the individual names, and then deal with variants too.""" … … 248 368 # associate period information, if necessary, with the placenames 249 369 # if separate periods are provided for each work 250 periodparas = periodcell.xpath("*[local-name()='p']") 251 if len(periodparas) > 1: 370 if periodcell: 371 periodparas = periodcell.xpath("*[local-name()='p']") 372 print "len(periodparas) == %s" % len(periodparas) 373 if len(periodparas) > 0: 374 for pname in placenames: 375 try: 376 pname.periods = self.parse_periods(periodparas[pname.originalPosition]) 377 except: 378 pass 379 elif len(self.periods) > 0: 252 380 for pname in placenames: 253 try: 254 pname.periods = self.parse_periods(periodparas[pname.originalPosition]) 255 except: 256 pass 381 pname.periods = self.periods 257 382 258 383 return placenames … … 288 413 pattern = u'[\u2018\u2019]' 289 414 newntext = re.sub(pattern, u'', ntext) 290 print "'%s' is attested but inaccurate (will become '%s')" % (ntext.encode('ascii', 'backslashreplace'), newntext.encode('ascii', 'backslashreplace'))415 #print "'%s' is attested but inaccurate (will become '%s')" % (ntext.encode('ascii', 'backslashreplace'), newntext.encode('ascii', 'backslashreplace')) 291 416 ntext = newntext 292 417 … … 300 425 pattern = u'[\[\]]' 301 426 newntext = re.sub(pattern, u'', ntext) 302 print "'%s' is inferred (will become '%s')" % (ntext.encode('ascii', 'backslashreplace'), newntext.encode('ascii', 'backslashreplace'))427 #print "'%s' is inferred (will become '%s')" % (ntext.encode('ascii', 'backslashreplace'), newntext.encode('ascii', 'backslashreplace')) 303 428 ntext = newntext 304 429 … … 331 456 332 457 class Placename: 333 def __init__(self, nstring=u'', onstring=u'', sequence=0, completeness='complete', certainty='certain', accuracy='ac urate', inferred=False):458 def __init__(self, nstring=u'', onstring=u'', sequence=0, completeness='complete', certainty='certain', accuracy='accurate', inferred=False): 334 459 self.name=nstring 335 460 self.originalNameString = onstring … … 344 469 345 470 def parse_periodcell(self, periodcell): 471 print "parse_periodcell" 346 472 periods=[] 473 pctext = normalizetext(getalltext(periodcell)) 474 if len(pctext) > 0: 475 periodishes = periodcell.xpath("descendant::*[local-name()='p']") 476 try: 477 periodish = periodishes[self.originalPosition] 478 except: 479 pass 480 if periodish: 481 lpstring = normalizetext(getalltext(periodish)) 482 if len(lpstring) > 0: 483 for pcode in periodcodes: 484 qpcode = '%s?' % pcode 485 if lpstring.find(qpcode) != -1: 486 lpstring = lpstring.replace(qpcode, '') 487 if lpstring.find(pcode) != -1: 488 lpstring = lpstring.replace(pcode, '') 489 periods.append((pcode, 'confident')) 490 else: 491 periods.append((pcode, 'less-confident')) 492 else: 493 if lpstring.find(pcode) != -1: 494 lpstring = lpstring.replace(pcode, '') 495 periods.append((pcode, 'confident')) 496 347 497 return periods 348 498
