Changeset 1344
- Timestamp: 07/22/08 13:36:47 (3 months ago)
- Files:
  - BADataMunger/trunk/batlaspipe.py (modified) (1 diff)
  - BADataMunger/trunk/bidmaker.py (modified) (3 diffs)
  - BADataMunger/trunk/tableparser.py (modified) (11 diffs)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
BADataMunger/trunk/batlaspipe.py
r1339 r1344 156 156 grid = place.grid 157 157 158 if place.dirtype in ['road' ]:158 if place.dirtype in ['road','coastal-change']: 159 159 logging.info("batlaspipe.idit() deliberately suppressed ID creation for %s: (%s, %s, %s, %s)" % (place.dirtype, place.namestring, place.grid, place.locdesc, u' - '.join(place.itinerary))) 160 160 else: BADataMunger/trunk/bidmaker.py
r1338 r1344 44 44 phrase = origstring.strip() 45 45 46 if dirtype in ['road' ]:46 if dirtype in ['road','coastal-change']: 47 47 # suppress certain types because they're not helpful 48 48 return u'' 49 49 50 elif dirtype in [' earthworks']:50 elif dirtype in ['canal', 'earthworks']: 51 51 # might be named or unnamed 52 52 if len(phrase) > 0: … … 56 56 phrase = "%s %s" % (dirtype, locdesc.strip()) 57 57 58 elif len(placenames) > 0 or dirtype in ['name','numbered' ]:58 elif len(placenames) > 0 or dirtype in ['name','numbered','lettered']: 59 59 # the most basic case: features with names: 60 60 # use the batlas directory namestring … … 66 66 67 67 68 elif dirtype in ['aqueduct', 'aqueduct-group', 'bridge', ' levee', 'quarry', 'villa', 'villa-group', 'wall']:68 elif dirtype in ['aqueduct', 'aqueduct-group', 'bridge', 'bridge-group', 'causeway', 'cemetery', 'church-group', 'dam', 'fort', 'fort-group', 'levee', 'monument', 'quarry', 'quarry-group', 'road-station', 'spring', 'tumulus', 'villa', 'villa-group', 'wall', 'well']: 69 69 # second most common case: more-or-less normal features without names 70 70 phrase = "%s %s %s" % (dirtype, phrase, locdesc) BADataMunger/trunk/tableparser.py
r1337 r1344 16 16 CMAP_TMPL_NAMED_5 = {'grid':0,'placenames':1,'namestring':1, 'periods':2,'locdesc':3,'references':4} 17 17 18 GROUP_COUNT_REGEX = re.compile(r'\s+\((.)\)$') 18 CLEAN_NAMEISH_REGEX = re.compile(r'[\*\[\]]') 19 20 GROUP_COUNT_REGEX = re.compile(r'\s+\((\d)\)$') 21 22 POSTFIX_NUMERAL_REGEX = re.compile(r'([^\d]+)\d$') 19 23 20 24 tabletypes = { … … 23 27 "Bridge":"bridge", 24 28 "Bridges":"bridge", 29 "Canals (unnamed)":"canal", 30 "Canal (unnamed)":"canal", 31 "Canals (named)":"canal_named", 32 "Causeway":"causeway", 33 "Causeways":"causeway", 34 "Cemeteries":"cemetery", 35 "Church Cluster":"church_group", 36 "Clausurae (linear barriers)":"clausura", 37 "Coastal Change":"coastal_change", 38 "Dam":"dam", 39 "Dikes / Levees":"levee", 25 40 "Earthworks":"earthworks", 26 41 "False Toponyms":"false", 27 42 "False Toponym":"false", 43 "Fort":"fort", 44 "Forts":"fort", 45 "Forts / Fortified Settlements":"fort_set", 46 "Lettered Sites":"lettered_site", 47 "Lettered Place Names":"lettered_places", 48 "Lettered Water Names":"lettered_water", 28 49 "Levee":"levee", 29 50 "Levees":"levee", 51 "Mines / Quarries":"quarry", 52 "Monuments":"monument", 30 53 "Name":"name", 31 54 "Names":"name", 32 55 "Numbered Sites":"numbered", 56 "Numbered Fort Group":"numbered_fort_group", 33 57 "Quarry":"quarry", 34 58 "Quarries":"quarry", 35 59 "Roads":"road", 36 60 "Road":"road", 61 "Road Station":"road_station", 62 "Springs":"spring", 63 "Tumuli":"tumulus", 37 64 "Unlocated Toponyms":"unlocated", 38 65 "Unlocated Toponym":"unlocated", … … 40 67 "Villa":"villa", 41 68 "Walls":"wall", 42 "Wall":"wall" 69 "Wall":"wall", 70 "Well":"well" 43 71 } 44 72 … … 310 338 print "cellmap:" 311 339 print cellmap 312 if k == 'number' :340 if k == 'number' or k == 'letter': 313 341 self.number = normalizetext(getalltext(cells[v])) 314 342 if k == 'itinerary': … … 346 374 self.dirtype = 'bridge' 347 375 cmap = CMAP_TMPL_UN_4 348 self.addinator(cells, cmap) 376 self.addinator(cells, cmap) 377 
378 def add_canal(self, cells): 379 self.dirtype = 'canal' 380 cmap = CMAP_TMPL_UN_4 381 self.addinator(cells, cmap) 382 383 def add_canal_named(self, cells): 384 self.dirtype = 'canal' 385 cmap = CMAP_TMPL_NAMED_5 386 self.addinator(cells,cmap) 387 388 def add_causeway(self, cells): 389 self.dirtype = 'causeway' 390 cmap = CMAP_TMPL_UN_4 391 self.addinator(cells, cmap) 392 393 def add_cemetery(self, cells): 394 self.dirtype = 'cemetery' 395 cmap = CMAP_TMPL_UN_4 396 self.addinator(cells, cmap) 397 398 def add_church_group(self, cells): 399 self.dirtype = 'church-group' 400 cmap = CMAP_TMPL_UN_4 401 self.addinator(cells, cmap) 402 403 def add_clausura(self, cells): 404 self.dirtype = 'wall' 405 self.types.append('clausura') 406 cmap = CMAP_TMPL_UN_4 407 self.addinator(cells, cmap) 408 409 def add_coastal_change(self, cells): 410 self.dirtype = 'coastal-change' 411 cmap = CMAP_TMPL_UN_4 412 self.addinator(cells, cmap) 413 414 def add_dam(self, cells): 415 self.dirtype = 'dam' 416 cmap = CMAP_TMPL_UN_4 417 self.addinator(cells, cmap) 349 418 350 419 def add_earthworks(self, cells): … … 367 436 cmap = {'grid':0, 'locdesc':1, 'periods':2, 'material':3, 'references':4} 368 437 self.addinator(cells,cmap) 438 439 def add_monument(self, cells): 440 self.dirtype = 'monument' 441 cmap = CMAP_TMPL_UN_4 442 self.addinator(cells, cmap) 369 443 370 444 def add_name(self, cells): … … 379 453 self.placenames = self.parse_placenames(cells[0], None, cells[1]) 380 454 455 def add_fort(self, cells): 456 self.dirtype = 'fort' 457 cmap = CMAP_TMPL_UN_4 458 self.addinator(cells, cmap) 459 460 def add_fort_set(self, cells): 461 self.dirtype = 'fort' 462 self.types.append('fortified-settlement') 463 cmap = CMAP_TMPL_UN_4 464 self.addinator(cells, cmap) 465 381 466 def add_numbered(self, cells): 382 467 self.dirtype = 'numbered' … … 384 469 self.addinator(cells, cmap) 385 470 471 def add_numbered_fort_group(self, cells): 472 self.dirtype = 'numbered' 473 self.types.append('fort-group') 474 
cmap = {'number':0, 'namestring':0, 'grid':1, 'locdesc':2, 'periods':3, 'references':4} 475 self.addinator(cells, cmap) 476 477 def add_lettered_site(self, cells): 478 self.dirtype = 'lettered' 479 self.types.append('site') 480 cmap = {'letter':0, 'grid':1, 'placenames':2, 'namestring':2, 'periods':3, 'locdesc':4, 'references':5} 481 self.addinator(cells, cmap) 482 483 def add_lettered_places(self, cells): 484 self.dirtype = 'lettered' 485 self.types.append('place') 486 cmap = {'letter':0, 'grid':1, 'placenames':2, 'namestring':2, 'periods':3, 'locdesc':4, 'references':5} 487 self.addinator(cells, cmap) 488 489 def add_lettered_water(self, cells): 490 self.dirtype = 'lettered' 491 self.types.append('water') 492 cmap = {'letter':0, 'grid':1, 'placenames':2, 'namestring':2, 'periods':3, 'locdesc':4, 'references':5} 493 self.addinator(cells, cmap) 494 386 495 def add_road(self, cells): 387 496 self.dirtype = 'road' … … 389 498 self.addinator(cells,cmap) 390 499 500 def add_road_station(self, cells): 501 self.dirtype = 'road-station' 502 cmap = CMAP_TMPL_UN_4 503 self.addinator(cells, cmap) 504 505 def add_tumulus(self, cells): 506 self.dirtype = 'tumulus' 507 cmap = CMAP_TMPL_UN_4 508 self.addinator(cells, cmap) 509 510 def add_spring(self, cells): 511 self.dirtype = 'spring' 512 cmap = CMAP_TMPL_UN_4 513 self.addinator(cells, cmap) 514 391 515 def add_unlocated(self, cells): 392 516 self.dirtype = 'unlocated' … … 401 525 def add_wall(self, cells): 402 526 self.dirtype = 'wall' 527 cmap = CMAP_TMPL_UN_4 528 self.addinator(cells, cmap) 529 530 def add_well(self, cells): 531 self.dirtype = 'well' 403 532 cmap = CMAP_TMPL_UN_4 404 533 self.addinator(cells, cmap) … … 449 578 for i, nameish in enumerate(nameishes): 450 579 # test to see if the nameish follows a "diamond" symbol (i.e., it is a minor alternative name) 580 diamond_regex = re.compile(u".+\u00A7.*%s([\s\/].*|$)" % CLEAN_NAMEISH_REGEX.sub('', nameish.strip())) 581 diamond_m = diamond_regex.search(celltext) 582 
583 # test for italic tag (and deal with MSWord formatting goofiness along the way) 451 584 try: 452 diamond_regex = re.compile(u".+\u00A7.*%s" % nameish.strip().replace('*', ''))453 except sre_constants.error, detail:454 logging.critical ("error attempting to create a diamond_regex for nameish = '%s'; \n\terror detail: %s" % (nameish.strip(), detail))585 nameishmatches = namecell.xpath('descendant-or-self::*[contains(normalize-space(text()),"%s")]' % nameish.strip()) 586 except etree.XPathSyntaxError, detail: 587 logging.critical ("incorporating nameish (%s) into an xpath expression caused a fatal error: %s" % (nameish.strip(), detail)) 455 588 raise 456 diamond_m = diamond_regex.match(celltext)457 458 # test for italic tag459 nameishmatches = namecell.xpath("descendant-or-self::*[contains(normalize-space(text()),'%s')]" % nameish.strip())460 589 try: 461 590 deepestnameishmatch = nameishmatches[-1] 462 591 except IndexError, detail: 463 592 cellxml = etree.tostring(namecell) 464 #logging.warning("error attempting to use xpath to find '%s' in xml\n:%s\nattempting to trap for serial italic tags" % (nameish.strip(), cellxml))593 m_postfix_numeral = POSTFIX_NUMERAL_REGEX.match(nameish.strip()) 465 594 if '</i><i>' in cellxml: 466 # logging.info('found </i></i> in cell xml, attempting to remove and retry')595 # serial italic tags (pointless msword artifact); replace them and try again 467 596 cellxml = cellxml.replace('</i><i>', '') 468 597 celltree = etree.XML(cellxml) 469 #logging.info("result of replacement attempt: '%s'" % etree.tostring(celltree))470 598 nameishmatches = celltree.xpath("descendant-or-self::*[contains(normalize-space(text()), '%s')]" % nameish.strip()) 471 599 if len(nameishmatches) > 0: 472 600 logging.info("adjusted for serial italic tags in a name field for %s" % nameish.strip()) 473 601 deepestnameishmatch = nameishmatches[-1] 602 elif m_postfix_numeral: 603 # there's a postfix numeral that may be causing trouble; try again without it 604 
nameishmatches = namecell.xpath('descendant-or-self::*[contains(normalize-space(text()), "%s")]' % m_postfix_numeral.groups(1)[0].strip()) 605 if len(nameishmatches) > 0: 606 logging.info("adjusted for postfix numerial in a name field for %s" % nameish.strip()) 607 else: 608 logging.critical("tried to adjust for a postfix numeral in a name field for %s but it didn't work\n\tcelltext: '%s'\n\tcelltxml:\n%s\n\tregex group: '%s'" % (nameish.strip(), celltext, cellxml, m_postfix_numeral.groups(1)[0].strip())) 609 deepestnameishmatch = nameishmatches[-1] 474 610 else: 475 logging.critical (" serial italics do not seem to be the problem; error detail: %s" % detail)611 logging.critical ("index error: serial italics do not seem to be the problem\n\terror detail: %s\n\tnameish: %s\n\tcelltext: %s\n\tcelltxml:\n%s" % (detail, nameish.strip(), celltext, cellxml)) 476 612 raise 477 613
