Changeset 1344

Show
Ignore:
Timestamp:
07/22/08 13:36:47 (3 months ago)
Author:
thomase
Message:

suppress coastal change; develop more nuanced handling of bad markup inside names; add parsing support for many table types including: canal, causeway, cemetery, church 'cluster', clausurae, coastal change, dams, dikes, forts, lettered sites/places/water, levees, mines/quarries, monuments, numbered fort groups, road stations, springs, tumuli, wells

Files:

Legend:

Unmodified
Added
Removed
Modified
Copied
Moved
  • BADataMunger/trunk/batlaspipe.py

    r1339 r1344  
    156156                grid = place.grid 
    157157 
    158             if place.dirtype in ['road']: 
     158            if place.dirtype in ['road','coastal-change']: 
    159159                logging.info("batlaspipe.idit() deliberately suppressed ID creation for %s: (%s, %s, %s, %s)" % (place.dirtype, place.namestring, place.grid, place.locdesc, u' - '.join(place.itinerary))) 
    160160            else: 
  • BADataMunger/trunk/bidmaker.py

    r1338 r1344  
    4444        phrase = origstring.strip() 
    4545         
    46         if dirtype in ['road']: 
     46        if dirtype in ['road','coastal-change']: 
    4747            # suppress certain types because they're not helpful 
    4848            return u'' 
    4949             
    50         elif dirtype in ['earthworks']: 
     50        elif dirtype in ['canal', 'earthworks']: 
    5151            # might be named or unnamed 
    5252            if len(phrase) > 0: 
     
    5656                phrase = "%s %s" % (dirtype, locdesc.strip()) 
    5757         
    58         elif len(placenames) > 0 or dirtype in ['name','numbered']: 
     58        elif len(placenames) > 0 or dirtype in ['name','numbered','lettered']: 
    5959            # the most basic case: features with names: 
    6060            #     use the batlas directory namestring 
     
    6666         
    6767             
    68         elif dirtype in ['aqueduct', 'aqueduct-group', 'bridge', 'levee', 'quarry', 'villa', 'villa-group', 'wall']: 
     68        elif dirtype in ['aqueduct', 'aqueduct-group', 'bridge', 'bridge-group', 'causeway', 'cemetery', 'church-group', 'dam', 'fort', 'fort-group', 'levee', 'monument', 'quarry', 'quarry-group', 'road-station', 'spring', 'tumulus', 'villa', 'villa-group', 'wall', 'well']: 
    6969            # second most common case: more-or-less normal features without names 
    7070            phrase = "%s %s %s" % (dirtype, phrase, locdesc) 
  • BADataMunger/trunk/tableparser.py

    r1337 r1344  
    1616CMAP_TMPL_NAMED_5 = {'grid':0,'placenames':1,'namestring':1, 'periods':2,'locdesc':3,'references':4} 
    1717 
    18 GROUP_COUNT_REGEX = re.compile(r'\s+\((.)\)$') 
     18CLEAN_NAMEISH_REGEX = re.compile(r'[\*\[\]]') 
     19 
     20GROUP_COUNT_REGEX = re.compile(r'\s+\((\d)\)$') 
     21 
     22POSTFIX_NUMERAL_REGEX = re.compile(r'([^\d]+)\d$') 
    1923 
    2024tabletypes = { 
     
    2327    "Bridge":"bridge", 
    2428    "Bridges":"bridge", 
     29    "Canals (unnamed)":"canal", 
     30    "Canal (unnamed)":"canal", 
     31    "Canals (named)":"canal_named", 
     32    "Causeway":"causeway", 
     33    "Causeways":"causeway", 
     34    "Cemeteries":"cemetery", 
     35    "Church Cluster":"church_group", 
     36    "Clausurae (linear barriers)":"clausura", 
     37    "Coastal Change":"coastal_change", 
     38    "Dam":"dam", 
     39    "Dikes / Levees":"levee", 
    2540    "Earthworks":"earthworks", 
    2641    "False Toponyms":"false", 
    2742    "False Toponym":"false", 
     43    "Fort":"fort", 
     44    "Forts":"fort", 
     45    "Forts / Fortified Settlements":"fort_set", 
     46    "Lettered Sites":"lettered_site", 
     47    "Lettered Place Names":"lettered_places", 
     48    "Lettered Water Names":"lettered_water", 
    2849    "Levee":"levee", 
    2950    "Levees":"levee", 
     51    "Mines / Quarries":"quarry", 
     52    "Monuments":"monument", 
    3053    "Name":"name", 
    3154    "Names":"name", 
    3255    "Numbered Sites":"numbered", 
     56    "Numbered Fort Group":"numbered_fort_group", 
    3357    "Quarry":"quarry", 
    3458    "Quarries":"quarry", 
    3559    "Roads":"road", 
    3660    "Road":"road",     
     61    "Road Station":"road_station", 
     62    "Springs":"spring", 
     63    "Tumuli":"tumulus", 
    3764    "Unlocated Toponyms":"unlocated", 
    3865    "Unlocated Toponym":"unlocated", 
     
    4067    "Villa":"villa", 
    4168    "Walls":"wall", 
    42     "Wall":"wall" 
     69    "Wall":"wall", 
     70    "Well":"well" 
    4371} 
    4472 
     
    310338                    print "cellmap:" 
    311339                    print cellmap 
    312             if k == 'number':
     340            if k == 'number' or k == 'letter':
    313341                self.number = normalizetext(getalltext(cells[v])) 
    314342            if k == 'itinerary': 
     
    346374        self.dirtype = 'bridge' 
    347375        cmap = CMAP_TMPL_UN_4 
    348         self.addinator(cells, cmap)             
     376        self.addinator(cells, cmap)        
     377         
     378    def add_canal(self, cells): 
     379        self.dirtype = 'canal' 
     380        cmap = CMAP_TMPL_UN_4 
     381        self.addinator(cells, cmap) 
     382         
     383    def add_canal_named(self, cells): 
     384        self.dirtype = 'canal' 
     385        cmap = CMAP_TMPL_NAMED_5         
     386        self.addinator(cells,cmap) 
     387         
     388    def add_causeway(self, cells): 
     389        self.dirtype = 'causeway' 
     390        cmap = CMAP_TMPL_UN_4 
     391        self.addinator(cells, cmap) 
     392         
     393    def add_cemetery(self, cells): 
     394        self.dirtype = 'cemetery' 
     395        cmap = CMAP_TMPL_UN_4 
     396        self.addinator(cells, cmap) 
     397         
     398    def add_church_group(self, cells): 
     399        self.dirtype = 'church-group' 
     400        cmap = CMAP_TMPL_UN_4 
     401        self.addinator(cells, cmap) 
     402         
     403    def add_clausura(self, cells): 
     404        self.dirtype = 'wall' 
     405        self.types.append('clausura') 
     406        cmap = CMAP_TMPL_UN_4 
     407        self.addinator(cells, cmap) 
     408         
     409    def add_coastal_change(self, cells): 
     410        self.dirtype = 'coastal-change' 
     411        cmap = CMAP_TMPL_UN_4 
     412        self.addinator(cells, cmap) 
     413         
     414    def add_dam(self, cells): 
     415        self.dirtype = 'dam' 
     416        cmap = CMAP_TMPL_UN_4 
     417        self.addinator(cells, cmap) 
    349418         
    350419    def add_earthworks(self, cells): 
     
    367436        cmap = {'grid':0, 'locdesc':1, 'periods':2, 'material':3, 'references':4} 
    368437        self.addinator(cells,cmap) 
     438         
     439    def add_monument(self, cells): 
     440        self.dirtype = 'monument' 
     441        cmap = CMAP_TMPL_UN_4 
     442        self.addinator(cells, cmap) 
    369443     
    370444    def add_name(self, cells): 
     
    379453        self.placenames = self.parse_placenames(cells[0], None, cells[1]) 
    380454         
     455    def add_fort(self, cells): 
     456        self.dirtype = 'fort' 
     457        cmap = CMAP_TMPL_UN_4 
     458        self.addinator(cells, cmap) 
     459         
     460    def add_fort_set(self, cells): 
     461        self.dirtype = 'fort' 
     462        self.types.append('fortified-settlement') 
     463        cmap = CMAP_TMPL_UN_4 
     464        self.addinator(cells, cmap) 
     465         
    381466    def add_numbered(self, cells): 
    382467        self.dirtype = 'numbered' 
     
    384469        self.addinator(cells, cmap) 
    385470         
     471    def add_numbered_fort_group(self, cells): 
     472        self.dirtype = 'numbered' 
     473        self.types.append('fort-group') 
     474        cmap = {'number':0, 'namestring':0, 'grid':1, 'locdesc':2, 'periods':3, 'references':4}  
     475        self.addinator(cells, cmap) 
     476         
     477    def add_lettered_site(self, cells): 
     478        self.dirtype = 'lettered' 
     479        self.types.append('site') 
     480        cmap = {'letter':0, 'grid':1, 'placenames':2, 'namestring':2, 'periods':3, 'locdesc':4, 'references':5} 
     481        self.addinator(cells, cmap) 
     482         
     483    def add_lettered_places(self, cells): 
     484        self.dirtype = 'lettered' 
     485        self.types.append('place') 
     486        cmap = {'letter':0, 'grid':1, 'placenames':2, 'namestring':2, 'periods':3, 'locdesc':4, 'references':5} 
     487        self.addinator(cells, cmap) 
     488         
     489    def add_lettered_water(self, cells): 
     490        self.dirtype = 'lettered' 
     491        self.types.append('water') 
     492        cmap = {'letter':0, 'grid':1, 'placenames':2, 'namestring':2, 'periods':3, 'locdesc':4, 'references':5} 
     493        self.addinator(cells, cmap) 
     494 
    386495    def add_road(self, cells): 
    387496        self.dirtype = 'road' 
     
    389498        self.addinator(cells,cmap) 
    390499         
     500    def add_road_station(self, cells): 
     501        self.dirtype = 'road-station' 
     502        cmap = CMAP_TMPL_UN_4 
     503        self.addinator(cells, cmap) 
     504         
     505    def add_tumulus(self, cells): 
     506        self.dirtype = 'tumulus' 
     507        cmap = CMAP_TMPL_UN_4 
     508        self.addinator(cells, cmap) 
     509         
     510    def add_spring(self, cells): 
     511        self.dirtype = 'spring' 
     512        cmap = CMAP_TMPL_UN_4 
     513        self.addinator(cells, cmap) 
     514         
    391515    def add_unlocated(self, cells): 
    392516        self.dirtype = 'unlocated' 
     
    401525    def add_wall(self, cells): 
    402526        self.dirtype = 'wall' 
     527        cmap = CMAP_TMPL_UN_4 
     528        self.addinator(cells, cmap) 
     529         
     530    def add_well(self, cells): 
     531        self.dirtype = 'well' 
    403532        cmap = CMAP_TMPL_UN_4 
    404533        self.addinator(cells, cmap) 
     
    449578        for i, nameish in enumerate(nameishes): 
    450579            # test to see if the nameish follows a "diamond" symbol (i.e., it is a minor alternative name) 
     580            diamond_regex = re.compile(u".+\u00A7.*%s([\s\/].*|$)" % CLEAN_NAMEISH_REGEX.sub('', nameish.strip())) 
     581            diamond_m = diamond_regex.search(celltext) 
     582             
     583            # test for italic tag (and deal with MSWord formatting goofiness along the way) 
    451584            try: 
    452                 diamond_regex = re.compile(u".+\u00A7.*%s" % nameish.strip().replace('*', '')) 
    453             except sre_constants.error, detail: 
    454                 logging.critical("error attempting to create a diamond_regex for nameish = '%s'; \n\terror detail: %s" % (nameish.strip(), detail)) 
     585                nameishmatches = namecell.xpath('descendant-or-self::*[contains(normalize-space(text()),"%s")]' % nameish.strip()) 
     586            except etree.XPathSyntaxError, detail: 
     587                logging.critical ("incorporating nameish (%s) into an xpath expression caused a fatal error: %s" % (nameish.strip(), detail)) 
    455588                raise 
    456             diamond_m = diamond_regex.match(celltext) 
    457              
    458             # test for italic tag 
    459             nameishmatches = namecell.xpath("descendant-or-self::*[contains(normalize-space(text()),'%s')]" % nameish.strip()) 
    460589            try: 
    461590                deepestnameishmatch = nameishmatches[-1] 
    462591            except IndexError, detail: 
    463592                cellxml = etree.tostring(namecell) 
    464                 #logging.warning("error attempting to use xpath to find '%s' in xml\n:%s\nattempting to trap for serial italic tags" % (nameish.strip(), cellxml)) 
     593                m_postfix_numeral = POSTFIX_NUMERAL_REGEX.match(nameish.strip()) 
    465594                if '</i><i>' in cellxml: 
    466                     #logging.info('found </i></i> in cell xml, attempting to remove and retry') 
     595                    # serial italic tags (pointless msword artifact); replace them and try again 
    467596                    cellxml = cellxml.replace('</i><i>', '') 
    468597                    celltree = etree.XML(cellxml) 
    469                     #logging.info("result of replacement attempt: '%s'" % etree.tostring(celltree)) 
    470598                    nameishmatches = celltree.xpath("descendant-or-self::*[contains(normalize-space(text()), '%s')]" % nameish.strip()) 
    471599                    if len(nameishmatches) > 0: 
    472600                        logging.info("adjusted for serial italic tags in a name field for %s" % nameish.strip()) 
    473601                    deepestnameishmatch = nameishmatches[-1] 
     602                elif m_postfix_numeral: 
     603                    # there's a postfix numeral that may be causing trouble; try again without it 
     604                    nameishmatches = namecell.xpath('descendant-or-self::*[contains(normalize-space(text()), "%s")]' % m_postfix_numeral.groups(1)[0].strip()) 
     605                    if len(nameishmatches) > 0: 
     606                        logging.info("adjusted for postfix numerial in a name field for %s" % nameish.strip()) 
     607                    else: 
     608                        logging.critical("tried to adjust for a postfix numeral in a name field for %s but it didn't work\n\tcelltext: '%s'\n\tcelltxml:\n%s\n\tregex group: '%s'" % (nameish.strip(), celltext, cellxml, m_postfix_numeral.groups(1)[0].strip())) 
     609                    deepestnameishmatch = nameishmatches[-1] 
    474610                else: 
    475                     logging.critical ("serial italics do not seem to be the problem; error detail: %s" % detail)
     611                    logging.critical ("index error: serial italics do not seem to be the problem\n\terror detail: %s\n\tnameish: %s\n\tcelltext: %s\n\tcelltxml:\n%s" % (detail, nameish.strip(), celltext, cellxml))
    476612                    raise 
    477613