Changeset 1330

Show
Ignore:
Timestamp:
07/11/08 15:48:17 (3 months ago)
Author:
thomase
Message:

better reporting and failure detection; handle earthworks properly; more robust handling of italicized names in source (previously some locations were miscategorized as historical names); distinguish between different types of name variants; more robust handling of whitespace and type-related postfix strings for island and river names

Files:

Legend:

Unmodified
Added
Removed
Modified
Copied
Moved
  • BADataMunger/trunk/tableparser.py

    r1179 r1330  
    22import operator 
    33import re 
     4import sre_constants 
    45 
    56import lxml.etree as etree 
     
    5354    places = [] 
    5455    logging.info("BEGIN parsing a %s table: tableparser.parsetable()" % tabletype) 
    55     for ri, row in enumerate(table[1].xpath("descendant::*[local-name()='tr']")): 
     56    rows = table[1].xpath("descendant::*[local-name()='tr']") 
     57    rows_outref = 0 
     58    rows_inref = 0 
     59    rows_header = 0 
     60    rows_blank = 0 
     61    rows_xref = 0 
     62    for ri, row in enumerate(rows): 
    5663        rowtext = normalizetext(getalltext(row)) 
    5764        if len(row.xpath("ancestor::*[local-name()='thead']")) > 0 or ri == 0: 
    5865            # ignore stuff in the table header 
    59             pass 
     66            rows_header+=1 
    6067        elif rowtext.find('See Map') != -1: 
    6168            # ignore external cross references 
    62             pass 
    63         elif rowtext.find('See Roads') != -1: 
     69            rows_outref+=1 
     70        elif rowtext.find('See ') != -1: 
    6471            # ignore cross references to other tables in this directory 
    65             pass 
    66         elif rowtext.find('See Earthworks') != -1: 
    67             pass 
    68         elif rowtext.find('See Walls') != -1: 
    69             pass 
     72            rows_inref+=1 
    7073        elif rowtext == u'' or rowtext == u' ': 
    7174            # ignore empty rows 
    72             pass 
     75            rows_blank+=1 
    7376        elif rowtext.find(u' = ') != -1: 
    7477            # ignore internal cross references 
    75             pass 
    76         elif tabletype == 'earthworks': 
    77             print "%s: %s" % (tabletype, rowtext.encode('ascii', 'backslashreplace')) 
     78            rows_xref+=1 
     79#        elif tabletype == 'earthworks': 
     80#            logging.warning("no handling for %s\n>>>> table header: %s\n>>>> xml this row:\n%s" % (tabletype, getalltext(table[1].xpath("descendant::*[local-name()='tr']")[0]).replace('\n',' ').strip(), etree.tostring(row).encode('ascii', 'backslashreplace'))) 
    7881        else: 
    7982            p = Place() 
     
    97100                     
    98101            places = places + [p] 
    99     logging.info("DONE with tableparser.parsetable(); found %s places of type %s" % (len(places), tabletype)) 
     102    rows_act = len(rows) - rows_outref - rows_inref - rows_header - rows_blank - rows_xref 
     103    lmsg = "DONE with tableparser.parsetable(); created %s places of type %s from a table containing %s actionable rows" % (len(places), tabletype, rows_act) 
     104    if len(places) != rows_act: 
     105        logging.warning(lmsg) 
     106    else: 
     107        logging.info(lmsg) 
    100108         
    101109         
     
    159167         
    160168    def __str__(self): 
    161         result = '----------\n%s\n' % self.type 
     169        result = '----------\n%s\n' % self.dirtype 
    162170        if len(self.grid) > 0: 
    163171            result += 'grid: %s\n' % self.grid 
     
    286294         
    287295    def add_earthworks(self, cells): 
    288         # grid | locdesc | periods | references 
     296        # grid | name/locdesc | periods | references 
    289297        self.dirtype = 'earthworks' 
    290298        self.grid = normalizetext(getalltext(cells[0])) 
    291         self.locdesc = normalizetext(getalltext(cells[1])) 
     299        # parse_placenames tests for content inside italic or not; if inside italic, assumed to be locdesc, and added accordingly; otherwise 
     300        # assumed to be placename, also added accordingly 
     301        self.placenames = self.parse_placenames(cells[1], cells[2], cells[3]) 
     302        # self.locdesc = normalizetext(getalltext(cells[1])) 
    292303        self.periods = self.parse_periods(cells[2]) 
    293304        self.references = self.parse_references(cells[3]) 
     
    317328        self.namestring = normalizetext(getalltext(cells[1])) 
    318329        self.references = self.parse_references(cells[4]) 
    319          
    320330         
    321331    def add_false(self, cells): 
     
    409419            nameishes = [celltext,] 
    410420        for i, nameish in enumerate(nameishes): 
    411             if len(namecell.xpath("descendant::*[text()='%s'][last()]/ancestor-or-self::*[local-name()='i']" % nameish)) != 0: 
     421            # test to see if the nameish follows a "diamond" symbol (i.e., it is a minor alternative name) 
     422            try: 
     423                diamond_regex = re.compile(u".+\u00A7.*%s" % nameish.strip().replace('*', '')) 
     424            except sre_constants.error, detail: 
     425                logging.critical("error attempting to create a diamond_regex for nameish = '%s'; \n\terror detail: %s" % (nameish.strip(), detail)) 
     426                raise 
     427            diamond_m = diamond_regex.match(celltext) 
     428             
     429            # test for italic tag 
     430            nameishmatches = namecell.xpath("descendant-or-self::*[contains(normalize-space(text()),'%s')]" % nameish.strip()) 
     431            deepestnameishmatch = nameishmatches[-1] 
     432            nameishitalics = deepestnameishmatch.xpath("ancestor-or-self::*[local-name()='i']") 
     433            #if 'Brazda' in nameish: 
     434                #logging.debug("nameish = '%s'\n>>>> cellxml:\n%s\n" % (nameish, etree.tostring(namecell))) 
     435                #logging.debug("%s nameish matches" % len(nameishmatches)) 
     436                #for nm in nameishmatches: 
     437                #    logging.debug("nameish match xml:\n%s\n" % etree.tostring(nm)) 
     438                #logging.debug("deepestnameish match xml:\n%s\n" % deepestnameishmatch) 
     439                #logging.debug("%s nameishitalics" % len(nameishitalics)) 
     440                 
     441            if len(nameishitalics) != 0: 
    412442                # this is actually the modern location, not the placename 
     443                #logging.debug("interpreting italicized content in names column as location description for %s" % nameish.strip()) 
    413444                if len(self.locdesc) == 0: 
    414                     self.locdesc = nameish 
     445                    self.locdesc = nameish.strip() 
    415446                else: 
    416                     self.locdesc = u"%s, %s" % (nameish, self.locdesc) 
     447                    self.locdesc = u"%s, %s" % (nameish.strip(), self.locdesc) 
    417448            elif nameish.find(u'(') != -1: 
    418                 variants = parse_name_variants(nameish) 
     449                # this nameish contains parentheses, which means that we need to generate and store multiple variants 
     450                variants = parse_name_variants(nameish.strip()) 
    419451                for variant in variants: 
    420                     pn = self.munge_placename(namecell, variant, i
     452                    pn = self.munge_placename(namecell, variant, i, variant=True
    421453                    placenames.append(pn) 
     454            elif diamond_m: 
     455                # this nameish is a minor variant 
     456                pn = self.munge_placename(namecell, nameish.strip(), i, minor=True) 
     457                placenames.append(pn) 
    422458            else: 
    423                 pn = self.munge_placename(namecell, nameish, i) 
     459                # nothing special about this nameish; just record it as a placename 
     460                pn = self.munge_placename(namecell, nameish.strip(), i) 
    424461                placenames.append(pn) 
    425462                 
     
    456493         
    457494 
    458     def munge_placename(self, namecell, nametext, sequence): 
     495    def munge_placename(self, namecell, nametext, sequence,variant=False, minor=False): 
    459496        ntext = nametext 
    460497         
     
    499536            #print "'%s' is inferred (will become '%s')" % (ntext.encode('ascii', 'backslashreplace'), newntext.encode('ascii', 'backslashreplace')) 
    500537            ntext = newntext 
    501          
     538 
     539        # handle river name nonsense             
     540        if ntext[-3:] == u'fl.': 
     541            if 'river' not in self.types: 
     542                self.types.append('river') 
     543            ntext = ntext[:-3].strip() 
     544                 
     545        # handle island name nonsense 
     546        if ntext[-4:] == u'Ins.': 
     547            if 'island' not in self.types: 
     548                self.types.append('island') 
     549            ntext = ntext[:-4].strip() 
     550        if ntext[-5:] == u'Inss.': 
     551            if 'island-groups' not in self.types: 
     552                self.types.append('island-group') 
     553            ntext = ntext[:-5].strip() 
    502554         
    503555        # objectify the placename 
    504         pn = Placename(ntext, normalizetext(getalltext(namecell)), sequence, completeness, certainty, accuracy, inferred
     556        pn = Placename(ntext, normalizetext(getalltext(namecell)), sequence, completeness, certainty, accuracy, inferred, variant, minor
    505557        return pn 
    506558         
     
    528580 
    529581class Placename: 
    530     def __init__(self, nstring=u'', onstring=u'', sequence=0, completeness='complete', certainty='certain', accuracy='accurate', inferred=False): 
     582    def __init__(self, nstring=u'', onstring=u'', sequence=0, completeness='complete', certainty='certain', accuracy='accurate', inferred=False,variant=False,minor=False): 
    531583        self.name=nstring 
    532584        self.originalNameString = onstring 
     
    538590        self.periods=[] 
    539591        self.references=[] 
    540          
    541          
     592        self.variant=variant 
     593        self.minorAlternative=minor 
    542594         
    543595    def __str__(self):