Changeset 1427 for BADataMunger

Show
Ignore:
Timestamp:
09/12/08 15:45:31 (2 months ago)
Author:
thomase
Message:

sticking a fork in the aspect of BADataMunger that is the creation of batlas ids, or so I hope

Files:

Legend:

Unmodified
Added
Removed
Modified
Copied
Moved
  • BADataMunger/trunk

    • Property svn:ignore set to
      pleiades
      scratch
  • BADataMunger/trunk/batlaspipe.py

    r1371 r1427  
    178178                        if len(bid) > 0: 
    179179                            place.batlasids.append(bid) 
    180                         altlabel = maker.buildAltLabel(label, place.locdesc, placetypes) 
     180                        if len(place.placenames) > 0: 
     181                            altlabel = maker.buildAltLabel(label, place.placenames[0].name, placetypes) 
     182                        else: 
     183                            altlabel = maker.buildAltLabel(label, place.locdesc, placetypes) 
    181184                        bid = maker.makeID(altlabel, mapnum, grid, phraseprefix=label) 
    182185                        bid = self.vetid(bid, priorids) 
     
    195198                    
    196199                    if len(place.placenames) == 0: 
    197                         if place.dirtype == 'numbered': 
     200                        if place.dirtype == 'numbered' and len(place.placenames) == 0: 
    198201                            names = place.locdesc.split('/') 
    199202                            prefix = label 
     
    292295                        citname = SLASH_REGEX.sub('/',p.namestring.strip()) 
    293296                        citcontent = "BAtlas %s false name %s" % (self.map_number, citname) 
    294                     elif p.dirtype == 'numbered': 
     297                    elif p.dirtype == 'numbered' and len(p.placenames) == 0: 
    295298                        citname = SLASH_REGEX.sub('/',p.locdesc.strip()) 
     299                        citcontent = "BAtlas %s %s no. %s (%s)" % (self.map_number, p.grid, p.namestring, citname) 
     300                    elif p.dirtype == 'numbered' and len(p.placenames) > 0: 
     301                        citname = SLASH_REGEX.sub('/',p.placenames[0].name.strip()) 
    296302                        citcontent = "BAtlas %s %s no. %s (%s)" % (self.map_number, p.grid, p.namestring, citname) 
    297303                    elif p.dirtype == 'name' and len(p.namestring) == 0: 
  • BADataMunger/trunk/bidmaker.py

    r1370 r1427  
    6666         
    6767             
    68         elif dirtype in ['aqueduct', 'aqueduct-group', 'bath', 'bridge', 'bridge-group', 'causeway', 'cemetery', 'centuriation', 'centuriation-group', 'church-group', 'dam', 'dam-group', 'dike', 'dike-group', 'feature', 'feature-group', 'fort', 'fort-group', 'levee', 'lighthouse', 'lighthouse-group', 'mine', 'mole', 'monastery', 'monument', 'monument-group', 'pass', 'quarry', 'quarry-group', 'road-station', 'salt-pans', 'spring', 'tumulus', 'tunnel', 'villa', 'villa-group', 'wall', 'wall-group', 'waterwheel', 'well']: 
     68        elif dirtype in ['aqueduct', 'aqueduct-group', 'bath', 'bridge', 'bridge-group', 'causeway', 'cemetery', 'centuriation', 'centuriation-group', 'church-group', 'dam', 'dam-group', 'dike', 'dike-group', 'feature', 'feature-group', 'fort', 'fort-group', 'levee', 'lighthouse', 'lighthouse-group', 'mine', 'mine-group', 'mole', 'monastery', 'monument', 'monument-group', 'pass', 'quarry', 'quarry-group', 'reservoir', 'road-station', 'salt-pans', 'spring', 'tumulus', 'tunnel', 'villa', 'villa-group', 'wall', 'wall-group', 'waterwheel', 'well']: 
    6969            # second most common case: more-or-less normal features without names 
    7070            phrase = "%s %s %s" % (dirtype, phrase, locdesc) 
  • BADataMunger/trunk/etc/wordhtml

    • Property svn:ignore set to
      Shorcut*
      FinalDirVersions*
  • BADataMunger/trunk/tableparser.py

    r1369 r1427  
    2626SERIAL_ITALICS_SPACES_REGEX = re.compile('</i>(\s+)<i>') 
    2727SERIAL_ITALICS_COMMA_REGEX = re.compile('</i>(,\s+)<i>') 
     28EMPTY_ITALICS_REGEX = re.compile('<i> </i>') 
    2829 
    2930tabletypes = { 
     
    7374    "Mines / Quarries":"quarry", 
    7475    "Mines":"mine", 
     76    "Mines GLPMR":"mine_glpmr", 
    7577    "Mine GLPMR":"mine_glpmr", 
    7678    "Mines (Named)":"mine_glpmr_named", 
     
    8385    "Names":"name", 
    8486    "Names found only in Avienus":"names_avienus", 
     87    "Named Dioceses":"named_diocese", 
     88    "Named Provinces":"named_province", 
     89    "Named Provinces With Loc":"named_province_loc", 
     90    "Numbered Provinces":"numbered_province", 
     91    "Numbered Provinces With Loc":"numbered_province_loc", 
    8592    "Numbered Bridges":"numbered_bridge", 
    8693    "Numbered Features":"numbered", 
     
    9198    "Numbered Villas":"numbered_villas", 
    9299    "Pass":"pass", 
     100    "Pass (dated)":"pass_dated", 
    93101    "Passes (dated)":"pass_dated", 
    94102    "Quarry":"quarry", 
    95103    "Quarries":"quarry", 
    96104    "Quarry (dateless)":"quarry_dateless", 
     105    "Reservoir":"reservoir", 
    97106    "Roads":"road", 
    98107    "Road":"road",     
     
    107116    "Unlocated Toponyms":"unlocated", 
    108117    "Unlocated Toponym":"unlocated", 
     118    "Unlocated Toponyms (dateless)":"unlocated_dateless", 
     119    "Unlocated Toponym (dateless)":"unlocated_dateless", 
    109120    "Unlocated Coin Names":"unlocated_coin", 
     121    "Unlocated Provinces":"unlocated_province", 
    110122    "Unnamed Sites":"unnamed_site", 
    111123    "Villas":"villa", 
     
    113125    "Walls":"wall", 
    114126    "Wall":"wall", 
     127    "Walls (named)":"wall", 
    115128    "Wall (Named No Loc)" : "wall_named_noloc", 
    116129    "Wall (Unnamed)" : "wall_unnamed", 
     130    "Wall (unnamed)" : "wall_unnamed", 
    117131    "Walls (Unnamed)" : "wall_unnamed", 
     132    "Walls (unnamed)" : "wall_unnamed", 
    118133    "Walls (Dateless)":"wall_dateless", 
    119134    "Walls / Fortifications":"wall", 
     
    165180            # ignore empty rows 
    166181            rows_blank+=1 
    167         elif rowtext.find(u' = ') != -1: 
     182        elif rowtext.find(u'=') != -1: 
    168183            # ignore internal cross references 
    169184            rows_xref+=1 
     
    176191            p.rowi = ri 
    177192            rowcells = row.xpath("*[local-name()='td']") 
     193            #rowlen = 0 
     194            #rowlens = [len(''.join(cell.text.split())) for cell in rowcells] 
     195            #for l in rowlens: 
     196             #   rowlen = rowlen + l 
     197            #if rowlen < 5: 
     198             #   logging.critical("There's no content in here!") 
    178199            configs = pparser_config = context['config'].xpath('//%s' % tabletype) 
    179200            if len(configs) == 1: 
     
    614635     
    615636    def add_name(self, cells): 
     637        logging.debug("adding names for:\n\t%s\n\t%s\n\t%s\n\t%s\n\t%s" % tuple([etree.tostring(cell).replace('\n', ' ') for cell in cells])) 
    616638        self.dirtype = 'name' 
    617639        cmap = CMAP_TMPL_NAMED_5         
     
    700722        self.addinator(cells, cmap) 
    701723 
     724    def add_reservoir(self, cells): 
     725        self.dirtype = 'reservoir' 
     726        cmap = CMAP_TMPL_UN_4 
     727        self.addinator(cells, cmap) 
     728         
    702729    def add_road(self, cells): 
    703730        self.dirtype = 'road' 
     
    745772        self.addinator(cells, cmap) 
    746773         
     774    def add_unlocated_dateless(self, cells): 
     775        self.dirtype = 'unlocated' 
     776        cmap = {'placenames':0, 'namestring':0, 'locdesc':1, 'references':2} 
     777        self.addinator(cells, cmap) 
     778         
    747779    def add_unlocated_coin(self, cells): 
    748780        self.dirtype = 'unlocated_coin' 
     
    750782        self.addinator(cells, cmap) 
    751783         
     784    def add_unlocated_province(self, cells): 
     785        self.dirtype = 'unlocated' 
     786        self.types.append('province') 
     787        cmap = {'placenames':0, 'namestring':0, 'locdesc':1} 
     788        self.addinator(cells, cmap) 
     789 
    752790    def add_unnamed_site(self, cells): 
    753791        self.dirtype = 'feature' 
     
    758796        self.dirtype = 'unlocated' 
    759797        cmap = {'placenames':0, 'namestring':0, 'avienus':1, 'locdesc':2, 'references':3} 
     798        self.addinator(cells, cmap) 
     799         
     800    def add_named_diocese(self, cells): 
     801        self.dirtype = 'name' 
     802        self.types.append('diocese') 
     803        cmap = {'grid':0, 'namestring':1, 'placenames':1} 
     804        self.addinator(cells, cmap) 
     805         
     806    def add_named_province(self, cells): 
     807        self.dirtype = 'name' 
     808        self.types.append('province') 
     809        cmap = {'grid':0, 'placenames':1, 'namestring':1} 
     810        self.addinator(cells, cmap) 
     811         
     812    def add_named_province_loc(self, cells): 
     813        self.dirtype = 'name' 
     814        self.types.append('province') 
     815        cmap = {'grid':0, 'placenames':1, 'namestring':1, 'locdesc':2} 
     816        self.addinator(cells, cmap) 
     817         
     818    def add_numbered_province_loc(self, cells): 
     819        self.dirtype = 'numbered' 
     820        self.types.append('province') 
     821        cmap = {'number':0, 'grid':1, 'namestring':0, 'placenames':2, 'locdesc':3} 
     822        self.addinator(cells, cmap) 
     823         
     824    def add_numbered_province(self, cells): 
     825        self.dirtype = 'numbered' 
     826        self.types.append('province') 
     827        cmap = {'number':0, 'grid':1, 'namestring':0, 'placenames':2} 
    760828        self.addinator(cells, cmap) 
    761829         
     
    846914            # test to see if the nameish follows a "diamond" symbol (i.e., it is a minor alternative name) 
    847915            try: 
    848                 diamond_expr = u".+\u00A7.*%s([\s\/].*|$)" % CLEAN_NAMEISH_REGEX.sub('', nameish.strip()) 
     916                #diamond_expr = u".+\u00A7.*%s([\s\/].*|$)" % CLEAN_NAMEISH_REGEX.sub('', nameish.strip()) 
     917                diamond_expr = u".+\u00A7\s+%s" % CLEAN_NAMEISH_REGEX.sub('', nameish.strip()) 
     918                logging.debug("diamond expression is: '%s'" % diamond_expr) 
    849919                diamond_regex = re.compile(diamond_expr) 
    850920            except sre_constants.error, detail: 
     
    852922                raise 
    853923            diamond_m = diamond_regex.search(celltext) 
     924            if diamond_m: 
     925                logging.debug("nameish (%s) follows a diamond: %s" % (nameish.strip(), diamond_m.group())) 
    854926                 
    855927            # test for italic tag (and deal with MSWord formatting goofiness along the way) 
     928            thexpath = 'descendant-or-self::*[contains(normalize-space(text()),"%s")]' % nameish.strip() 
    856929            try: 
    857                 nameishmatches = namecell.xpath('descendant-or-self::*[contains(normalize-space(text()),"%s")]' % nameish.strip()) 
     930                nameishmatches = namecell.xpath(thexpath) 
    858931            except etree.XPathSyntaxError, detail: 
    859932                logging.critical ("incorporating nameish (%s) into an xpath expression caused a fatal error: %s" % (nameish.strip(), detail)) 
    860933                raise 
    861                  
     934             
     935            if len(nameishmatches) == 0: 
     936                logging.debug("there were %s nameishmatches for nameish = '%s'" % (len(nameishmatches), nameish.strip())) 
     937                logging.debug("xpath was '%s'" % thexpath) 
     938             
    862939            try: 
    863940                deepestnameishmatch = nameishmatches[-1] 
     
    866943                m_postfix_numeral = POSTFIX_NUMERAL_REGEX.match(nameish.strip()) 
    867944                m_postfix_parenthetical = POSTFIX_PARENTHETICAL_REGEX.match(nameish.strip()) 
    868                 newcellxml = SERIAL_ITALICS_REGEX.sub('', cellxml) 
     945                newcellxml = cellxml 
     946                newcellxml = EMPTY_ITALICS_REGEX.sub('', newcellxml) 
     947                newcellxml = SERIAL_ITALICS_REGEX.sub('', newcellxml) 
    869948                newcellxml = SERIAL_ITALICS_SPACES_REGEX.sub(' ', newcellxml) 
    870949                newcellxml = SERIAL_ITALICS_COMMA_REGEX.sub(', ', newcellxml) 
  • BADataMunger/trunk/wordnormalizer.py

    r1302 r1427  
    3636    ('&#160;',' '),     # non-breaking space 
    3737    ('&nbsp;',' '),     # non-breaking space bis 
    38     ('&#2026;','...')   # horizontal ellipsis 
     38    ('&#2026;','...') ,  # horizontal ellipsis 
     39    ('\xc2\xa0;', ' ') # non-breaking space in utf-8 
    3940] 
    4041 
  • BADataMunger/trunk/wordstripper.xsl

    r1304 r1427  
    4949 
    5050    <!-- suppress all style and br elements --> 
    51     <xsl:template match="*[local-name()='style' or local-name()='br']"/> 
     51    <xsl:template match="*[local-name()='style' or local-name()='br' or local-name()='sup']"/> 
     52     
     53    <xsl:template match="*[local-name()='i' and normalize-space(.)='&#803;']">&#803;</xsl:template> 
    5254     
    5355    <!-- strip span tags, but pass through their sub nodes, note that line breaks may matter :| --> 
     
    5860        </xsl:choose> 
    5961    </xsl:template> 
     62     
     63    <xsl:template match="*[local-name()='i' and normalize-space(text()) = ' ']"/> 
    6064     
    6165    <xsl:template match="*"><xsl:call-template name="elepassthrough"/></xsl:template>