Changeset 1330
- Timestamp: 07/11/08 15:48:17 (3 months ago)
- Files:
  - BADataMunger/trunk/tableparser.py (modified) (11 diffs)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
BADataMunger/trunk/tableparser.py
r1179 r1330 2 2 import operator 3 3 import re 4 import sre_constants 4 5 5 6 import lxml.etree as etree … … 53 54 places = [] 54 55 logging.info("BEGIN parsing a %s table: tableparser.parsetable()" % tabletype) 55 for ri, row in enumerate(table[1].xpath("descendant::*[local-name()='tr']")): 56 rows = table[1].xpath("descendant::*[local-name()='tr']") 57 rows_outref = 0 58 rows_inref = 0 59 rows_header = 0 60 rows_blank = 0 61 rows_xref = 0 62 for ri, row in enumerate(rows): 56 63 rowtext = normalizetext(getalltext(row)) 57 64 if len(row.xpath("ancestor::*[local-name()='thead']")) > 0 or ri == 0: 58 65 # ignore stuff in the table header 59 pass66 rows_header+=1 60 67 elif rowtext.find('See Map') != -1: 61 68 # ignore external cross references 62 pass63 elif rowtext.find('See Roads') != -1:69 rows_outref+=1 70 elif rowtext.find('See ') != -1: 64 71 # ignore cross references to other tables in this directory 65 pass 66 elif rowtext.find('See Earthworks') != -1: 67 pass 68 elif rowtext.find('See Walls') != -1: 69 pass 72 rows_inref+=1 70 73 elif rowtext == u'' or rowtext == u' ': 71 74 # ignore empty rows 72 pass75 rows_blank+=1 73 76 elif rowtext.find(u' = ') != -1: 74 77 # ignore internal cross references 75 pass76 elif tabletype == 'earthworks':77 print "%s: %s" % (tabletype, rowtext.encode('ascii', 'backslashreplace'))78 rows_xref+=1 79 # elif tabletype == 'earthworks': 80 # logging.warning("no handling for %s\n>>>> table header: %s\n>>>> xml this row:\n%s" % (tabletype, getalltext(table[1].xpath("descendant::*[local-name()='tr']")[0]).replace('\n',' ').strip(), etree.tostring(row).encode('ascii', 'backslashreplace'))) 78 81 else: 79 82 p = Place() … … 97 100 98 101 places = places + [p] 99 logging.info("DONE with tableparser.parsetable(); found %s places of type %s" % (len(places), tabletype)) 102 rows_act = len(rows) - rows_outref - rows_inref - rows_header - rows_blank - rows_xref 103 lmsg = "DONE with tableparser.parsetable(); created %s places of type %s from 
a table containing %s actionable rows" % (len(places), tabletype, rows_act) 104 if len(places) != rows_act: 105 logging.warning(lmsg) 106 else: 107 logging.info(lmsg) 100 108 101 109 … … 159 167 160 168 def __str__(self): 161 result = '----------\n%s\n' % self. type169 result = '----------\n%s\n' % self.dirtype 162 170 if len(self.grid) > 0: 163 171 result += 'grid: %s\n' % self.grid … … 286 294 287 295 def add_earthworks(self, cells): 288 # grid | locdesc | periods | references296 # grid | name/locdesc | periods | references 289 297 self.dirtype = 'earthworks' 290 298 self.grid = normalizetext(getalltext(cells[0])) 291 self.locdesc = normalizetext(getalltext(cells[1])) 299 # parse_placenames tests for content inside italic or not; if inside italic, assumed to be locdesc, and added accordingly; otherwise 300 # assumed to be placename, also added accordingly 301 self.placenames = self.parse_placenames(cells[1], cells[2], cells[3]) 302 # self.locdesc = normalizetext(getalltext(cells[1])) 292 303 self.periods = self.parse_periods(cells[2]) 293 304 self.references = self.parse_references(cells[3]) … … 317 328 self.namestring = normalizetext(getalltext(cells[1])) 318 329 self.references = self.parse_references(cells[4]) 319 320 330 321 331 def add_false(self, cells): … … 409 419 nameishes = [celltext,] 410 420 for i, nameish in enumerate(nameishes): 411 if len(namecell.xpath("descendant::*[text()='%s'][last()]/ancestor-or-self::*[local-name()='i']" % nameish)) != 0: 421 # test to see if the nameish follows a "diamond" symbol (i.e., it is a minor alternative name) 422 try: 423 diamond_regex = re.compile(u".+\u00A7.*%s" % nameish.strip().replace('*', '')) 424 except sre_constants.error, detail: 425 logging.critical("error attempting to create a diamond_regex for nameish = '%s'; \n\terror detail: %s" % (nameish.strip(), detail)) 426 raise 427 diamond_m = diamond_regex.match(celltext) 428 429 # test for italic tag 430 nameishmatches = 
namecell.xpath("descendant-or-self::*[contains(normalize-space(text()),'%s')]" % nameish.strip()) 431 deepestnameishmatch = nameishmatches[-1] 432 nameishitalics = deepestnameishmatch.xpath("ancestor-or-self::*[local-name()='i']") 433 #if 'Brazda' in nameish: 434 #logging.debug("nameish = '%s'\n>>>> cellxml:\n%s\n" % (nameish, etree.tostring(namecell))) 435 #logging.debug("%s nameish matches" % len(nameishmatches)) 436 #for nm in nameishmatches: 437 # logging.debug("nameish match xml:\n%s\n" % etree.tostring(nm)) 438 #logging.debug("deepestnameish match xml:\n%s\n" % deepestnameishmatch) 439 #logging.debug("%s nameishitalics" % len(nameishitalics)) 440 441 if len(nameishitalics) != 0: 412 442 # this is actually the modern location, not the placename 443 #logging.debug("interpreting italicized content in names column as location description for %s" % nameish.strip()) 413 444 if len(self.locdesc) == 0: 414 self.locdesc = nameish 445 self.locdesc = nameish.strip() 415 446 else: 416 self.locdesc = u"%s, %s" % (nameish , self.locdesc)447 self.locdesc = u"%s, %s" % (nameish.strip(), self.locdesc) 417 448 elif nameish.find(u'(') != -1: 418 variants = parse_name_variants(nameish) 449 # this nameish contains parentheses, which means that we need to generate and store multiple variants 450 variants = parse_name_variants(nameish.strip()) 419 451 for variant in variants: 420 pn = self.munge_placename(namecell, variant, i )452 pn = self.munge_placename(namecell, variant, i, variant=True) 421 453 placenames.append(pn) 454 elif diamond_m: 455 # this nameish is a minor variant 456 pn = self.munge_placename(namecell, nameish.strip(), i, minor=True) 457 placenames.append(pn) 422 458 else: 423 pn = self.munge_placename(namecell, nameish, i) 459 # nothing special about this nameish; just record it as a placename 460 pn = self.munge_placename(namecell, nameish.strip(), i) 424 461 placenames.append(pn) 425 462 … … 456 493 457 494 458 def munge_placename(self, namecell, nametext, 
sequence ):495 def munge_placename(self, namecell, nametext, sequence,variant=False, minor=False): 459 496 ntext = nametext 460 497 … … 499 536 #print "'%s' is inferred (will become '%s')" % (ntext.encode('ascii', 'backslashreplace'), newntext.encode('ascii', 'backslashreplace')) 500 537 ntext = newntext 501 538 539 # handle river name nonsense 540 if ntext[-3:] == u'fl.': 541 if 'river' not in self.types: 542 self.types.append('river') 543 ntext = ntext[:-3].strip() 544 545 # handle island name nonsense 546 if ntext[-4:] == u'Ins.': 547 if 'island' not in self.types: 548 self.types.append('island') 549 ntext = ntext[:-4].strip() 550 if ntext[-5:] == u'Inss.': 551 if 'island-groups' not in self.types: 552 self.types.append('island-group') 553 ntext = ntext[:-5].strip() 502 554 503 555 # objectivy the placename 504 pn = Placename(ntext, normalizetext(getalltext(namecell)), sequence, completeness, certainty, accuracy, inferred )556 pn = Placename(ntext, normalizetext(getalltext(namecell)), sequence, completeness, certainty, accuracy, inferred, variant, minor) 505 557 return pn 506 558 … … 528 580 529 581 class Placename: 530 def __init__(self, nstring=u'', onstring=u'', sequence=0, completeness='complete', certainty='certain', accuracy='accurate', inferred=False ):582 def __init__(self, nstring=u'', onstring=u'', sequence=0, completeness='complete', certainty='certain', accuracy='accurate', inferred=False,variant=False,minor=False): 531 583 self.name=nstring 532 584 self.originalNameString = onstring … … 538 590 self.periods=[] 539 591 self.references=[] 540 541 592 self.variant=variant 593 self.minorAlternative=minor 542 594 543 595 def __str__(self):
