Changeset 1337
- Timestamp:
- 07/18/08 17:08:16 (3 months ago)
- Files:
-
- BADataMunger/trunk/tableparser.py (modified) (9 diffs)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
BADataMunger/trunk/tableparser.py
r1330 r1337 10 10 from etreehelps import getalltext 11 11 from texthelps import normalizetext 12 13 CMAP_TMPL_PN_4 = {'grid':0, 'placenames':1, 'periods':2, 'references':3} 14 CMAP_TMPL_UN_4 = { 'grid' : 0, 'locdesc' : 1, 'periods':2, 'references':3} 15 CMAP_TMPL_MAT_DATELESS = {'grid':0,'placenames':1,'material':2,'references':3} 16 CMAP_TMPL_NAMED_5 = {'grid':0,'placenames':1,'namestring':1, 'periods':2,'locdesc':3,'references':4} 17 18 GROUP_COUNT_REGEX = re.compile(r'\s+\((.)\)$') 12 19 13 20 tabletypes = { … … 19 26 "False Toponyms":"false", 20 27 "False Toponym":"false", 28 "Levee":"levee", 29 "Levees":"levee", 21 30 "Name":"name", 22 31 "Names":"name", … … 28 37 "Unlocated Toponyms":"unlocated", 29 38 "Unlocated Toponym":"unlocated", 39 "Villas":"villa", 40 "Villa":"villa", 30 41 "Walls":"wall", 31 42 "Wall":"wall" … … 156 167 self.placenames=[] 157 168 self.namestring = u'' 169 self.orignamestring = u'' 158 170 self.number='' 159 171 self.periods=[] … … 163 175 self.dirtype='' 164 176 self.types=[] 177 self.featurecount = 1 178 self.material = '' 165 179 self.matched = False 166 180 … … 276 290 print "Untrapped tag = '%s'" % d.tag 277 291 278 292 def addinator(self, cells, cellmap): 293 # assumes that self.dirtype has already been set 294 295 for k,v in cellmap.items(): 296 if k in ['comment', 'grid', 'material', 'namestring']: 297 setattr(self, k, normalizetext(getalltext(cells[v]))) 298 299 if k == 'namestring': 300 self.orignamestring = normalizetext(getalltext(cells[v])) 301 if k == 'periods': 302 self.periods = self.parse_periods(cells[v]) 303 if k == 'references': 304 self.references = self.parse_references(cells[v]) 305 if k == 'placenames': 306 try: 307 self.placenames = self.parse_placenames(cells[v], cells[cellmap['periods']], cells[cellmap['references']]) 308 except KeyError, detail: 309 logging.critical("In attempting to parse placenames, there was an error dereferencing an expected key value in the cellmap dictionary; error detail: %s" % detail) 310 print "cellmap:" 311 print cellmap 312 if k == 'number': 313 self.number = normalizetext(getalltext(cells[v])) 314 if k == 'itinerary': 315 self.itinraw = normalizetext(getalltext(cells[v])) 316 self.itinerary = self.parse_itinerary(cells[v]) 317 if k == 'locdesc': 318 q = normalizetext(getalltext(cells[v])) 319 if len(q) > 0: 320 if len(self.locdesc) > 0: 321 self.locdesc += ", %s" % q 322 else: 323 self.locdesc = q 324 325 q = u'' 326 if len(self.namestring) > 0: 327 q = self.namestring 328 elif len(self.locdesc) > 0: 329 q = self.locdesc 330 if len(q) > 0: 331 m = GROUP_COUNT_REGEX.search(q) 332 if m: 333 self.featurecount = int(m.groups()[0]) 334 if q == self.namestring: 335 self.namestring = GROUP_COUNT_REGEX.sub('', q).strip() 336 else: 337 self.locdesc = GROUP_COUNT_REGEX.sub('', q).strip() 338 self.dirtype += '-group' 339 279 340 def add_aqueduct(self, cells): 280 # grid | locdesc | periods | references281 341 self.dirtype = 'aqueduct' 282 self.grid = normalizetext(getalltext(cells[0])) 283 self.locdesc = normalizetext(getalltext(cells[1])) 284 self.periods = self.parse_periods(cells[2]) 285 self.references = self.parse_references(cells[3]) 342 cmap = CMAP_TMPL_UN_4 343 self.addinator(cells, cmap) 286 344 287 345 def add_bridge(self, cells): 288 # grid | locdesc | periods | references289 346 self.dirtype = 'bridge' 290 self.grid = normalizetext(getalltext(cells[0])) 291 self.locdesc = normalizetext(getalltext(cells[1])) 292 self.periods = self.parse_periods(cells[2]) 293 self.references = self.parse_references(cells[3]) 347 cmap = CMAP_TMPL_UN_4 348 self.addinator(cells, cmap) 294 349 295 350 def add_earthworks(self, cells): 296 # grid | name/locdesc | periods | references297 351 self.dirtype = 'earthworks' 298 self.grid = normalizetext(getalltext(cells[0]))299 # parse_placenames tests for content inside italic or not; if inside italic, assumed to be locdesc, and added accordingly; otherwise300 # assumed to be placename, also added accordingly 301 self.placenames = self.parse_placenames(cells[1], cells[2], cells[3])302 # self.locdesc = normalizetext(getalltext(cells[1]))303 self.periods = self.parse_periods(cells[2])304 self. references = self.parse_references(cells[3])305 352 cmap = CMAP_TMPL_PN_4 353 self.addinator(cells, cmap) 354 355 def add_levee(self, cells): 356 self.dirtype = 'levee' 357 cmap = CMAP_TMPL_UN_4 358 self.addinator(cells, cmap) 359 306 360 def add_mine(self, cells): 307 # grid | locdesc | material | references308 361 self.dirtype = 'mine' 309 self.grid = normalizetext(getalltext(cells[0])) 310 self.locdesc = normalizetext(getalltext(cells[1])) + "; material(s) extracted: %s" % normalizetext(getalltext(cells[2])) 311 self.references = self.parse_references(cells[3]) 362 cmap = CMAP_TMPL_MAT_DATELESS 363 self.addinator(cells,cmap) 312 364 313 365 def add_quarry(self, cells): 314 # grid | locdesc | material | references315 366 self.dirtype = 'quarry' 316 self.grid = normalizetext(getalltext(cells[0])) 317 self.locdesc = normalizetext(getalltext(cells[1])) + "; material(s) extracted: %s" % normalizetext(getalltext(cells[2])) 318 self.references = self.parse_references(cells[3]) 319 367 cmap = {'grid':0, 'locdesc':1, 'periods':2, 'material':3, 'references':4} 368 self.addinator(cells,cmap) 320 369 321 370 def add_name(self, cells): 322 # grid | placename(s) | periods | locdesc | references323 371 self.dirtype = 'name' 324 self.grid = normalizetext(getalltext(cells[0])) 325 self.periods = self.parse_periods(cells[2]) 326 self.locdesc = normalizetext(getalltext(cells[3])) 327 self.placenames = self.parse_placenames(cells[1], cells[2], cells[4]) 328 self.namestring = normalizetext(getalltext(cells[1])) 329 self.references = self.parse_references(cells[4]) 372 cmap = CMAP_TMPL_NAMED_5 373 self.addinator(cells,cmap) 330 374 331 375 def add_false(self, cells): 332 # name | references | comment333 376 self.dirtype = 'false' 377 cmap = {'namestring':0, 'references':1, 'comment':2} 378 self.addinator(cells, cmap) 334 379 self.placenames = self.parse_placenames(cells[0], None, cells[1]) 335 self.namestring = normalizetext(getalltext(cells[0]))336 self.references = self.parse_references(cells[1])337 self.comment = normalizetext(getalltext(cells[2]))338 380 339 381 def add_numbered(self, cells): 340 # number | grid | location | period | reference341 382 self.dirtype = 'numbered' 342 self.number = normalizetext(getalltext(cells[0])) 343 self.namestring = normalizetext(getalltext(cells[0])) 344 self.grid = normalizetext(getalltext(cells[1])) 345 self.locdesc = normalizetext(getalltext(cells[2])) 346 self.periods = self.parse_periods(cells[3]) 347 self.references = self.parse_references(cells[4]) 348 383 cmap = {'number':0, 'namestring':0, 'grid':1, 'locdesc':2, 'periods':3, 'references':4} 384 self.addinator(cells, cmap) 349 385 350 386 def add_road(self, cells): 351 # itinerary | period | reference352 387 self.dirtype = 'road' 353 self.itinraw = normalizetext(getalltext(cells[0])) 354 self.itinerary = self.parse_itinerary(cells[0]) 355 self.periods = self.parse_periods(cells[1]) 356 self.references = self.parse_references(cells[2]) 357 388 cmap = {'itinerary':0, 'periods':1, 'references':2} 389 self.addinator(cells,cmap) 358 390 359 391 def add_unlocated(self, cells): 360 # name | period | probable location | reference361 392 self.dirtype = 'unlocated' 362 self.periods = self.parse_periods(cells[1]) 363 self.locdesc = normalizetext(getalltext(cells[2])) 364 self.placenames = self.parse_placenames(cells[0], cells[1], cells[3]) 365 self.namestring = normalizetext(getalltext(cells[0])) 366 self.references = self.parse_references(cells[3]) 367 368 393 cmap = {'placenames':0, 'namestring':0, 'periods':1, 'locdesc':2, 'references':3} 394 self.addinator(cells, cmap) 395 396 def add_villa(self, cells): 397 self.dirtype = 'villa' 398 cmap = CMAP_TMPL_UN_4 399 self.addinator(cells, cmap) 400 369 401 def add_wall(self, cells): 370 # grid | locdesc | periods | references371 402 self.dirtype = 'wall' 372 self.grid = normalizetext(getalltext(cells[0])) 373 self.locdesc = normalizetext(getalltext(cells[1])) 374 self.periods = self.parse_periods(cells[2]) 375 self.references = self.parse_references(cells[3]) 403 cmap = CMAP_TMPL_UN_4 404 self.addinator(cells, cmap) 376 405 377 406 def parse_periods(self, periodcell): … … 429 458 # test for italic tag 430 459 nameishmatches = namecell.xpath("descendant-or-self::*[contains(normalize-space(text()),'%s')]" % nameish.strip()) 431 deepestnameishmatch = nameishmatches[-1] 460 try: 461 deepestnameishmatch = nameishmatches[-1] 462 except IndexError, detail: 463 cellxml = etree.tostring(namecell) 464 #logging.warning("error attempting to use xpath to find '%s' in xml\n:%s\nattempting to trap for serial italic tags" % (nameish.strip(), cellxml)) 465 if '</i><i>' in cellxml: 466 #logging.info('found </i></i> in cell xml, attempting to remove and retry') 467 cellxml = cellxml.replace('</i><i>', '') 468 celltree = etree.XML(cellxml) 469 #logging.info("result of replacement attempt: '%s'" % etree.tostring(celltree)) 470 nameishmatches = celltree.xpath("descendant-or-self::*[contains(normalize-space(text()), '%s')]" % nameish.strip()) 471 if len(nameishmatches) > 0: 472 logging.info("adjusted for serial italic tags in a name field for %s" % nameish.strip()) 473 deepestnameishmatch = nameishmatches[-1] 474 else: 475 logging.critical ("serial italics do not seem to be the problem; error detail: %s" % detail) 476 raise 477 432 478 nameishitalics = deepestnameishmatch.xpath("ancestor-or-self::*[local-name()='i']") 433 #if 'Brazda' in nameish:434 #logging.debug("nameish = '%s'\n>>>> cellxml:\n%s\n" % (nameish, etree.tostring(namecell)))435 #logging.debug("%s nameish matches" % len(nameishmatches))436 #for nm in nameishmatches:437 # logging.debug("nameish match xml:\n%s\n" % etree.tostring(nm))438 #logging.debug("deepestnameish match xml:\n%s\n" % deepestnameishmatch)439 #logging.debug("%s nameishitalics" % len(nameishitalics))440 479 441 480 if len(nameishitalics) != 0: 442 481 # this is actually the modern location, not the placename 443 #logging.debug("interpreting italicized content in names column as location description for %s" % nameish.strip())444 482 if len(self.locdesc) == 0: 445 483 self.locdesc = nameish.strip() 446 484 else: 447 485 self.locdesc = u"%s, %s" % (nameish.strip(), self.locdesc) 448 elif nameish. find(u'(') != -1:486 elif nameish.replace(u'(...)', '').find(u'(') != -1: 449 487 # this nameish contains parentheses, which means that we need to generate and store multiple variants 450 488 variants = parse_name_variants(nameish.strip()) … … 490 528 pname.references = self.references 491 529 530 # normalize capitalization in placenames (this is mainly about dealing with generated variants beginning with parens 531 for pname in placenames: 532 if pname.variant: 533 pwords = pname.name.split() 534 qword = [] 535 revariant = u'' 536 for pword in pwords: 537 if pword[0].isupper(): 538 qword = pword.capitalize() 539 else: 540 qword = pword 541 revariant = u"%s %s" % (revariant, qword) 542 pname.name = revariant 543 492 544 return placenames 493 545 … … 517 569 accuracy = 'accurate' 518 570 pattern = u'\u2018[^\u2019]+\u2019' 519 m = re. match(pattern, ntext)571 m = re.search(pattern, ntext) 520 572 if m: 521 573 accuracy = 'inaccurate'
