Changeset 845
- Timestamp:
- 06/18/07 15:11:41 (2 years ago)
- Files:
-
- BADataMunger/trunk/biblioextractor.py (modified) (4 diffs)
- BADataMunger/trunk/gismixer.py (modified) (5 diffs)
- BADataMunger/trunk/modsmixer.py (modified) (13 diffs)
- BADataMunger/trunk/pipeline.py (modified) (8 diffs)
- BADataMunger/trunk/placesaver.py (modified) (3 diffs)
- BADataMunger/trunk/tableparser.py (modified) (4 diffs)
- BADataMunger/trunk/wordhtml2xml.py (modified) (3 diffs)
- BADataMunger/trunk/wordstripper.py (modified) (2 diffs)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
BADataMunger/trunk/biblioextractor.py
r792 r845 1 1 import re 2 import logging 2 3 3 4 import lxml.etree as etree … … 10 11 """Extract bibliography from cleaned up wordxml.""" 11 12 13 logging.info("BEGIN extracting bibliography from cleaned-up word html") 14 12 15 # determine which div contains the bibliography 13 16 biblist = [] … … 19 22 if text == 'Bibliography': 20 23 bibdiv = divs[i+1] # yes, really, the next div 24 logging.info("bibliography div is div %s" % (i+1)) 21 25 elif text.find('Abbreviation') > 0: 22 26 dirlistdiv = divs[i] 27 logging.info("abbreviation div is div %s" % i) 23 28 24 29 25 30 # parse the bibliography div 26 31 paras = bibdiv.xpath("*[local-name()='p']") 27 print len(paras) 28 29 32 logging.info ("the bibliography div contains %s paragraphs" % len(paras)) 30 33 for p in paras: 31 34 text = p.text.replace(u'\n', u' ').strip() … … 48 51 table = dirlistdiv.xpath("descendant::*[local-name()='p' and contains(., 'Abbreviation')]")[0] 49 52 rows = table.xpath("following-sibling::*[local-name()='table'][1]/*[local-name()='tr']") 53 logging.info("the abbreviations table has %s rows" % len(rows)) 50 54 for row in rows: 51 55 shorttitle = getalltext(row.xpath("*[local-name()='td']")[0]).replace('\n', ' ').strip() 52 56 citation = row.xpath("*[local-name()='td'][2]/*[local-name()='p']")[0] 53 print etree.tostring(citation)57 #print etree.tostring(citation) 54 58 atitle = extract_title(citation) 55 59 biblist.append((shorttitle, atitle, citation)) 56 60 61 logging.info("DONE extracting bibliography from cleaned-up word html") 62 57 63 return biblist 58 64 BADataMunger/trunk/gismixer.py
r840 r845 1 import logging 2 1 3 import lxml.etree as etree 2 4 … … 34 36 self.fieldvalues.append([normalizetext(getalltext(e_value)) for e_value in e_values]) 35 37 36 print "\ngisMixer got %s records from %s" % (len(self.fieldvalues), source)37 print "field names are:"38 logging.info ("gisMixer got %s records from %s" % (len(self.fieldvalues), source)) 39 logging.debug("FIELD NAMES ARE:") 38 40 for name in self.fieldnames: 39 print "%s" % name40 print "first record content:"41 logging.debug("%s" % name ) 42 logging.debug("FIRST RECORD CONTENT:") 41 43 for item in self.fieldvalues[0]: 42 print "%s" % item44 logging.debug("%s" % item) 43 45 44 46 def mixin(self, config, place): … … 77 79 78 80 else: 79 print "mixin failed to match %s: '%s' (%s - %s), rowi=%s" % (place.type, label.encode('ascii', 'backslashreplace'), place.locdesc.encode('ascii', 'backslashreplace'), place.itinraw.encode('ascii', 'backslashreplace'), place.rowi)80 81 try: 81 print "\tdisambiguator=%s" % disambiguator82 logging.warning( "failed to match %s: '%s' (%s - %s), rowi=%s, disambiguator: %s" % (place.type, label.encode('ascii', 'backslashreplace'), place.locdesc.encode('ascii', 'backslashreplace'), place.itinraw.encode('ascii', 'backslashreplace'), place.rowi, disambiguator)) 82 83 except: 83 pass84 logging.warning( "failed to match %s: '%s' (%s - %s), rowi=%s" % (place.type, label.encode('ascii', 'backslashreplace'), place.locdesc.encode('ascii', 'backslashreplace'), place.itinraw.encode('ascii', 'backslashreplace'), place.rowi)) 84 85 85 86 … … 91 92 gridmatches = [value for value in self.fieldvalues if value[self.gridi] == gridsquare] 92 93 except ValueError: 93 print "gisMixer could not find any data for gridsquare = %s (seeking %s)" % (gridsquare, label)94 logging.warning( "gisMixer could not find any data for gridsquare = %s (seeking %s)" % (gridsquare, label)) 94 95 if len(gridmatches)>0: 95 96 try: 96 97 labelmatches = [value for value in gridmatches if value[self.labeli] == label] 97 98 except ValueError: 98 print "gisMixer could not find any data for label = '%s' in grid '%s'" % (label, gridsquare)99 logging.warning("gisMixer could not find any data for label = '%s' in grid '%s'" % (label, gridsquare)) 99 100 if len(labelmatches) > 1 and disambiguator != None: 100 101 try: 101 102 matches = [value for value in labelmatches if value[self.disambiguatori] == disambiguator] 102 103 except ValueError: 103 print "gisMaker could not find any data for label = '%s' and disambiguator = '%s' in grid '%s'" % (label, gridsquare, disambiguator)104 logging.warning("gisMaker could not find any data for label = '%s' and disambiguator = '%s' in grid '%s'" % (label, gridsquare, disambiguator)) 104 105 else: 105 106 matches = labelmatches … … 108 109 elif len(matches) > 1: 109 110 if disambiguator != None: 110 print "gisMaker found %s apparent matches for label = '%s' and disambiguator = '%s' in grid '%s'" % (label, gridsquare, disambiguator)111 logging.warning("gisMaker found %s apparent matches for label = '%s' and disambiguator = '%s' in grid '%s'" % (label, gridsquare, disambiguator)) 111 112 else: 112 print "gisMaker found %s apparent matches for label = '%s' in grid '%s'" % (len(matches), label, gridsquare)113 logging.warning("gisMaker found %s apparent matches for label = '%s' in grid '%s'" % (len(matches), label, gridsquare)) 113 114 return None 114 115 else: BADataMunger/trunk/modsmixer.py
r824 r845 1 1 import re 2 import logging 2 3 3 4 import lxml.etree as etree … … 29 30 def __init__(self, student, library, destination): 30 31 32 33 logging.info("INITIALIZING: %s" % self.__class__) 34 35 # read basic mods information from the "student" file: this is the stuff we want to enhance 31 36 f = open(student) 32 37 content = f.read() … … 34 39 f.close() 35 40 self.student = etree.XML(content) 41 logging.info("read mods information to enhance (aka 'student file') from: %s" % student) 42 logging.info("there are %s mods records in the 'student file'" % len(self.student.xpath(u"//mods:mods", NSD))) 43 44 # read a library of mods information from the "library" file: this is where we will pull the enhancements from 36 45 f = open(library) 37 46 content = f.read() … … 39 48 content = content.replace('<modsCollection>', '<modsCollection xmlns="http://www.loc.gov/mods/v3" xmlns:xlink="http://www.w3.org/1999/xlink">') 40 49 self.library = etree.XML(content) 50 logging.info("read mods enhancement information (aka 'library file') from: %s" % library) 51 logging.info("there are %s mods records in the 'library file'" % len(self.library.xpath(u"//mods:mods", NSD))) 41 52 42 53 snodes = self.student.xpath(u"//mods:mods", NSD) … … 48 59 s_title = u' '.join(s_title_node.text.replace('\n', ' ').strip().split()) 49 60 except: 50 print 'Something is wrong with the student title node for short_title = %s' % s_short_title61 logging.warning('Something is wrong with the student title node for short_title = %s' % s_short_title) 51 62 52 63 lquery = u"//mods:mods/mods:titleInfo[@type='abbreviated']/mods:title[normalize-space(.) = '%s']/ancestor::mods:mods" % s_short_title … … 58 69 if len(lnodes) == 1: 59 70 if len(lnodes[0].xpath(u"mods:titleInfo[@type='abbreviated']/mods:title[normalize-space(.) = '%s']/ancestor::mods:mods" % s_short_title, NSD)) != 1: 60 print "found match in library on full title '%s', but short title did not match '%s'" % (s_title, s_short_title)71 logging.warning("found match in library on full title '%s', but short title did not match '%s'" % (s_title, s_short_title)) 61 72 lnodes = self.library.xpath(lquery, NSD) 62 73 elif len(lnodes) > 1: 63 print "found multiple matches on long title in library for '%s = %s" % (s_short_title.encode('latin', 'xmlcharrefreplace'), s_title.encode('latin', 'xmlcharrefreplace'))64 else: 65 print "found multiple matches in on short title library for '%s = %s" % (s_short_title.encode('latin', 'xmlcharrefreplace'), s_title.encode('latin', 'xmlcharrefreplace'))74 logging.warning("found multiple matches on long title in library for '%s = %s" % (s_short_title.encode('latin', 'xmlcharrefreplace'), s_title.encode('latin', 'xmlcharrefreplace'))) 75 else: 76 logging.warning("found multiple matches in on short title library for '%s = %s" % (s_short_title.encode('latin', 'xmlcharrefreplace'), s_title.encode('latin', 'xmlcharrefreplace'))) 66 77 if len(lnodes) == 0: 67 78 squery = u"*[local-name()='abstract']/descendant::*[local-name()='i' and contains(normalize-space(.),'%s')]/ancestor::mods:mods" % s_title 68 79 if len(snode.xpath(squery)) == 0: 69 print " notice: unmatched article '%s'" % s_short_title.encode('latin', 'xmlcharrefreplace')70 else: 71 print "WARNING: unmatched book '%s = %s" % (s_short_title.encode('latin', 'xmlcharrefreplace'), s_title.encode('latin', 'xmlcharrefreplace'))80 logging.warning("unmatched article '%s'" % s_short_title.encode('latin', 'xmlcharrefreplace')) 81 else: 82 logging.warning("unmatched book '%s = %s" % (s_short_title.encode('latin', 'xmlcharrefreplace'), s_title.encode('latin', 'xmlcharrefreplace'))) 72 83 if len(lnodes)==1: 73 84 # one match, so gather up all the data we will need … … 78 89 l_title = u' '.join(l_title_node.text.replace('\n', ' ').strip().split()) 79 90 except: 80 print "no title node found in library match for %s = %s" % (s_short_title.encode('latin', 'backslashreplace'), s_title.encode('latin', 'backslashreplace'))91 logging.warning("no title node found in library match for %s = %s" % (s_short_title.encode('latin', 'backslashreplace'), s_title.encode('latin', 'backslashreplace'))) 81 92 try: 82 93 l_title_lang = l_title_node.xpath(u"../@*[local-name()='lang']")[0] … … 90 101 snode.attrib['ID'] = lnode.attrib['ID'] 91 102 else: 92 print "no ID!!!!"103 logging.warning("no ID attribute on mods element %s of %s" % (len(lnode.xpath("previous::*[local-name()='mods']")), len(lnode.xpath("//*[local-name()='mods']")))) 93 104 94 105 # proper title type attributes … … 147 158 s_title_node.xpath('..')[0].attrib['{http://www.w3.org/XML/1998/namespace}lang']=l_title_lang 148 159 else: 149 print "No title language found for '%s'\n" % s_short_title.encode('latin1', 'xmlcharrefreplace')160 logging.warning("No title language found for '%s'\n" % s_short_title.encode('latin1', 'xmlcharrefreplace')) 150 161 151 162 # copy over items verbatim from library record to student record … … 171 182 ricount = len(relatedItems) 172 183 richecked = 0 184 orphtot = 0 173 185 while orphans: 174 186 for relatedItem in relatedItems: … … 177 189 lnodes = self.library.xpath("//mods:mods[@ID='%s']" % ri_id, NSD) 178 190 if len(lnodes) != 1: 179 print "failure seeking %s in library file; match count = %s" % (ri_id, len(lnodes))191 logging.warning("failure seeking %s in library file; match count = %s" % (ri_id, len(lnodes))) 180 192 break 181 193 else: 182 print "adding %s" % ri_id194 orphtot += 1 183 195 self.student.xpath("//mods:modsCollection", NSD)[0].append(lnodes[0]) 184 196 richecked += 1 … … 189 201 richecked = 0 190 202 203 logging.info("added to 'student' an additional %s records from 'library' because they are related works" % orphtot) 204 191 205 pcontent = etree.tostring(self.student).encode('utf-8') 192 206 pcontent = '<?xml version="1.0" encoding="UTF-8"?>\n' + pcontent … … 194 208 f.write(pcontent) 195 209 f.close() 210 211 logging.info("saved enhanced records to %s" % destination) 212 213 logging.info("DONE: %s\n" % self.__class__) 214 BADataMunger/trunk/pipeline.py
r844 r845 1 1 from os.path import normpath, normcase, isdir, isfile, splitdrive, splitext, split, join 2 2 import os 3 import logging 3 4 4 5 import lxml.etree as etree … … 34 35 file 35 36 """ 36 37 logging.basicConfig(level=logging.DEBUG, format='%(levelname)-8s %(message)s') 38 logging.info("INITIALIZING: %s" % self.__class__) 37 39 38 40 self.data = {} 39 41 self['contextpath'] = os.getcwd() 40 print "contextpath: %s" % self['contextpath']42 logging.info("contextpath: %s" % self['contextpath']) 41 43 42 44 self['configfile'] = normpath(normcase(configfile)) … … 69 71 self['wordhtml'] = f.read() 70 72 f.close() 73 logging.info("read and stored directory file from %s" % self['dirfile']) 71 74 72 75 # read and store the config file … … 74 77 config_text = f.read() 75 78 f.close() 79 logging.info("read and stored config file from %s" % self['configfile']) 76 80 77 81 # get some essential info from the config file … … 81 85 except: 82 86 self.map_number = 'XYZ' 87 logging.info("map number from config file: %s" % self.map_number) 88 83 89 self.creators = [] 84 90 self.contributors = [] … … 94 100 95 101 self.mixer = gismixer.gisMixer(self['gisfile']) 96 102 103 logging.info("DONE INITIALIZING: %s\n" % self.__class__) 104 97 105 98 106 def __getitem__(self, key): return self.data[key] … … 103 111 """Cycle through all steps in the pipeline""" 104 112 113 logging.info("CYCLING: %s" % self.__class__) 114 105 115 self['wordxml'] = wordhtml2xml.convert(self['wordhtml']) 106 116 self['cleanxml'] = wordstripper.strip(self['contextpath'], self['wordxml']) … … 109 119 self['dirtables'] = tablegroker.grok(self['cleanxml']) 110 120 self['places'] = tableparser.parse(self, self['dirtables'], self.map_number) 121 logging.info("BEGIN attempting to match and combine %s directory places with %s map places" % (len(self['places']),len(self.mixer.fieldvalues))) 111 122 for place in self['places']: 112 123 if place.type == 'unlocated' or place.type == 'false': 113 self.matched = True124 place.matched = True 114 125 else: 115 126 self.mixer.mixin(self['config'], place) 116 127 logging.info("DONE mixing directory and map places") 128 logging.warning("places in GIS data that have no corresponding directory entry are ignored!") 117 129 #placesaver.save_places_tei(self) 118 130 placesaver.save_places_frank(self) 119 131 132 logging.info("DONE CYCLING: %s\n" % self.__class__) 133 134 135 120 136 def save(self, itemkey, encoding='utf-8'): 121 137 """Basic save-with-encoding function for writing content to an arbitrary BADataMunger/trunk/placesaver.py
r843 r845 1 import logging 1 2 import re 2 3 from os.path import join … … 33 34 """Save all the places using the Pleiades frankenformat.""" 34 35 36 35 37 # iterate through the places list, creating a corresponding frankenfile for each place 36 38 places = self['places'] 37 for i, place in enumerate([place for place in places if place.matched]): 39 matchedplaces = [place for place in places if place.matched] 40 logging.info("BEGIN: attempting to save (in frankenformat) all %s matched places out of %s total places" % (len(matchedplaces), len(places))) 41 logging.warning("Nothing is done about places from the GIS/map data that don't have corresponding directory entries!") 42 for i, place in enumerate(matchedplaces): 38 43 39 44 … … 232 237 g.write(pcontent) 233 238 g.close() 239 240 logging.info("DONE saving in frankenformat") 234 241 235 242 def save_places_tei(self): BADataMunger/trunk/tableparser.py
r842 r845 1 import logging 1 2 import operator 2 3 import re … … 33 34 Return the info in a list.""" 34 35 36 logging.info("BEGIN attempt to parse the tables from the directory: tableparser.parse()") 35 37 places = [] 36 38 for key in tabledict.keys(): … … 42 44 places += parsetable(context, tabletype, tabledict[key], map_number) 43 45 46 logging.info("DONE with tableparser.parse()") 44 47 return places 45 48 46 49 def parsetable(context, tabletype, table, map_number): 47 50 places = [] 51 logging.info("BEGIN parsing a %s table: tableparser.parsetable()" % tabletype) 48 52 for ri, row in enumerate(table[1].xpath("descendant::*[local-name()='tr']")): 49 53 rowtext = normalizetext(getalltext(row)) … … 85 89 86 90 places = places + [p] 87 91 logging.info("DONE with tableparser.parsetable(); found %s places of type %s" % (len(places), tabletype)) 88 92 return places 89 93 BADataMunger/trunk/wordhtml2xml.py
r784 r845 1 import logging 2 1 3 from BeautifulSoup import BeautifulSoup 2 4 import lxml.etree as etree … … 8 10 def convert(source): 9 11 """Make html exported from Microsoft Word both well-formed and valid.""" 12 13 logging.info("BEGIN attempt to make MSWord-exported HTML both well-formed and valid: wordhtml2xml.convert()") 10 14 soup = BeautifulSoup(source) 11 15 html = soup.findAll('html')[0] … … 15 19 tree = etree.XML(XMLDOCTYPE + unicode(soup)) 16 20 tree = wordnormalizer.normalize(tree) 21 logging.info("DONE with wordhtml2xml.convert()") 17 22 return tree 18 23 BADataMunger/trunk/wordstripper.py
r784 r845 1 import logging 1 2 from os.path import join 2 3 … … 7 8 def strip(contextpath, source): 8 9 """Strip unneeded formatting inherited from MSWord.""" 10 logging.info("BEGIN attempt to strip unneeded formatting inherited from MSWord using %s: wordstripper.strip()" % XSLTFILE) 9 11 xslt_doc = etree.parse(join(contextpath, XSLTFILE)) 10 12 transform = etree.XSLT(xslt_doc) 11 13 result = etree.XML(unicode(transform(source))) 14 logging.info("DONE with wordstripper.strip()") 12 15 return result 13 16
