Changeset 845

Show
Ignore:
Timestamp:
06/18/07 15:11:41 (2 years ago)
Author:
thomase
Message:

Fixed a logic bug that prevented unlocated places from being written to frankenfiles; added uniform, more informative info- and warning-level logging using Python's standard logging module.

Files:

Legend:

Unmodified
Added
Removed
Modified
Copied
Moved
  • BADataMunger/trunk/biblioextractor.py

    r792 r845  
    11import re 
     2import logging 
    23 
    34import lxml.etree as etree 
     
    1011    """Extract bibliography from cleaned up wordxml.""" 
    1112     
     13    logging.info("BEGIN extracting bibliography from cleaned-up word html") 
     14     
    1215    # determine which div contains the bibliography 
    1316    biblist = [] 
     
    1922        if text == 'Bibliography': 
    2023            bibdiv = divs[i+1]  # yes, really, the next div 
     24            logging.info("bibliography div is div %s" % (i+1)) 
    2125        elif text.find('Abbreviation') > 0: 
    2226            dirlistdiv = divs[i] 
     27            logging.info("abbreviation div is div %s" % i) 
    2328            
    2429         
    2530    # parse the bibliography div 
    2631    paras = bibdiv.xpath("*[local-name()='p']") 
    27     print len(paras) 
    28      
    29      
     32    logging.info ("the bibliography div contains %s paragraphs" % len(paras)) 
    3033    for p in paras: 
    3134        text = p.text.replace(u'\n', u' ').strip() 
     
    4851        table = dirlistdiv.xpath("descendant::*[local-name()='p' and contains(., 'Abbreviation')]")[0] 
    4952        rows = table.xpath("following-sibling::*[local-name()='table'][1]/*[local-name()='tr']") 
     53        logging.info("the abbreviations table has %s rows" % len(rows)) 
    5054        for row in rows: 
    5155            shorttitle = getalltext(row.xpath("*[local-name()='td']")[0]).replace('\n', ' ').strip() 
    5256            citation = row.xpath("*[local-name()='td'][2]/*[local-name()='p']")[0] 
    53             print etree.tostring(citation) 
     57            #print etree.tostring(citation) 
    5458            atitle = extract_title(citation) 
    5559            biblist.append((shorttitle, atitle, citation)) 
    5660         
     61    logging.info("DONE extracting bibliography from cleaned-up word html") 
     62     
    5763    return biblist 
    5864     
  • BADataMunger/trunk/gismixer.py

    r840 r845  
     1import logging 
     2 
    13import lxml.etree as etree 
    24 
     
    3436            self.fieldvalues.append([normalizetext(getalltext(e_value)) for e_value in e_values]) 
    3537             
    36         print "\ngisMixer got %s records from %s" % (len(self.fieldvalues), source)
    37         print "field names are:" 
     38        logging.info ("gisMixer got %s records from %s" % (len(self.fieldvalues), source))
     39        logging.debug("FIELD NAMES ARE:") 
    3840        for name in self.fieldnames: 
    39             print "%s" % name  
    40         print "first record content:" 
     41            logging.debug("%s" % name ) 
     42        logging.debug("FIRST RECORD CONTENT:") 
    4143        for item in self.fieldvalues[0]: 
    42             print "%s" % item   
     44            logging.debug("%s" % item) 
    4345             
    4446    def mixin(self, config, place): 
     
    7779             
    7880        else: 
    79             print "mixin failed to match %s: '%s' (%s - %s), rowi=%s" % (place.type, label.encode('ascii', 'backslashreplace'), place.locdesc.encode('ascii', 'backslashreplace'), place.itinraw.encode('ascii', 'backslashreplace'), place.rowi) 
    8081            try: 
    81                 print "\tdisambiguator=%s" % disambiguator 
     82                logging.warning( "failed to match %s: '%s' (%s - %s), rowi=%s, disambiguator: %s" % (place.type, label.encode('ascii', 'backslashreplace'), place.locdesc.encode('ascii', 'backslashreplace'), place.itinraw.encode('ascii', 'backslashreplace'), place.rowi, disambiguator)) 
    8283            except: 
    83                 pass 
     84                logging.warning( "failed to match %s: '%s' (%s - %s), rowi=%s" % (place.type, label.encode('ascii', 'backslashreplace'), place.locdesc.encode('ascii', 'backslashreplace'), place.itinraw.encode('ascii', 'backslashreplace'), place.rowi)) 
    8485         
    8586             
     
    9192            gridmatches = [value for value in self.fieldvalues if value[self.gridi] == gridsquare] 
    9293        except ValueError: 
    93             print "gisMixer could not find any data for gridsquare = %s (seeking %s)" % (gridsquare, label)
     94            logging.warning( "gisMixer could not find any data for gridsquare = %s (seeking %s)" % (gridsquare, label))
    9495        if len(gridmatches)>0: 
    9596            try:  
    9697                labelmatches = [value for value in gridmatches if value[self.labeli] == label] 
    9798            except ValueError: 
    98                 print "gisMixer could not find any data for label = '%s' in grid '%s'" % (label, gridsquare)
     99                logging.warning("gisMixer could not find any data for label = '%s' in grid '%s'" % (label, gridsquare))
    99100            if len(labelmatches) > 1 and disambiguator != None: 
    100101                try: 
    101102                    matches = [value for value in labelmatches if value[self.disambiguatori] == disambiguator] 
    102103                except ValueError: 
    103                     print "gisMaker could not find any data for label = '%s' and disambiguator = '%s' in grid '%s'" % (label, gridsquare, disambiguator)
     104                    logging.warning("gisMaker could not find any data for label = '%s' and disambiguator = '%s' in grid '%s'" % (label, gridsquare, disambiguator))
    104105            else: 
    105106                matches = labelmatches 
     
    108109        elif len(matches) > 1: 
    109110            if disambiguator != None: 
    110                 print "gisMaker found %s apparent matches for label = '%s' and disambiguator = '%s' in grid '%s'" % (label, gridsquare, disambiguator)
     111                logging.warning("gisMaker found %s apparent matches for label = '%s' and disambiguator = '%s' in grid '%s'" % (label, gridsquare, disambiguator))
    111112            else: 
    112                 print "gisMaker found %s apparent matches for label = '%s' in grid '%s'" % (len(matches), label, gridsquare)
     113                logging.warning("gisMaker found %s apparent matches for label = '%s' in grid '%s'" % (len(matches), label, gridsquare))
    113114            return None 
    114115        else: 
  • BADataMunger/trunk/modsmixer.py

    r824 r845  
    11import re 
     2import logging 
    23 
    34import lxml.etree as etree 
     
    2930    def __init__(self, student, library, destination): 
    3031         
     32         
     33        logging.info("INITIALIZING: %s" % self.__class__) 
     34         
     35        # read basic mods information from the "student" file: this is the stuff we want to enhance 
    3136        f = open(student) 
    3237        content = f.read() 
     
    3439        f.close() 
    3540        self.student = etree.XML(content) 
     41        logging.info("read mods information to enhance (aka 'student file') from: %s" % student) 
     42        logging.info("there are %s mods records in the 'student file'" % len(self.student.xpath(u"//mods:mods", NSD))) 
     43         
     44        # read a library of mods information from the "library" file: this is where we will pull the enhancements from 
    3645        f = open(library) 
    3746        content = f.read() 
     
    3948        content = content.replace('<modsCollection>', '<modsCollection xmlns="http://www.loc.gov/mods/v3" xmlns:xlink="http://www.w3.org/1999/xlink">') 
    4049        self.library = etree.XML(content) 
     50        logging.info("read mods enhancement information (aka 'library file') from: %s" % library) 
     51        logging.info("there are %s mods records in the 'library file'" % len(self.library.xpath(u"//mods:mods", NSD))) 
    4152         
    4253        snodes = self.student.xpath(u"//mods:mods", NSD) 
     
    4859                s_title = u' '.join(s_title_node.text.replace('\n', ' ').strip().split()) 
    4960            except: 
    50                 print 'Something is wrong with the student title node for short_title = %s' % s_short_title 
     61                logging.warning('Something is wrong with the student title node for short_title = %s' % s_short_title) 
    5162             
    5263            lquery = u"//mods:mods/mods:titleInfo[@type='abbreviated']/mods:title[normalize-space(.) = '%s']/ancestor::mods:mods" % s_short_title 
     
    5869                if len(lnodes) == 1: 
    5970                    if len(lnodes[0].xpath(u"mods:titleInfo[@type='abbreviated']/mods:title[normalize-space(.) = '%s']/ancestor::mods:mods" % s_short_title, NSD)) != 1: 
    60                         print "found match in library on full title '%s', but short title did not match '%s'" % (s_title, s_short_title)
     71                        logging.warning("found match in library on full title '%s', but short title did not match '%s'" % (s_title, s_short_title))
    6172                        lnodes = self.library.xpath(lquery, NSD) 
    6273                elif len(lnodes) > 1: 
    63                     print "found multiple matches on long title in library for '%s = %s" % (s_short_title.encode('latin', 'xmlcharrefreplace'), s_title.encode('latin', 'xmlcharrefreplace')) 
    64                 else: 
    65                     print "found multiple matches in on short title library for '%s = %s" % (s_short_title.encode('latin', 'xmlcharrefreplace'), s_title.encode('latin', 'xmlcharrefreplace')) 
     74                    logging.warning("found multiple matches on long title in library for '%s = %s" % (s_short_title.encode('latin', 'xmlcharrefreplace'), s_title.encode('latin', 'xmlcharrefreplace'))) 
     75                else: 
     76                    logging.warning("found multiple matches in on short title library for '%s = %s" % (s_short_title.encode('latin', 'xmlcharrefreplace'), s_title.encode('latin', 'xmlcharrefreplace'))) 
    6677            if len(lnodes) == 0: 
    6778                squery = u"*[local-name()='abstract']/descendant::*[local-name()='i' and contains(normalize-space(.),'%s')]/ancestor::mods:mods" % s_title 
    6879                if len(snode.xpath(squery)) == 0: 
    69                     print "    notice: unmatched article '%s'" % s_short_title.encode('latin', 'xmlcharrefreplace')
    70                 else: 
    71                     print "WARNING: unmatched book '%s = %s" % (s_short_title.encode('latin', 'xmlcharrefreplace'), s_title.encode('latin', 'xmlcharrefreplace')) 
     80                    logging.warning("unmatched article '%s'" % s_short_title.encode('latin', 'xmlcharrefreplace'))
     81                else: 
     82                    logging.warning("unmatched book '%s = %s" % (s_short_title.encode('latin', 'xmlcharrefreplace'), s_title.encode('latin', 'xmlcharrefreplace'))) 
    7283            if len(lnodes)==1: 
    7384                # one match, so gather up all the data we will need 
     
    7889                    l_title = u' '.join(l_title_node.text.replace('\n', ' ').strip().split()) 
    7990                except: 
    80                     print "no title node found in library match for %s = %s" % (s_short_title.encode('latin', 'backslashreplace'), s_title.encode('latin', 'backslashreplace')) 
     91                    logging.warning("no title node found in library match for %s = %s" % (s_short_title.encode('latin', 'backslashreplace'), s_title.encode('latin', 'backslashreplace'))) 
    8192                try: 
    8293                    l_title_lang = l_title_node.xpath(u"../@*[local-name()='lang']")[0] 
     
    90101                    snode.attrib['ID'] = lnode.attrib['ID'] 
    91102                else: 
    92                     print "no ID!!!!" 
     103                    logging.warning("no ID attribute on mods element %s of %s" % (len(lnode.xpath("previous::*[local-name()='mods']")), len(lnode.xpath("//*[local-name()='mods']")))) 
    93104 
    94105                # proper title type attributes 
     
    147158                    s_title_node.xpath('..')[0].attrib['{http://www.w3.org/XML/1998/namespace}lang']=l_title_lang 
    148159                else: 
    149                     print "No title language found for '%s'\n" % s_short_title.encode('latin1', 'xmlcharrefreplace')
     160                    logging.warning("No title language found for '%s'\n" % s_short_title.encode('latin1', 'xmlcharrefreplace'))
    150161                     
    151162                # copy over items verbatim from library record to student record 
     
    171182        ricount = len(relatedItems) 
    172183        richecked = 0 
     184        orphtot = 0 
    173185        while orphans: 
    174186            for relatedItem in relatedItems: 
     
    177189                    lnodes = self.library.xpath("//mods:mods[@ID='%s']" % ri_id, NSD) 
    178190                    if len(lnodes) != 1: 
    179                         print "failure seeking %s in library file; match count = %s" % (ri_id, len(lnodes)) 
     191                        logging.warning("failure seeking %s in library file; match count = %s" % (ri_id, len(lnodes))) 
    180192                        break 
    181193                    else: 
    182                         print "adding %s" % ri_id 
     194                        orphtot += 1 
    183195                        self.student.xpath("//mods:modsCollection", NSD)[0].append(lnodes[0]) 
    184196                richecked += 1 
     
    189201            richecked = 0 
    190202             
     203        logging.info("added to 'student' an additional %s records from 'library' because they are related works" % orphtot) 
     204         
    191205        pcontent = etree.tostring(self.student).encode('utf-8') 
    192206        pcontent = '<?xml version="1.0" encoding="UTF-8"?>\n' + pcontent 
     
    194208        f.write(pcontent) 
    195209        f.close() 
     210         
     211        logging.info("saved enhanced records to %s" % destination) 
     212         
     213        logging.info("DONE: %s\n" % self.__class__) 
     214         
  • BADataMunger/trunk/pipeline.py

    r844 r845  
    11from os.path import normpath, normcase, isdir, isfile, splitdrive, splitext, split, join 
    22import os 
     3import logging 
    34 
    45import lxml.etree as etree 
     
    3435        file 
    3536        """ 
    36          
     37        logging.basicConfig(level=logging.DEBUG, format='%(levelname)-8s %(message)s') 
     38        logging.info("INITIALIZING: %s" % self.__class__) 
    3739 
    3840        self.data = {} 
    3941        self['contextpath'] = os.getcwd() 
    40         print "contextpath: %s" % self['contextpath'] 
     42        logging.info("contextpath: %s" % self['contextpath']) 
    4143         
    4244        self['configfile'] = normpath(normcase(configfile)) 
     
    6971                self['wordhtml'] = f.read() 
    7072                f.close() 
     73                logging.info("read and stored directory file from %s" % self['dirfile']) 
    7174                 
    7275                # read and store the config file 
     
    7477                config_text = f.read() 
    7578                f.close() 
     79                logging.info("read and stored config file from %s" % self['configfile']) 
    7680                 
    7781                # get some essential info from the config file 
     
    8185                except: 
    8286                    self.map_number = 'XYZ' 
     87                logging.info("map number from config file: %s" % self.map_number) 
     88                 
    8389                self.creators = [] 
    8490                self.contributors = [] 
     
    94100                 
    95101                self.mixer = gismixer.gisMixer(self['gisfile']) 
    96              
     102 
     103        logging.info("DONE INITIALIZING: %s\n" % self.__class__) 
     104         
    97105             
    98106    def __getitem__(self, key): return self.data[key] 
     
    103111        """Cycle through all steps in the pipeline""" 
    104112         
     113        logging.info("CYCLING: %s" % self.__class__) 
     114 
    105115        self['wordxml'] = wordhtml2xml.convert(self['wordhtml']) 
    106116        self['cleanxml'] = wordstripper.strip(self['contextpath'], self['wordxml']) 
     
    109119        self['dirtables'] = tablegroker.grok(self['cleanxml']) 
    110120        self['places'] = tableparser.parse(self, self['dirtables'], self.map_number) 
     121        logging.info("BEGIN attempting to match and combine %s directory places with %s map places" % (len(self['places']),len(self.mixer.fieldvalues))) 
    111122        for place in self['places']: 
    112123            if place.type == 'unlocated' or place.type == 'false': 
    113                 self.matched = True 
     124                place.matched = True 
    114125            else: 
    115126                self.mixer.mixin(self['config'], place) 
    116          
     127        logging.info("DONE mixing directory and map places") 
     128        logging.warning("places in GIS data that have no corresponding directory entry are ignored!") 
    117129        #placesaver.save_places_tei(self) 
    118130        placesaver.save_places_frank(self) 
    119131         
     132        logging.info("DONE CYCLING: %s\n" % self.__class__) 
     133         
     134             
     135 
    120136    def save(self, itemkey, encoding='utf-8'): 
    121137        """Basic save-with-encoding function for writing content to an arbitrary  
  • BADataMunger/trunk/placesaver.py

    r843 r845  
     1import logging 
    12import re 
    23from os.path import join 
     
    3334    """Save all the places using the Pleiades frankenformat.""" 
    3435     
     36     
    3537    # iterate through the places list, creating a corresponding frankenfile for each place 
    3638    places = self['places'] 
    37     for i, place in enumerate([place for place in places if place.matched]): 
     39    matchedplaces = [place for place in places if place.matched] 
     40    logging.info("BEGIN: attempting to save (in frankenformat) all %s matched places out of %s total places" % (len(matchedplaces), len(places))) 
     41    logging.warning("Nothing is done about places from the GIS/map data that don't have corresponding directory entries!") 
     42    for i, place in enumerate(matchedplaces): 
    3843         
    3944         
     
    232237        g.write(pcontent) 
    233238        g.close() 
     239         
     240    logging.info("DONE saving in frankenformat") 
    234241     
    235242def save_places_tei(self): 
  • BADataMunger/trunk/tableparser.py

    r842 r845  
     1import logging 
    12import operator 
    23import re 
     
    3334    Return the info in a list.""" 
    3435     
     36    logging.info("BEGIN attempt to parse the tables from the directory: tableparser.parse()") 
    3537    places = [] 
    3638    for key in tabledict.keys(): 
     
    4244            places += parsetable(context, tabletype, tabledict[key], map_number) 
    4345             
     46    logging.info("DONE with tableparser.parse()") 
    4447    return places 
    4548     
    4649def parsetable(context, tabletype, table, map_number): 
    4750    places = [] 
     51    logging.info("BEGIN parsing a %s table: tableparser.parsetable()" % tabletype) 
    4852    for ri, row in enumerate(table[1].xpath("descendant::*[local-name()='tr']")): 
    4953        rowtext = normalizetext(getalltext(row)) 
     
    8589                     
    8690            places = places + [p] 
    87          
     91    logging.info("DONE with tableparser.parsetable(); found %s places of type %s" % (len(places), tabletype)) 
    8892    return places 
    8993     
  • BADataMunger/trunk/wordhtml2xml.py

    r784 r845  
     1import logging 
     2 
    13from BeautifulSoup import BeautifulSoup 
    24import lxml.etree as etree 
     
    810def convert(source): 
    911    """Make html exported from Microsoft Word both well-formed and valid.""" 
     12     
     13    logging.info("BEGIN attempt to make MSWord-exported HTML both well-formed and valid: wordhtml2xml.convert()") 
    1014    soup = BeautifulSoup(source) 
    1115    html = soup.findAll('html')[0] 
     
    1519    tree = etree.XML(XMLDOCTYPE + unicode(soup)) 
    1620    tree = wordnormalizer.normalize(tree) 
     21    logging.info("DONE with wordhtml2xml.convert()") 
    1722    return tree 
    1823     
  • BADataMunger/trunk/wordstripper.py

    r784 r845  
     1import logging 
    12from os.path import join 
    23 
     
    78def strip(contextpath, source): 
    89    """Strip unneeded formatting inherited from MSWord.""" 
     10    logging.info("BEGIN attempt to strip unneeded formatting inherited from MSWord using %s: wordstripper.strip()" % XSLTFILE) 
    911    xslt_doc = etree.parse(join(contextpath, XSLTFILE)) 
    1012    transform = etree.XSLT(xslt_doc) 
    1113    result = etree.XML(unicode(transform(source))) 
     14    logging.info("DONE with wordstripper.strip()") 
    1215    return result 
    1316