root/BADataMunger/trunk/modsmixer.py

Revision 1331, 12.4 kB (checked in by thomase, 6 months ago)

cleaner code for error messages on IDless mods elements

  • Property svn:eol-style set to native
Line 
1 import re
2 import logging
3
4 import lxml.etree as etree
5
6 MODS = "http://www.loc.gov/mods/v3"
7 XLINK = "http://www.w3.org/1999/xlink"
8 NSD = {'mods': MODS, 'xlink': XLINK}
9
# Whitespace plus the punctuation that is ignored when comparing titles.
# Raw string: the original non-raw literals relied on invalid escape
# sequences such as "\s" and "\(" being passed through unchanged.
_FLATTEN_STRIP = re.compile(r"[\s_:;,.\-?()/]")


def flatten(text, addit=u''):
    """Rip all punctuation and whitespace out of string, lower-case it.

    Removes every whitespace character and each of ``_ : ; , . - ? ( ) /``
    from *text*, then removes whatever the optional regular expression
    *addit* matches, and finally lower-cases the result.

    text  -- the string to reduce to a comparison key
    addit -- optional extra regular expression whose matches are also
             deleted (applied after the built-in stripping); an empty
             value means "nothing extra"

    Returns the flattened string.
    """
    # one pass over a single character class is equivalent to the former
    # whitespace pass followed by a punctuation pass: both only delete
    result = _FLATTEN_STRIP.sub(u'', text)
    if addit:
        result = re.sub(addit, u'', result)
    return result.lower()
20    
class Mixer:
    r"""Mix together mods from two data sources.

    Constructing a Mixer does all the work: it parses the "student" file
    (the records to enhance) and the "library" file (the source of the
    enhancements), matches each student record to a library record by
    title, copies data from the match into the student record, pulls in
    library records that student relatedItems point at, and writes the
    result to the destination file.  Progress and problems are reported
    through the logging module.

    After construction, ``self.student`` and ``self.library`` hold the
    parsed lxml trees.  Elements copied from the library are *moved* out
    of ``self.library`` (lxml's append re-parents an element).

    Use like:
        import modsmixer as mx
        m = mx.Mixer(r'C:\TomDocs\awmcwork\pleiadesact\svnbox\BADataMunger\wordhtml\batl038_-biblio-mods.xml',
            r'C:\TomDocs\awmcwork\pleiadesact\svnbox\BADataMunger\wordhtml\awmcbib-mods.xml',
            r'C:\TomDocs\awmcwork\pleiadesact\svnbox\BADataMunger\wordhtml\ba038modsOnSteroids.xml')
    """

    # root start-tag that replaces the bare one found in the input files, so
    # that both the mods and the xlink namespaces are declared
    _NAMESPACED_ROOT = b'<modsCollection xmlns="http://www.loc.gov/mods/v3" xmlns:xlink="http://www.w3.org/1999/xlink">'

    # children of a matched library record that are carried over verbatim,
    # in the order they are appended to the student record
    _COPIED_ELEMENTS = (u"name", u"typeOfResource", u"genre", u"originInfo",
                        u"relatedItem", u"identifier", u"location", u"recordInfo")

    # Titles are bound as the XPath variable $title rather than spliced into
    # the expression, so a title containing an apostrophe cannot break it.
    _BY_SHORT_TITLE = u"//mods:mods/mods:titleInfo[@type='abbreviated']/mods:title[normalize-space(.) = $title]/ancestor::mods:mods"
    _BY_LONG_TITLE = u"//mods:mods/mods:titleInfo[@type='uniform']/mods:title[normalize-space(.) = $title]/ancestor::mods:mods"

    def __init__(self, student, library, destination):
        """Enhance the student file from the library file and save the result.

        student     -- path of the mods file whose records are to be enhanced
        library     -- path of the mods file the enhancements are taken from
        destination -- path the enhanced student records are written to

        Raises whatever open() or the lxml parser raise for unreadable or
        malformed input.
        """
        logging.info("INITIALIZING: %s" % self.__class__)

        # read basic mods information from the "student" file: this is the stuff we want to enhance
        self.student = self._load(student, b'<modsCollection xmlns="http://www.loc.gov/mods/v3">')
        snodes = self.student.xpath(u"//mods:mods", namespaces=NSD)
        logging.info("read mods information to enhance (aka 'student file') from: %s" % student)
        logging.info("there are %s mods records in the 'student file'" % len(snodes))

        # read a library of mods information from the "library" file: this is where we will pull the enhancements from
        self.library = self._load(library, b'<modsCollection>')
        logging.info("read mods enhancement information (aka 'library file') from: %s" % library)
        logging.info("there are %s mods records in the 'library file'" % len(self.library.xpath(u"//mods:mods", namespaces=NSD)))

        for snode in snodes:
            self._enhance(snode)

        orphtot = self._adopt_related()
        logging.info("added to 'student' an additional %s records from 'library' because they are related works" % orphtot)

        self._save(destination)
        logging.info("saved enhanced records to %s" % destination)

        logging.info("DONE: %s\n" % self.__class__)

    def _load(self, path, bare_root):
        """Parse the file at *path*, first swapping *bare_root* for the namespaced root tag."""
        # bytes in, so the parser honours the document's own encoding
        # declaration; the handle is closed even if reading fails
        with open(path, 'rb') as f:
            content = f.read()
        return etree.XML(content.replace(bare_root, self._NAMESPACED_ROOT))

    def _squash(self, text):
        """Collapse every run of whitespace in *text* to one space and trim the ends."""
        return u' '.join(text.split())

    def _enhance(self, snode):
        """Match one student record against the library and merge the match into it."""
        s_short_title_nodes = snode.xpath(u"mods:titleInfo[@type='abbreviated']/mods:title", namespaces=NSD)
        if not s_short_title_nodes or s_short_title_nodes[0].text is None:
            logging.warning("skipping student record without an abbreviated title")
            return
        s_short_title_node = s_short_title_nodes[0]
        s_short_title = self._squash(s_short_title_node.text)

        s_title_nodes = snode.xpath(u"mods:titleInfo[not(@type)]/mods:title", namespaces=NSD)
        if not s_title_nodes or s_title_nodes[0].text is None:
            # Skip the record: carrying on would match and merge using the
            # previous record's title (or fail on the very first record).
            logging.warning('Something is wrong with the student title node for short_title = %s' % s_short_title)
            return
        s_title_node = s_title_nodes[0]
        s_title = self._squash(s_title_node.text)

        lnodes = self._find_in_library(s_short_title, s_title)
        if len(lnodes) == 0:
            self._report_unmatched(snode, s_short_title, s_title)
        if len(lnodes) == 1:
            self._merge(snode, lnodes[0], s_short_title_node, s_short_title, s_title_node, s_title)

    def _find_in_library(self, s_short_title, s_title):
        """Return the library records matching the short title, falling back to the long title."""
        lnodes = self.library.xpath(self._BY_SHORT_TITLE, namespaces=NSD, title=s_short_title)
        if len(lnodes) > 1:
            # short title is ambiguous: try to disambiguate on the long title
            lnodes = self.library.xpath(self._BY_LONG_TITLE, namespaces=NSD, title=s_title)
            if len(lnodes) == 1:
                short_query = u"mods:titleInfo[@type='abbreviated']/mods:title[normalize-space(.) = $title]/ancestor::mods:mods"
                if len(lnodes[0].xpath(short_query, namespaces=NSD, title=s_short_title)) != 1:
                    logging.warning("found match in library on full title '%s', but short title did not match '%s'" % (s_title, s_short_title))
            elif len(lnodes) > 1:
                logging.warning("found multiple matches on long title in library for '%s = %s" % (s_short_title.encode('latin', 'xmlcharrefreplace'), s_title.encode('latin', 'xmlcharrefreplace')))
            else:
                logging.warning("found multiple matches in on short title library for '%s = %s" % (s_short_title.encode('latin', 'xmlcharrefreplace'), s_title.encode('latin', 'xmlcharrefreplace')))
        return lnodes

    def _report_unmatched(self, snode, s_short_title, s_title):
        """Log a student record that has no library match, as an article or as a book."""
        # A record whose abstract contains its own long title inside an <i>
        # element is reported as a book, anything else as an article.  The
        # real title is searched for (not an XML-character-reference encoded
        # copy, which could never match non-ASCII text in the parsed tree).
        squery = u"*[local-name()='abstract']/descendant::*[local-name()='i' and contains(normalize-space(), $title)]/ancestor::*[local-name() = 'mods']"
        try:
            hits = snode.xpath(squery, title=s_title)
        except etree.XPathError:
            logging.warning("something went horribly wrong")
            logging.warning("squery = %s (title = %s)" % (squery, s_title.encode('ascii', 'backslashreplace')))
            return
        if len(hits) == 0:
            logging.warning("unmatched article '%s'" % s_short_title.encode('ascii', 'xmlcharrefreplace'))
        else:
            logging.warning("unmatched book '%s = %s" % (s_short_title.encode('ascii', 'xmlcharrefreplace'), s_title.encode('latin', 'xmlcharrefreplace')))

    def _merge(self, snode, lnode, s_short_title_node, s_short_title, s_title_node, s_title):
        """Copy ID, title corrections, title language and descriptive elements from lnode into snode."""
        # one match, so gather up all the data we will need
        l_title_nodes = lnode.xpath(u"mods:titleInfo[@type='uniform']/mods:title", namespaces=NSD)
        l_title_node = None
        l_title = None
        l_title_lang = u''
        if l_title_nodes:
            l_title_node = l_title_nodes[0]
            if l_title_node.text is not None:
                l_title = self._squash(l_title_node.text)
            langs = l_title_node.xpath(u"../@*[local-name()='lang']")
            if langs:
                l_title_lang = langs[0]
        if l_title is None:
            logging.warning("no title node found in library match for %s = %s" % (s_short_title.encode('latin', 'backslashreplace'), s_title.encode('latin', 'backslashreplace')))

        # MODIFY THE STUDENT

        # ID attribute on mods element
        lib_id = lnode.get('ID')
        if lib_id is not None:
            snode.set('ID', lib_id)
        else:
            prevnodes = lnode.xpath("preceding::*[local-name()='mods']")
            totnodes = lnode.xpath("//*[local-name()='mods']")
            logging.warning("no ID attribute on mods element %s of %s" % (len(prevnodes), len(totnodes)))

        # proper title type attributes
        s_title_node.getparent().set('type', 'uniform')
        s_short_title_node.getparent().set('type', 'abbreviated')

        # long title; without a library title there is nothing to reconcile
        # (previously a stale title from an earlier record would be used)
        if l_title is not None:
            self._reconcile_title(snode, s_title_node, s_title, l_title_node, l_title)

        # long title language and script
        if l_title_lang.strip() != '':
            # add xml:lang attribute to existing title on scholar
            s_title_node.getparent().set('{http://www.w3.org/XML/1998/namespace}lang', l_title_lang)
        else:
            logging.warning("No title language found for '%s'\n" % s_short_title.encode('latin1', 'xmlcharrefreplace'))

        # copy over items verbatim from library record to student record
        nodestocopy = []
        for tag in self._COPIED_ELEMENTS:
            nodestocopy += lnode.xpath(u"mods:" + tag, namespaces=NSD)
        for node in nodestocopy:
            snode.append(node)

    def _reconcile_title(self, snode, s_title_node, s_title, l_title_node, l_title):
        """Decide between student and library long title, or attach both for a human to sort out."""
        if l_title == s_title:
            return

        l_title_info = l_title_node.getparent()
        quotes = u'[\'\u2018\u2019]'
        flat_l_title = flatten(l_title, quotes)
        flat_s_title = flatten(s_title, quotes)

        if flat_s_title == flat_l_title:
            # it's just a minor punctuation difference: figure out what and fix
            plain_s_title = flatten(s_title)
            plain_l_title = flatten(l_title)
            quot_s_title = re.sub(u'[\u2018\u2019]', u'\'', plain_s_title)
            quot_l_title = re.sub(u'[\u2018\u2019]', u'\'', plain_l_title)

            if s_title == l_title + u'.':
                # period at end of student title, use library.  Tested on the
                # raw titles and before the quote checks: flatten() strips
                # periods, so the former flattened test could never succeed.
                s_title_node.text = l_title
            elif quot_s_title == plain_l_title:
                # curly quotes in the student but straight quotes in the library, keep student
                pass
            elif quot_l_title == plain_s_title:
                # curly quotes in the library but straight quotes in the student, use library
                s_title_node.text = l_title
            elif s_title == l_title.replace(u'_', u''):
                # defective LOC transliteration in library, keep student
                pass
            else:
                # can't figure it out, call in the humans
                snode.append(l_title_info)
        elif flat_l_title.startswith(flat_s_title):
            # use l_title b/c s_title is omitting a subtitle
            s_title_node.text = l_title
        elif flat_l_title.endswith(flat_s_title):
            # look for missing articles
            prefix = flat_l_title.replace(flat_s_title, '')
            if prefix in ("the", "a", "an"):
                # student omits an initial article, use library
                s_title_node.text = l_title
            else:
                # let a human sort it out
                snode.append(l_title_info)
        else:
            # add alternative title, including its language attribute ... a human will have to sort out
            snode.append(l_title_info)

    def _adopt_related(self):
        """Move library records referenced by student relatedItems into the student; return how many."""
        # find all relatedItems and harvest ids from href attrs
        # iterate to check student for appropriate records
        # if they're not there, add them from library
        orphtot = 0
        libfails = set()
        added = True
        # Adopted records can carry relatedItems of their own, so rescan
        # until a whole pass adds nothing.  Every adoption removes a record
        # from the library, so the loop is bound to end.
        while added:
            added = False
            for relatedItem in self.student.xpath(u"//mods:relatedItem", namespaces=NSD):
                hrefs = relatedItem.xpath(u"./@xlink:href", namespaces=NSD)
                if not hrefs:
                    # nothing to resolve on a relatedItem without a link
                    continue
                ri_id = hrefs[0].replace("#", "")
                if ri_id in libfails:
                    continue
                if len(self.student.xpath(u"//mods:mods[@ID=$rid]", namespaces=NSD, rid=ri_id)) != 0:
                    continue
                lnodes = self.library.xpath(u"//mods:mods[@ID=$rid]", namespaces=NSD, rid=ri_id)
                if len(lnodes) != 1:
                    logging.warning("failure seeking %s in library file; match count = %s" % (ri_id, len(lnodes)))
                    libfails.add(ri_id)
                else:
                    orphtot += 1
                    self.student.xpath(u"//mods:modsCollection", namespaces=NSD)[0].append(lnodes[0])
                    added = True
        return orphtot

    def _save(self, destination):
        """Serialize the student tree to *destination* behind a UTF-8 XML declaration."""
        # tostring() with its default encoding yields pure ASCII (non-ASCII
        # characters become character references), so the UTF-8 declaration
        # is truthful; the result is bytes on interpreters where str is text.
        pcontent = etree.tostring(self.student)
        if not isinstance(pcontent, str):
            pcontent = pcontent.decode('ascii')
        with open(destination, 'w') as f:
            f.write('<?xml version="1.0" encoding="UTF-8"?>\n' + pcontent)
222        
Note: See TracBrowser for help on using the browser.