Changeset 824
- Timestamp: 06/12/07 14:08:16 (2 years ago)
- Files:
  - BADataMunger/trunk/modsmixer.py (modified) (9 diffs)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
BADataMunger/trunk/modsmixer.py
r813 r824 2 2 3 3 import lxml.etree as etree 4 5 MODS = "http://www.loc.gov/mods/v3" 6 XLINK = "http://www.w3.org/1999/xlink" 7 NSD = {'mods': MODS, 'xlink': XLINK} 4 8 5 9 def flatten(text, addit=u''): … … 27 31 f = open(student) 28 32 content = f.read() 33 content = content.replace('<modsCollection xmlns="http://www.loc.gov/mods/v3">', '<modsCollection xmlns="http://www.loc.gov/mods/v3" xmlns:xlink="http://www.w3.org/1999/xlink">') 29 34 f.close() 30 35 self.student = etree.XML(content) … … 32 37 content = f.read() 33 38 f.close() 39 content = content.replace('<modsCollection>', '<modsCollection xmlns="http://www.loc.gov/mods/v3" xmlns:xlink="http://www.w3.org/1999/xlink">') 34 40 self.library = etree.XML(content) 35 41 36 snodes = self.student.xpath(u"// *[local-name()='mods']")42 snodes = self.student.xpath(u"//mods:mods", NSD) 37 43 for snode in snodes: 38 s_short_title_node = snode.xpath(u" *[local-name()='titleInfo' and @type='abbreviated']/*[local-name()='title']")44 s_short_title_node = snode.xpath(u"mods:titleInfo[@type='abbreviated']/mods:title", NSD) 39 45 s_short_title = u' '.join(s_short_title_node[0].text.replace('\n', ' ').strip().split()) 40 s_title_node = snode.xpath(u" *[local-name()='titleInfo' and not(@type)]/*[local-name()='title']")[0]46 s_title_node = snode.xpath(u"mods:titleInfo[not(@type)]/mods:title", NSD)[0] 41 47 try: 42 48 s_title = u' '.join(s_title_node.text.replace('\n', ' ').strip().split()) … … 44 50 print 'Something is wrong with the student title node for short_title = %s' % s_short_title 45 51 46 lquery = u"//mods /titleInfo[@type='abbreviated']/title[normalize-space(.) = '%s']/ancestor::mods" % s_short_title47 lnodes = self.library.xpath(lquery )52 lquery = u"//mods:mods/mods:titleInfo[@type='abbreviated']/mods:title[normalize-space(.) = '%s']/ancestor::mods:mods" % s_short_title 53 lnodes = self.library.xpath(lquery, NSD) 48 54 if len(lnodes) > 1: 49 55 50 lquery = u"//mods /titleInfo[@type='uniform']/title[normalize-space(.) 
= '%s']/ancestor::mods" % s_title51 lnodes = self.library.xpath(lquery )56 lquery = u"//mods:mods/mods:titleInfo[@type='uniform']/mods:title[normalize-space(.) = '%s']/ancestor::mods:mods" % s_title 57 lnodes = self.library.xpath(lquery, NSD) 52 58 if len(lnodes) == 1: 53 if len(lnodes[0].xpath(u" titleInfo[@type='abbreviated']/title[normalize-space(.) = '%s']/ancestor::mods" % s_short_title)) != 1:59 if len(lnodes[0].xpath(u"mods:titleInfo[@type='abbreviated']/mods:title[normalize-space(.) = '%s']/ancestor::mods:mods" % s_short_title, NSD)) != 1: 54 60 print "found match in library on full title '%s', but short title did not match '%s'" % (s_title, s_short_title) 55 lnodes = self.library.xpath(lquery )61 lnodes = self.library.xpath(lquery, NSD) 56 62 elif len(lnodes) > 1: 57 63 print "found multiple matches on long title in library for '%s = %s" % (s_short_title.encode('latin', 'xmlcharrefreplace'), s_title.encode('latin', 'xmlcharrefreplace')) … … 59 65 print "found multiple matches in on short title library for '%s = %s" % (s_short_title.encode('latin', 'xmlcharrefreplace'), s_title.encode('latin', 'xmlcharrefreplace')) 60 66 if len(lnodes) == 0: 61 squery = u"*[local-name()='abstract']/descendant::*[local-name()='i' and contains(normalize-space(.),'%s')]/ancestor:: *[local-name()='mods']" % s_title67 squery = u"*[local-name()='abstract']/descendant::*[local-name()='i' and contains(normalize-space(.),'%s')]/ancestor::mods:mods" % s_title 62 68 if len(snode.xpath(squery)) == 0: 63 69 print " notice: unmatched article '%s'" % s_short_title.encode('latin', 'xmlcharrefreplace') … … 67 73 # one match, so gather up all the data we will need 68 74 lnode = lnodes[0] 69 l_title_nodes = lnode.xpath(u" titleInfo[@type='uniform']/title")75 l_title_nodes = lnode.xpath(u"mods:titleInfo[@type='uniform']/mods:title", NSD) 70 76 try: 71 77 l_title_node = l_title_nodes[0] … … 83 89 if len(lnode.xpath("./@ID")) > 0: 84 90 snode.attrib['ID'] = lnode.attrib['ID'] 91 else: 92 print 
"no ID!!!!" 85 93 86 94 # proper title type attributes … … 137 145 if l_title_lang.strip() != '': 138 146 # add xml:lang attribute to existing title on scholar 139 s_title_node. attrib['{http://www.w3.org/XML/1998/namespace}lang']=l_title_lang147 s_title_node.xpath('..')[0].attrib['{http://www.w3.org/XML/1998/namespace}lang']=l_title_lang 140 148 else: 141 149 print "No title language found for '%s'\n" % s_short_title.encode('latin1', 'xmlcharrefreplace') … … 143 151 # copy over items verbatim from library record to student record 144 152 nodestocopy = [] 145 nodestocopy += lnode.xpath(u" name")146 nodestocopy += lnode.xpath(u" typeOfResource")147 nodestocopy += lnode.xpath(u" genre")148 nodestocopy += lnode.xpath(u" originInfo")149 nodestocopy += lnode.xpath(u" relatedItem")150 nodestocopy += lnode.xpath(u" identifier")151 nodestocopy += lnode.xpath(u" location")153 nodestocopy += lnode.xpath(u"mods:name", NSD) 154 nodestocopy += lnode.xpath(u"mods:typeOfResource", NSD) 155 nodestocopy += lnode.xpath(u"mods:genre", NSD) 156 nodestocopy += lnode.xpath(u"mods:originInfo", NSD) 157 nodestocopy += lnode.xpath(u"mods:relatedItem", NSD) 158 nodestocopy += lnode.xpath(u"mods:identifier", NSD) 159 nodestocopy += lnode.xpath(u"mods:location", NSD) 152 160 153 161 if len(nodestocopy) > 0: 154 162 for node in nodestocopy: 155 163 snode.append(node) 156 164 165 # what about related items????? 
166 # find all relatedItems and harvest ids from href attrs 167 # iterate to check student for appropriate records 168 # if they're not there, add them from library 169 orphans = True 170 relatedItems = self.student.xpath("//mods:relatedItem", NSD) 171 ricount = len(relatedItems) 172 richecked = 0 173 while orphans: 174 for relatedItem in relatedItems: 175 ri_id = relatedItem.xpath("./@xlink:href", NSD)[0].replace("#", "") 176 if len(self.student.xpath("//mods:mods[@ID='%s']" % ri_id, NSD)) == 0: 177 lnodes = self.library.xpath("//mods:mods[@ID='%s']" % ri_id, NSD) 178 if len(lnodes) != 1: 179 print "failure seeking %s in library file; match count = %s" % (ri_id, len(lnodes)) 180 break 181 else: 182 print "adding %s" % ri_id 183 self.student.xpath("//mods:modsCollection", NSD)[0].append(lnodes[0]) 184 richecked += 1 185 relatedItems = self.student.xpath("//mods:relatedItem", NSD) 186 ricount = len(relatedItems) 187 if ricount <= richecked: 188 orphans = False 189 richecked = 0 190 157 191 pcontent = etree.tostring(self.student).encode('utf-8') 192 pcontent = '<?xml version="1.0" encoding="UTF-8"?>\n' + pcontent 158 193 f = open(destination, 'w') 159 194 f.write(pcontent)
