import copy
import logging
import re

import lxml.etree as etree

# XML namespace URIs of the MODS schema and of XLink.
MODS = "http://www.loc.gov/mods/v3"
XLINK = "http://www.w3.org/1999/xlink"

# Prefix -> namespace URI map handed to lxml XPath calls in this module.
NSD = dict(mods=MODS, xlink=XLINK)


def flatten(text, addit=u''):
    """Rip all punctuation and whitespace out of string, lower-case it.

    text: the string to normalise.
    addit: optional regular expression; when non-empty, every match is
        removed as well (after whitespace and punctuation have been
        stripped, before lower-casing).

    Returns the normalised string.
    """
    # Raw patterns: the escapes are meant for the regex engine, not for the
    # Python string parser (which warns about unknown escapes such as "\s").
    result = re.sub(r'\s', u'', text)
    result = re.sub(r'[_:;,.\-?()/]', u'', result)
    if addit != u'':
        result = re.sub(addit, u'', result)
    return result.lower()


class Mixer:
    r"""Mix together mods from two data sources.

    The "student" collection holds the records to enhance; the "library"
    collection supplies the enhancements.  Constructing a Mixer does all of
    the work: both files are parsed, every student record that matches exactly
    one library record is enriched from it, library records referenced through
    ``relatedItem`` links are added to the student collection, and the result
    is written to the destination file.  Progress and problems are reported
    through the ``logging`` module.

    The parsed trees stay available afterwards as ``self.student`` (enhanced)
    and ``self.library`` (data is deep-copied out of it, never moved).

    Use like:
        import modsmixer as mx
        m = mx.Mixer(r'C:\TomDocs\awmcwork\pleiadesact\svnbox\BADataMunger\wordhtml\batl038_-biblio-mods.xml',
                     r'C:\TomDocs\awmcwork\pleiadesact\svnbox\BADataMunger\wordhtml\awmcbib-mods.xml',
                     r'C:\TomDocs\awmcwork\pleiadesact\svnbox\BADataMunger\wordhtml\ba038modsOnSteroids.xml')
    """

    # Root start tag substituted into both inputs so that the mods and xlink
    # namespaces are always declared.
    _ROOT_TAG = b'<modsCollection xmlns="http://www.loc.gov/mods/v3" xmlns:xlink="http://www.w3.org/1999/xlink">'

    # Library lookups.  The title is bound as the XPath variable $title, so
    # apostrophes and quotation marks in a title cannot break the expression.
    _BY_SHORT_TITLE = u"//mods:mods/mods:titleInfo[@type='abbreviated']/mods:title[normalize-space(.) = $title]/ancestor::mods:mods"
    _BY_LONG_TITLE = u"//mods:mods/mods:titleInfo[@type='uniform']/mods:title[normalize-space(.) = $title]/ancestor::mods:mods"
    _HAS_SHORT_TITLE = u"mods:titleInfo[@type='abbreviated']/mods:title[normalize-space(.) = $title]/ancestor::mods:mods"

    # Evaluated on a student record: finds the record again when an <i>
    # element inside its abstract contains the long title.
    _TITLE_IN_ABSTRACT = u"*[local-name()='abstract']/descendant::*[local-name()='i' and contains(normalize-space(), $title)]/ancestor::*[local-name() = 'mods']"

    # Children of a matched library record that are copied to the student
    # record, in this order.
    _COPIED = (u"mods:name", u"mods:typeOfResource", u"mods:genre",
               u"mods:originInfo", u"mods:relatedItem", u"mods:identifier",
               u"mods:location", u"mods:recordInfo")

    def __init__(self, student, library, destination):
        """Read both collections, enhance the student records and save them.

        student: path of the MODS file whose records are to be enhanced.
        library: path of the MODS file the enhancements are taken from.
        destination: path the enhanced student collection is written to.
        """
        logging.info("INITIALIZING: %s" % self.__class__)

        # read basic mods information from the "student" file: this is the stuff we want to enhance
        self.student = self._read_collection(student, b'<modsCollection xmlns="http://www.loc.gov/mods/v3">')
        logging.info("read mods information to enhance (aka 'student file') from: %s" % student)
        logging.info("there are %s mods records in the 'student file'" % len(self.student.xpath(u"//mods:mods", namespaces=NSD)))

        # read a library of mods information from the "library" file: this is where we will pull the enhancements from
        self.library = self._read_collection(library, b'<modsCollection>')
        logging.info("read mods enhancement information (aka 'library file') from: %s" % library)
        logging.info("there are %s mods records in the 'library file'" % len(self.library.xpath(u"//mods:mods", namespaces=NSD)))

        for snode in self.student.xpath(u"//mods:mods", namespaces=NSD):
            self._enhance_record(snode)

        self._adopt_related_records()
        self._save(destination)

        logging.info("DONE: %s\n" % self.__class__)

    @staticmethod
    def _read_collection(path, bare_root):
        """Parse the file at *path*, replacing *bare_root* with the namespaced root tag."""
        # Bytes in: lxml refuses text input that carries an encoding
        # declaration, and bytes behave the same on every Python version.
        with open(path, 'rb') as f:
            content = f.read()
        return etree.XML(content.replace(bare_root, Mixer._ROOT_TAG))

    @staticmethod
    def _squash(text):
        """Collapse every run of whitespace in *text* to one space and trim the ends."""
        return u' '.join(text.split())

    def _enhance_record(self, snode):
        """Match one student record against the library and enrich it on a unique match."""
        short_nodes = snode.xpath(u"mods:titleInfo[@type='abbreviated']/mods:title", namespaces=NSD)
        long_nodes = snode.xpath(u"mods:titleInfo[not(@type)]/mods:title", namespaces=NSD)
        if not short_nodes or short_nodes[0].text is None or not long_nodes:
            # Without both title nodes there is nothing to match on.
            logging.warning("skipping student record that lacks an abbreviated title or an untyped title node")
            return
        s_short_node = short_nodes[0]
        s_title_node = long_nodes[0]
        s_short_title = self._squash(s_short_node.text)
        if s_title_node.text is None:
            logging.warning('Something is wrong with the student title node for short_title = %s' % s_short_title)
            # Explicit empty title, so no value leaks in from an earlier record.
            s_title = u''
        else:
            s_title = self._squash(s_title_node.text)

        lnodes = self.library.xpath(self._BY_SHORT_TITLE, namespaces=NSD, title=s_short_title)
        if len(lnodes) > 1:
            # The short title is ambiguous: retry on the long title.
            lnodes = self.library.xpath(self._BY_LONG_TITLE, namespaces=NSD, title=s_title)
            if len(lnodes) == 1:
                if len(lnodes[0].xpath(self._HAS_SHORT_TITLE, namespaces=NSD, title=s_short_title)) != 1:
                    logging.warning("found match in library on full title '%s', but short title did not match '%s'" % (s_title, s_short_title))
            elif len(lnodes) > 1:
                logging.warning("found multiple matches on long title in library for '%s = %s" % (s_short_title.encode('latin', 'xmlcharrefreplace'), s_title.encode('latin', 'xmlcharrefreplace')))
            else:
                logging.warning("found multiple matches in on short title library for '%s = %s" % (s_short_title.encode('latin', 'xmlcharrefreplace'), s_title.encode('latin', 'xmlcharrefreplace')))

        if len(lnodes) == 0:
            self._report_unmatched(snode, s_short_title, s_title)
        if len(lnodes) == 1:
            self._apply_library_record(snode, lnodes[0], s_short_node, s_title_node, s_short_title, s_title)

    def _report_unmatched(self, snode, s_short_title, s_title):
        """Log a student record that has no library match, as a book or an article."""
        squery = self._TITLE_IN_ABSTRACT
        try:
            if len(snode.xpath(squery, namespaces=NSD, title=s_title)) == 0:
                logging.warning("unmatched article '%s'" % s_short_title.encode('ascii', 'xmlcharrefreplace'))
            else:
                logging.warning("unmatched book '%s = %s" % (s_short_title.encode('ascii', 'xmlcharrefreplace'), s_title.encode('latin', 'xmlcharrefreplace')))
        except etree.XPathError:
            logging.warning("something went horribly wrong")
            logging.warning("squery = %s" % squery.encode('ascii', 'backslashreplace'))

    def _apply_library_record(self, snode, lnode, s_short_node, s_title_node, s_short_title, s_title):
        """Enrich student record *snode* from its single library match *lnode*."""
        # one match, so gather up all the data we will need
        l_title_nodes = lnode.xpath(u"mods:titleInfo[@type='uniform']/mods:title", namespaces=NSD)
        l_title_node = l_title_nodes[0] if l_title_nodes else None
        if l_title_node is None or l_title_node.text is None:
            logging.warning("no title node found in library match for %s = %s" % (s_short_title.encode('latin', 'backslashreplace'), s_title.encode('latin', 'backslashreplace')))
            l_title = None
        else:
            l_title = self._squash(l_title_node.text)

        # Any attribute named "lang" on the library titleInfo, whatever its namespace.
        l_title_lang = u''
        if l_title_node is not None:
            langs = l_title_node.xpath(u"../@*[local-name()='lang']")
            if langs:
                l_title_lang = langs[0]

        # MODIFY THE STUDENT

        # ID attribute on mods element
        lib_id = lnode.get('ID')
        if lib_id is not None:
            snode.set('ID', lib_id)
        else:
            prevnodes = lnode.xpath("preceding::*[local-name()='mods']")
            totnodes = lnode.xpath("//*[local-name()='mods']")
            logging.warning("no ID attribute on mods element %s of %s" % (len(prevnodes), len(totnodes)))

        # proper title type attributes
        s_title_node.getparent().set('type', 'uniform')
        s_short_node.getparent().set('type', 'abbreviated')

        # long title (only when the library record actually supplied one)
        if l_title is not None and l_title != s_title:
            self._reconcile_title(snode, s_title_node, s_title, l_title_node, l_title)

        # long title language and script
        if l_title_lang.strip() != '':
            # add xml:lang attribute to existing title on scholar
            s_title_node.getparent().set('{http://www.w3.org/XML/1998/namespace}lang', l_title_lang)
        else:
            logging.warning("No title language found for '%s'\n" % s_short_title.encode('latin1', 'xmlcharrefreplace'))

        # copy over items verbatim from library record to student record;
        # deep copies, because append() on an element that already has a
        # parent would move it out of the library tree
        for path in self._COPIED:
            for node in lnode.xpath(path, namespaces=NSD):
                snode.append(copy.deepcopy(node))

    def _reconcile_title(self, snode, s_title_node, s_title, l_title_node, l_title):
        """Settle a difference between the student and the library long title.

        Either the student title is kept, or it is replaced by the library
        title, or a copy of the library titleInfo is appended to the student
        record so that a human can decide.
        """
        quote_chars = u'[\'\u2018\u2019]'
        flat_l_title = flatten(l_title, quote_chars)
        flat_s_title = flatten(s_title, quote_chars)

        if flat_s_title == flat_l_title:
            # it's just a minor punctuation difference: figure out what and fix
            plain_s_title = flatten(s_title)
            plain_l_title = flatten(l_title)
            quot_s_title = re.sub(u'[\u2018\u2019]', u'\'', plain_s_title)
            quot_l_title = re.sub(u'[\u2018\u2019]', u'\'', plain_l_title)

            if quot_s_title == plain_l_title:
                # curly quotes in the student but straight quotes in the library, keep student
                pass
            elif quot_l_title == plain_s_title:
                # curly quotes in the library but straight quotes in the student, use library
                s_title_node.text = l_title
            elif s_title == l_title.replace(u'_', u''):
                # defective LOC transliteration in library, keep student
                pass
            else:
                # can't figure it out, call in the humans
                snode.append(copy.deepcopy(l_title_node.getparent()))
            # NOTE(review): a former "period at end of student title" test
            # compared the flattened titles, which are equal in this branch
            # and never contain a period, so it could not fire; it was removed.
        elif flat_l_title.startswith(flat_s_title):
            # use l_title b/c s_title is omitting a subtitle
            s_title_node.text = l_title
        elif flat_l_title.endswith(flat_s_title):
            # look for missing articles
            prefix = flat_l_title.replace(flat_s_title, '')
            if prefix in ("the", "a", "an"):
                # student omits an initial article, use library
                s_title_node.text = l_title
            else:
                # let a human sort it out
                snode.append(copy.deepcopy(l_title_node.getparent()))
        else:
            # add alternative title, including its language attribute ... a human will have to sort out
            snode.append(copy.deepcopy(l_title_node.getparent()))

    def _adopt_related_records(self):
        """Add library records that student relatedItem links point to.

        Repeats until a whole pass adds nothing, because every added record
        may carry relatedItem links of its own.
        """
        orphtot = 0
        libfails = set()  # ids already reported as not uniquely present in the library
        added = True
        while added:
            added = False
            for related_item in self.student.xpath("//mods:relatedItem", namespaces=NSD):
                hrefs = related_item.xpath("./@xlink:href", namespaces=NSD)
                if not hrefs:
                    # no link to follow on this relatedItem
                    continue
                ri_id = hrefs[0].replace("#", "")
                if self.student.xpath("//mods:mods[@ID=$ref]", namespaces=NSD, ref=ri_id):
                    continue
                if ri_id in libfails:
                    continue
                lnodes = self.library.xpath("//mods:mods[@ID=$ref]", namespaces=NSD, ref=ri_id)
                if len(lnodes) != 1:
                    logging.warning("failure seeking %s in library file; match count = %s" % (ri_id, len(lnodes)))
                    libfails.add(ri_id)
                else:
                    orphtot += 1
                    self.student.xpath("//mods:modsCollection", namespaces=NSD)[0].append(copy.deepcopy(lnodes[0]))
                    added = True

        logging.info("added to 'student' an additional %s records from 'library' because they are related works" % orphtot)

    def _save(self, destination):
        """Write the enhanced student collection to *destination*."""
        # tostring() returns bytes here (no encoding argument given), so the
        # declaration is prepended as bytes and the file is opened in binary mode.
        pcontent = b'<?xml version="1.0" encoding="UTF-8"?>\n' + etree.tostring(self.student)
        with open(destination, 'wb') as f:
            f.write(pcontent)

        logging.info("saved enhanced records to %s" % destination)