Changeset 1332
- Timestamp: 07/11/08 15:52:30 (3 months ago)
- Files:
  - BADataMunger/trunk/batlaspipe.py (modified) (10 diffs)
  - BADataMunger/trunk/bidmaker.py (modified) (5 diffs)
  - BADataMunger/trunk/pipebiblio.py (modified) (2 diffs)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
BADataMunger/trunk/batlaspipe.py
r1317 r1332 5 5 import unicodedata 6 6 7 import datetime as dt 8 7 9 import lxml.etree as etree 8 10 … … 11 13 import tablegroker 12 14 import tableparser 13 import placesaver14 15 from etreehelps import getalltext 15 16 from texthelps import normalizetext … … 17 18 from bidmaker import BAtlasIDMaker, DIAMOND_STOP_REGEX 18 19 19 SLASH_REGEX = re.compile(r"/\s*") 20 21 20 SLASH_REGEX = re.compile(r"\s*/\s*") 21 NAMESPACE = 'http://atlantides.org/batlas' 22 XMLDECL = '<?xml version="1.0" encoding="UTF-8"?>' 23 PREPARER = 'Tom Elliott' 22 24 23 25 class Pipe: 24 """String together a series of transformations and operations to munge a 25 BAtlas directory file into Pleiades-ready data 26 """String together a series of operations to create URIs for features from a Barrington Atlas map 26 27 27 28 Use like: 28 29 29 import pipegeo30 p = pipegeo.Pipe(r'/TomDocs/awmcwork/pleiadesact/svnbox/BADataMunger/config/BATL065_config.xml', r'/badigit/wordhtml/BATL065_.htm', r'/badigit/scratchem')30 import batlaspipe as bp 31 p = bp.Pipe(r'./config/BATL038_config.xml', r'/Users/tom/Sandbox/pleiades/wordhtml/BATL038_.htm', r'/Users/tom/Sandbox/scratch/batlasid38') 31 32 p.cycle() 32 33 p.idit() 34 p.outit() 33 35 34 36 """ … … 39 41 file 40 42 """ 41 logging.basicConfig(level=logging. 
DEBUG, format='%(levelname)-8s %(message)s')43 logging.basicConfig(level=logging.INFO, format='%(levelname)-8s %(message)s') 42 44 logging.info("INITIALIZING: %s" % self.__class__) 43 45 … … 51 53 52 54 if not isfile(self['configfile']): 53 raise Error, "No file found at path: %s" % self['configfile']55 logging.critical( "No file found at path: %s" % self['configfile']) 54 56 elif not isfile(self['dirfile']): 55 raise Error, "No file found at path: %s" % self['dirfile']57 logging.critical ("No file found at path: %s" % self['dirfile']) 56 58 elif not isdir(self['destdir']): 57 raise Error, "No directory found at path: %s" % self['destdir']59 logging.critical( "No directory found at path: %s" % self['destdir']) 58 60 else: 59 61 … … 118 120 def idit(self): 119 121 """ Construct all appropriate id strings (primary and alternate) for each place 120 121 the very first id should be the canonical id, composed from the entire label,122 and then we support various proxies based on the individual slash-delimited123 names within the label124 Note that we have to massage some minor directory convention differences125 from label conventions:126 - drop last part of label from any "diamond" (i.e., unicode section mark)127 when making the primary id as these don't appear in the label (but do128 treat them as proxy ids129 - drop ", T." as this is included in dirs, but omitted in labels130 - propagate final " fl." 
onto each prior name in a single label131 - haven't decided what to do about numbered items and unnamed features132 - unlocated features do get ids; substitute "unlocated" for gridsquare value133 define and compile static regexes134 122 """ 135 123 … … 148 136 grid = place.grid 149 137 150 label = maker.buildLabel(place.namestring, place.placenames, place.dirtype, place.locdesc, u' '.join(place.itinerary)) 151 if len(label) > 0: 152 place.batlasids.append(maker.makeID(label, mapnum, grid)) 153 154 names = [] 155 placetypes = [place.dirtype] + place.types 156 if len(place.placenames) == 0: 157 names = place.namestring.split('/') 138 if place.dirtype in ['road']: 139 logging.info("batlaspipe.idit() deliberately suppressed ID creation for %s: (%s, %s, %s, %s)" % (place.dirtype, place.namestring, place.grid, place.locdesc, u' - '.join(place.itinerary))) 140 else: 141 label = maker.buildLabel(place.namestring, place.placenames, place.dirtype, place.locdesc, u' '.join(place.itinerary)) 142 143 if len(label) > 0: 144 placetypes = [place.dirtype] + place.types 145 146 if place.dirtype == 'numbered': 147 label = "(%s)" % label 148 place.batlasids.append(maker.makeID('', mapnum, grid, label)) 149 altlabel = maker.buildAltLabel(label, place.locdesc, placetypes) 150 place.batlasids.append(maker.makeID(altlabel, mapnum, grid, phraseprefix=label)) 151 else: 152 place.batlasids.append(maker.makeID(label, mapnum, grid)) 153 154 155 names = [] 156 prefix = '' 157 postfix = '' 158 159 if len(place.placenames) == 0: 160 if place.dirtype == 'numbered': 161 names = place.locdesc.split('/') 162 prefix = label 163 else: 164 names = place.namestring.split('/') 165 else: 166 names = [p.name for p in place.placenames] 167 if len(names) > 1: 168 for name in names: 169 label = maker.buildAltLabel(place.namestring, name, placetypes) 170 if len(label) > 0: 171 bid = maker.makeID(label, mapnum, grid, postfix, prefix) 172 if bid != place.batlasids[0]: 173 place.batlasids.append(bid) 174 175 if 
len(place.batlasids) == 0: 176 logging.critical("did not create id for %s: (%s, %s, %s, %s)" % (place.dirtype, place.namestring, place.grid, place.locdesc, u' - '.join(place.itinerary))) 158 177 else: 159 names = [p.name for p in place.placenames] 160 if len(names) > 1: 161 for name in names: 162 label = maker.buildAltLabel(place.namestring, name, placetypes) 163 if len(label) > 0: 164 bid = maker.makeID(label, mapnum, grid) 165 if bid != place.batlasids[0]: 166 place.batlasids.append(bid) 167 168 if len(place.batlasids) == 0: 169 logging.critical("did not create id for %s: (%s, %s, %s, %s)" % (place.dirtype, place.namestring, place.grid, place.locdesc, u' - '.join(place.itinerary))) 170 else: 171 msg = "made %s id(s):" % len(place.batlasids) 172 for bid in place.batlasids: 173 msg += ("\n\t id: %s" % bid) 174 logging.info(msg) 178 msg = "made %s id(s):" % len(place.batlasids) 179 for bid in place.batlasids: 180 msg += ("\n\t id: %s" % bid) 181 logging.debug(msg) 175 182 176 183 def outit(self): 177 NAMESPACE = 'http://pleiades.stoa.org/batlas' 184 """Output an xml file containing the ids, along with some descriptive info 185 """ 186 187 178 188 places = self['places'] 179 189 d = etree.Element('featurelist') 180 190 d.attrib['mapnum'] = self.map_number 191 q = etree.SubElement(d, 'uribase') 192 q.text = "%s/" % NAMESPACE 181 193 for p in places: 182 194 if len(p.batlasids) > 0: 183 e = etree.SubElement(d, 'feature', id=p.batlasids[0]) 195 e = etree.SubElement(d, 'feature') 196 197 # write all id/aliases 184 198 for i, bid in enumerate(p.batlasids): 185 199 if i == 0 or bid != p.batlasids[0]: 186 q = etree.SubElement(e, 'uri') 187 q.text = '/'.join((NAMESPACE, bid)) 188 e.xpath("*[local-name() = 'uri']")[0].attrib['primary'] = 'yes' 189 q = etree.SubElement(e, 'mapnumber') 190 q.text = self.map_number 200 q = etree.SubElement(e, 'alias', id=bid) 201 e.xpath("*[local-name() = 'alias']")[0].attrib['primary'] = 'yes' 202 203 # q = etree.SubElement(e, 'mapnumber') 204 
# q.text = self.map_number 205 206 # feature type 191 207 if p.dirtype == 'name': 192 208 ptype = 'labeled feature' 209 elif p.dirtype == 'numbered': 210 ptype = 'numbered feature' 193 211 elif p.dirtype == 'unlocated': 194 212 ptype = 'unlocated toponym' … … 199 217 q = etree.SubElement(e, 'type') 200 218 q.text = ptype 219 220 # additional feature types 201 221 for i, t in enumerate(p.types): 202 222 if i == 0: … … 205 225 q = etree.SubElement(e, 'subtype') 206 226 q.text = t 227 228 # gridsquare 207 229 if len(p.grid) > 0: 208 230 q = etree.SubElement(e, 'gridsquare') 209 231 q.text = p.grid 232 233 # labels actually appearing on the map 210 234 if len(p.namestring) > 0: 211 if p.dirtype in ['unlocated', 'false']: 212 q = etree.SubElement(e, 'label', context='directory') 213 else: 235 if p.dirtype not in ['unlocated', 'false']: 214 236 q = etree.SubElement(e, 'label', context='map') 215 txt = SLASH_REGEX.sub('/', p.namestring) 216 txt = DIAMOND_STOP_REGEX.sub('', txt) 217 q.text = txt 218 q = etree.SubElement(e, 'citation') 219 if p.dirtype in ['unlocated', 'false']: 220 q.text = "BAtlas %s unlocated %s" % (self.map_number, txt) 221 else: 222 q.text = "BAtlas %s %s %s" % (self.map_number, p.grid, txt) 223 for n in p.placenames: 224 q = etree.SubElement(e, 'placename') 225 q.text = n.name 226 if len(p.placenames) > 1: 227 txt = u'' 237 txt = SLASH_REGEX.sub('/', p.namestring) 238 txt = DIAMOND_STOP_REGEX.sub('', txt) 239 q.text = txt.strip() 240 241 # citations 242 q = etree.SubElement(e, 'citation') 243 if p.dirtype == 'unlocated': 244 citname =SLASH_REGEX.sub('/', p.namestring.strip()) 245 q.text = "BAtlas %s unlocated %s" % (self.map_number, citname) 246 elif p.dirtype =='false': 247 citname = SLASH_REGEX.sub('/',p.namestring.strip()) 248 q.text = "BAtlas %s false name %s" % (self.map_number, citname) 249 elif p.dirtype == 'numbered': 250 citname = SLASH_REGEX.sub('/',p.locdesc.strip()) 251 q.text = "BAtlas %s %s no. 
%s (%s)" % (self.map_number, p.grid, p.namestring, citname) 252 elif p.dirtype == 'name' and len(p.namestring) == 0: 253 citname = SLASH_REGEX.sub('/',p.locdesc.strip()) 254 q.text = "BAtlas %s %s %s" % (self.map_number, p.grid, citname) 255 elif len(p.namestring) == 0: 256 citname = SLASH_REGEX.sub('/',p.locdesc.strip()) 257 q.text = "BAtlas %s %s unnamed %s %s" % (self.map_number, p.grid, p.dirtype, citname) 258 else: 259 citname = txt.strip() 260 q.text = "BAtlas %s %s %s" % (self.map_number, p.grid, citname) 261 262 # enumerate placenames 263 for i, n in enumerate(p.placenames): 264 q = etree.SubElement(e, 'geogname') 265 q.text = n.name.strip() 266 # if q.text not in citname: 267 # q.attrib['type'] = 'variant' 268 if n.variant: 269 q.attrib['type'] = 'variant' 270 elif n.minorAlternative: 271 q.attrib['type'] = 'minor-alternate' 272 if n.completeness != 'complete': 273 q.attrib['completeness'] = n.completeness 274 if n.accuracy != 'accurate': 275 q.attrib['accuracy'] = n.accuracy 276 if n.inferred: 277 q.attrib['inferred'] = 'yes' 278 if n.certainty != 'certain': 279 q.attrib['certainty'] = n.certainty 280 if len(p.placenames) > 1 and citname != q.text: 281 txt = q.text 282 if n.completeness == 'reconstructable': 283 txt = u"*%s" % txt 284 if n.accuracy == 'inaccurate': 285 txt = u"\u2018%s\u2019" % txt 286 if n.inferred == True: 287 txt = u"[%s]" % txt 288 if n.certainty != 'certain': 289 txt = u"%s?" 
% txt 290 if n.minorAlternative: 291 txt = u"\u00A7%s" % txt 228 292 if p.dirtype == 'unlocated': 229 txt = "BAtlas %s unlocated %s" % (self.map_number, n.name)293 txt = "BAtlas %s unlocated %s" % (self.map_number, txt) 230 294 elif p.dirtype == 'false': 231 txt = "BAtlas %s false %s" % (self.map_number, n.name)295 txt = "BAtlas %s false %s" % (self.map_number, txt) 232 296 elif len(p.placenames) > 1: 233 txt = "BAtlas %s %s %s" % (self.map_number, p.grid, n.name)297 txt = "BAtlas %s %s %s" % (self.map_number, p.grid, txt) 234 298 if len(txt) > 0: 235 299 q = etree.SubElement(e, 'citation') 236 300 q.text = txt 301 302 # location description 237 303 if len(p.locdesc) > 0: 238 304 q = etree.SubElement(e, 'location') 239 q.text = p.locdesc.strip() 240 if len(p.placenames) == 0: 241 q = etree.SubElement(e, 'citation') 242 q.text = "BAtlas %s %s unnamed %s %s" % (self.map_number, p.grid, p.dirtype, p.locdesc.strip()) 305 q.text = SLASH_REGEX.sub('/',p.locdesc.strip()) 306 307 # itineraries 243 308 if len(p.itinraw) > 0: 244 309 q = etree.SubElement(e, 'itinerary') … … 246 311 q = etree.SubElement(e, 'citation') 247 312 q.text = "BATlas %s %s (%s)" % (self.map_number, p.dirtype, p.itinraw.strip()) 248 fpath = join(self['destdir'], 'idlist.xml') 313 314 cmntf = open(r'./etc/batlasxmlcomment.txt') 315 cmnt = cmntf.read() 316 cmntf.close() 317 318 dtime = dt.datetime.utcnow() 319 dtstamp = dtime.isoformat() 320 dtyear = dtime.year 321 322 fn = "map%s.xml" % self.map_number 323 fpath = join(self['destdir'], fn) 249 324 f = open(fpath, 'w') 325 f.write(XMLDECL) 326 cmnt = cmnt % (fn, self.map_number, dtime.isoformat(), PREPARER, dtime.year, self.map_number, fn) 327 f.write(cmnt) 250 328 etree.ElementTree(d).write(f) 251 329 f.close() 330 logging.info("wrote output result xml file on %s" % fpath) 331 BADataMunger/trunk/bidmaker.py
r1316 r1332 12 12 13 13 class BAtlasIDMaker: 14 """A class for making URIs for Barrington Atlas features""" 14 15 15 def makeID (self, phrase, mapnum, gridsq ):16 def makeID (self, phrase, mapnum, gridsq, phrasepostfix='', phraseprefix=''): 16 17 """Make an ID from a properly prepared string.""" 17 idbase = u"%s %s %s" % (phrase, mapnum, gridsq) 18 ident = u'-'.join(normalizer.normalizeN(idbase)) 18 rawpieces = [phrase, mapnum, gridsq] 19 pieces = [] 20 for p in rawpieces: 21 q = u'-'.join(normalizer.normalizeN(p)) 22 pieces.append(q) 23 if len(phrasepostfix) == 0: 24 pass 25 else: 26 pieces.insert(1, phrasepostfix) 27 if len(phraseprefix) == 0: 28 pass 29 else: 30 pieces.insert(0, phraseprefix) 31 pieces = [p for p in pieces if len(p) > 0] 32 ident = u'-'.join(pieces) 19 33 return ident 34 20 35 21 36 def buildLabel (self, origstring, placenames, dirtype, locdesc = u'', itin = u''): … … 31 46 return u'' 32 47 33 if len(placenames) > 0 or dirtype == 'name': 48 elif dirtype in ['earthworks']: 49 # might be named or unnamed 50 if len(phrase) > 0: 51 phrase = DIAMOND_STOP_REGEX.sub('', phrase) 52 phrase = SILENT_POSTFIX_REGEX.sub('', phrase) 53 else: 54 phrase = "%s %s" % (dirtype, locdesc.strip()) 55 56 elif len(placenames) > 0 or dirtype in ['name','numbered']: 34 57 # the most basic case: features with names: 35 58 # use the batlas directory namestring … … 38 61 phrase = DIAMOND_STOP_REGEX.sub('', phrase) 39 62 phrase = SILENT_POSTFIX_REGEX.sub('', phrase) 63 64 40 65 41 66 elif dirtype in ['aqueduct', 'bridge', 'wall']: … … 47 72 phrase = ITIN_IGNORE_REGEX.sub(u'', phrase) 48 73 74 75 76 49 77 # there are various missing classes 78 else: 79 logging.warning("bidmaker.buildLabel did nothing for origstring='%s', dirtype='%s', locdesc='%s'" % (origstring, dirtype, locdesc)) 50 80 51 81 if dirtype == 'false': … … 67 97 return phrase.strip() 68 98 69 70 99 def addSmartSuffix (self, name, placetypes): 71 100 """Sweet blackberry surprise.""" 
BADataMunger/trunk/pipebiblio.py
r951 r1332 1 from os.path import normpath, normcase, isdir, isfile, splitdrive, splitext, split, join 1 # =========================================================================== 2 # Copyright (C) 2006-2008 Ancient World Mapping Center (UNC-CH) and the 3 # Institute for the Study of the Ancient World (NYU) 4 # 5 # This program is free software; you can redistribute it and/or modify 6 # it under the terms of the GNU General Public License as published by 7 # the Free Software Foundation; either version 2 of the License, or 8 # (at your option) any later version. 9 # 10 # This program is distributed in the hope that it will be useful, 11 # but WITHOUT ANY WARRANTY; without even the implied warranty of 12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 # GNU General Public License for more details. 14 # 15 # You should have received a copy of the GNU General Public License along 16 # with this program; if not, write to the Free Software Foundation, Inc., 17 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 18 # 19 # About Pleiades 20 # -------------- 21 # 22 # Pleiades is an international research network and associated web portal and 23 # content management system devoted to the study of ancient geography. 24 # 25 # See http://pleiades.stoa.org 26 # 27 # Funding for the creation of this software was provided by a grant from the 28 # U.S. National Endowment for the Humanities (http://www.neh.gov), and 29 # by the Institute for the Study of the Ancient World at New York University 30 # (http://www.nyu.edu/isaw) 31 # =========================================================================== 32 """ 33 python pipebiblio.py --directory=/path/to/directory-file --library=/path/to/bibliographic-library-file --destination=/path/to/destination/directory/ 34 35 This script extracts bibliographic citations from an HTML version of a Barrington 36 Atlas Map-by-Map Directory and reformats them into MODS XML. 
37 """ 38 39 from os.path import abspath, normcase, isdir, isfile, splitdrive, splitext, split, join, exists 2 40 import os 3 41 import logging 42 import sys 43 import getopt 4 44 5 45 import lxml.etree as etree … … 13 53 14 54 class Pipe: 55 """ A processing pipeline for bibliographic extraction and munging.""" 15 56 16 def __init__(self , dirfile, biblibfile, bibdestdir):57 def __init__(self): 17 58 logging.basicConfig(level=logging.DEBUG, format='%(levelname)-8s %(message)s') 18 data = {} 19 f = open(normpath(normcase(dirfile))) 20 wordhtml = f.read() 59 self.data = {} 60 self.data['contextpath'] = os.getcwd() 61 62 def cycle(self, dirfile, biblibfile, bibdestdir): 63 self.loaddirfile(dirfile) 64 self.fixdir() 65 self.extractbiblio() 66 self.savebiblio(biblibfile, bibdestdir) 67 68 def loaddirfile(self, dirfile): 69 f = open(dirfile) 70 self.data['wordhtml'] = f.read() 21 71 f.close() 22 wordxml = wordhtml2xml.convert(wordhtml)23 data['contextpath'] = os.getcwd()24 cleanxml = wordstripper.strip(data['contextpath'], wordxml)25 data['bibliography'] = biblioextractor.extract(cleanxml)26 72 drive, dirpath = splitdrive(dirfile) 27 73 filepath, filename = split(dirpath) 28 data['filenameroot'], extension = splitext(filename) 29 data['bibdestdir'] = normpath(normcase(bibdestdir)) 30 data['biblibfile'] = normpath(normcase(biblibfile)) 31 bibliosaver.save_biblio_mods(data) 74 self.data['filenameroot'], extension = splitext(filename) 32 75 76 def fixdir(self): 77 self.data['wordxml'] = wordhtml2xml.convert(self.data['wordhtml']) 78 self.data['cleanxml'] = wordstripper.strip(self.data['contextpath'], self.data['wordxml']) 79 80 def extractbiblio(self): 81 self.data['bibliography'] = biblioextractor.extract(self.data['cleanxml']) 82 83 def savebiblio(self,biblibfile,bibdestdir): 84 self.data['bibdestdir'] = bibdestdir 85 self.data['biblibfile'] = biblibfile 86 bibliosaver.save_biblio_mods(self.data) 87 88 def main(argv): 89 90 dirfile = None 91 bibdestdir = None 92 
biblibfile = None 93 94 try: 95 opts, args = getopt.getopt(argv, "s:d:l:h", ["directory=", "destination=", "library=", "help"]) 96 except getopt.GetoptError: 97 print __doc__ 98 sys.exit(2) 99 100 for opt, arg in opts: 101 if opt in ("-s", "--directory"): 102 dirfile = abspath(normcase(arg)) 103 elif opt in ("-d", "--destination"): 104 bibdestdir = abspath(normcase(arg)) 105 elif opt in ("-l", "--library"): 106 biblibfile = abspath(normcase(arg)) 107 elif opt in ("-h", "--help"): 108 print __doc__ 109 sys.exit(0) 110 111 if not dirfile: 112 print 'a source directory file must be specified with the -s (--directory) option' 113 sys.exit(0) 114 if not bibdestdir: 115 print 'a destination directory must be specified with the -d (--destination) option' 116 sys.exit(0) 117 if not biblibfile: 118 print 'a bibliographic library file must be specified with the -l (--library) option' 119 sys.exit(0) 120 121 if not isfile(dirfile): 122 print "the specified source directory file (%s) is not an existing file" % dirfile 123 sys.exit(0) 124 if not isdir(bibdestdir): 125 print "the specified destination directory (%s) is not an existing directory" % bibdestdir 126 sys.exit(0) 127 if not isfile(biblibfile): 128 print "the specified bibliographic library file (%s) is not an existing file" % biblibfile 129 sys.exit(0) 130 131 p = Pipe() 132 p.cycle(dirfile, biblibfile, bibdestdir) 133 134 if __name__ == "__main__": 135 main(sys.argv[1:]) 136
