| 1 |
# =========================================================================== |
|---|
| 2 |
# Copyright (C) 2006-2008 Ancient World Mapping Center (UNC-CH) and the |
|---|
| 3 |
# Institute for the Study of the Ancient World (NYU) |
|---|
| 4 |
# |
|---|
| 5 |
# This program is free software; you can redistribute it and/or modify |
|---|
| 6 |
# it under the terms of the GNU General Public License as published by |
|---|
| 7 |
# the Free Software Foundation; either version 2 of the License, or |
|---|
| 8 |
# (at your option) any later version. |
|---|
| 9 |
# |
|---|
| 10 |
# This program is distributed in the hope that it will be useful, |
|---|
| 11 |
# but WITHOUT ANY WARRANTY; without even the implied warranty of |
|---|
| 12 |
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|---|
| 13 |
# GNU General Public License for more details. |
|---|
| 14 |
# |
|---|
| 15 |
# You should have received a copy of the GNU General Public License along |
|---|
| 16 |
# with this program; if not, write to the Free Software Foundation, Inc., |
|---|
| 17 |
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
|---|
| 18 |
# |
|---|
| 19 |
# About Pleiades |
|---|
| 20 |
# -------------- |
|---|
| 21 |
# |
|---|
| 22 |
# Pleiades is an international research network and associated web portal and |
|---|
| 23 |
# content management system devoted to the study of ancient geography. |
|---|
| 24 |
# |
|---|
| 25 |
# See http://pleiades.stoa.org |
|---|
| 26 |
# |
|---|
| 27 |
# Funding for the creation of this software was provided by a grant from the |
|---|
| 28 |
# U.S. National Endowment for the Humanities (http://www.neh.gov), and |
|---|
| 29 |
# by the Institute for the Study of the Ancient World at New York University |
|---|
| 30 |
# (http://www.nyu.edu/isaw) |
|---|
| 31 |
# =========================================================================== |
|---|
| 32 |
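"""
Usage:

    python pipebiblio.py --directory=/path/to/directory-file --library=/path/to/bibliographic-library-file --destination=/path/to/destination/directory/

This script extracts bibliographic citations from an HTML version of a Barrington
Atlas Map-by-Map Directory and reformats them into MODS XML.

Options (all three paths are required and must already exist):

    -s, --directory=FILE     the HTML directory file to read citations from
    -l, --library=FILE       the bibliographic library file
    -d, --destination=DIR    the directory that receives the output
    -h, --help               print this message and exit
"""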
|---|
| 38 |
|
|---|
| 39 |
from os.path import abspath, normcase, isdir, isfile, splitdrive, splitext, split, join, exists |
|---|
| 40 |
import os |
|---|
| 41 |
import logging |
|---|
| 42 |
import sys |
|---|
| 43 |
import getopt |
|---|
| 44 |
|
|---|
| 45 |
import lxml.etree as etree |
|---|
| 46 |
|
|---|
| 47 |
import wordhtml2xml |
|---|
| 48 |
import wordstripper |
|---|
| 49 |
import biblioextractor |
|---|
| 50 |
import bibliosaver |
|---|
| 51 |
from etreehelps import getalltext |
|---|
| 52 |
from texthelps import normalizetext |
|---|
| 53 |
|
|---|
| 54 |
class Pipe(object):
    """A processing pipeline for bibliographic extraction and munging.

    Every stage reads its input from, and stores its output in, the
    ``self.data`` dictionary.  Keys written by this class:

    * ``contextpath``  -- working directory at construction time
    * ``wordhtml``     -- raw text of the source directory file
    * ``filenameroot`` -- source file name without directory or extension
    * ``wordxml``      -- result of ``wordhtml2xml.convert``
    * ``cleanxml``     -- result of ``wordstripper.strip``
    * ``bibliography`` -- result of ``biblioextractor.extract``
    * ``bibdestdir``   -- destination directory handed to the saver
    * ``biblibfile``   -- bibliographic library file handed to the saver
    """

    def __init__(self):
        """Configure logging and start with an otherwise empty data store."""
        # NOTE: basicConfig only has an effect if the root logger has no
        # handlers yet, so constructing several Pipes is harmless.
        logging.basicConfig(level=logging.DEBUG, format='%(levelname)-8s %(message)s')
        self.data = {}
        self.data['contextpath'] = os.getcwd()

    def cycle(self, dirfile, biblibfile, bibdestdir):
        """Run the whole pipeline once: load, fix, extract, save.

        dirfile    -- path of the HTML directory file to read
        biblibfile -- path of the bibliographic library file
        bibdestdir -- path of the destination directory
        """
        self.loaddirfile(dirfile)
        self.fixdir()
        self.extractbiblio()
        self.savebiblio(biblibfile, bibdestdir)

    def loaddirfile(self, dirfile):
        """Read ``dirfile`` into ``data['wordhtml']`` and record its base name.

        ``data['filenameroot']`` receives the file name stripped of drive,
        directory and extension.  Errors from opening or reading the file
        propagate to the caller.
        """
        # The context manager closes the handle even when read() raises.
        with open(dirfile) as source:
            self.data['wordhtml'] = source.read()
        _, dirpath = splitdrive(dirfile)
        filename = split(dirpath)[1]
        self.data['filenameroot'] = splitext(filename)[0]

    def fixdir(self):
        """Convert the loaded HTML to XML, then strip it.

        Stores the output of ``wordhtml2xml.convert`` in ``data['wordxml']``
        and the output of ``wordstripper.strip`` (called with the context
        path and that XML) in ``data['cleanxml']``.
        """
        self.data['wordxml'] = wordhtml2xml.convert(self.data['wordhtml'])
        self.data['cleanxml'] = wordstripper.strip(self.data['contextpath'], self.data['wordxml'])

    def extractbiblio(self):
        """Store ``biblioextractor.extract(data['cleanxml'])`` in ``data['bibliography']``."""
        self.data['bibliography'] = biblioextractor.extract(self.data['cleanxml'])

    def savebiblio(self, biblibfile, bibdestdir):
        """Record the output locations and hand ``self.data`` to the saver.

        The whole data dictionary is passed to
        ``bibliosaver.save_biblio_mods``; which keys it consumes is defined
        in that module.
        """
        self.data['bibdestdir'] = bibdestdir
        self.data['biblibfile'] = biblibfile
        bibliosaver.save_biblio_mods(self.data)
|---|
| 87 |
|
|---|
| 88 |
def _exit_with_error(message, status):
    """Write ``message`` to standard error and exit with ``status``."""
    sys.stderr.write(message + "\n")
    sys.exit(status)


def main(argv):
    """Parse command-line options, validate them and run one Pipe cycle.

    argv -- the argument list without the program name (``sys.argv[1:]``).

    Exit statuses (raised as ``SystemExit``):
      0 -- help was requested with -h/--help
      1 -- a supplied path does not exist or has the wrong kind
      2 -- an unknown option was given or a required option is missing
    On success the function returns normally after the pipeline has run.
    """
    try:
        opts, _ = getopt.getopt(argv, "s:d:l:h", ["directory=", "destination=", "library=", "help"])
    except getopt.GetoptError:
        print(__doc__)
        sys.exit(2)

    dirfile = None
    bibdestdir = None
    biblibfile = None

    for opt, arg in opts:
        if opt in ("-s", "--directory"):
            dirfile = abspath(normcase(arg))
        elif opt in ("-d", "--destination"):
            bibdestdir = abspath(normcase(arg))
        elif opt in ("-l", "--library"):
            biblibfile = abspath(normcase(arg))
        elif opt in ("-h", "--help"):
            print(__doc__)
            sys.exit(0)

    # Missing required options are usage errors: same status as a getopt failure.
    if not dirfile:
        _exit_with_error('a source directory file must be specified with the -s (--directory) option', 2)
    if not bibdestdir:
        _exit_with_error('a destination directory must be specified with the -d (--destination) option', 2)
    if not biblibfile:
        _exit_with_error('a bibliographic library file must be specified with the -l (--library) option', 2)

    # Well-formed invocation, but the paths cannot be used: report failure
    # with a non-zero status so calling scripts can detect it.
    if not isfile(dirfile):
        _exit_with_error("the specified source directory file (%s) is not an existing file" % dirfile, 1)
    if not isdir(bibdestdir):
        _exit_with_error("the specified destination directory (%s) is not an existing directory" % bibdestdir, 1)
    if not isfile(biblibfile):
        _exit_with_error("the specified bibliographic library file (%s) is not an existing file" % biblibfile, 1)

    p = Pipe()
    p.cycle(dirfile, biblibfile, bibdestdir)
|---|
| 133 |
|
|---|
| 134 |
# Script entry point: pass everything after the program name to main().
if __name__ == "__main__":
    cli_arguments = sys.argv[1:]
    main(cli_arguments)
|---|
| 136 |
|
|---|