root/BADataMunger/trunk/pipebiblio.py

Revision 1332, 5.0 kB (checked in by thomase, 6 months ago)

This script now generates IDs correctly for all feature types (except roads) in maps 22, 38 and 65, and saves them, along with a lot of useful associated information, to an XML file.

  • Property svn:eol-style set to native
Line 
1 # ===========================================================================
2 # Copyright (C) 2006-2008 Ancient World Mapping Center (UNC-CH) and the
3 # Institute for the Study of the Ancient World (NYU)
4 #
5 # This program is free software; you can redistribute it and/or modify
6 # it under the terms of the GNU General Public License as published by
7 # the Free Software Foundation; either version 2 of the License, or
8 # (at your option) any later version.
9 #
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 # GNU General Public License for more details.
14 #
15 # You should have received a copy of the GNU General Public License along
16 # with this program; if not, write to the Free Software Foundation, Inc.,
17 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
18 #
19 # About Pleiades
20 # --------------
21 #
22 # Pleiades is an international research network and associated web portal and
23 # content management system devoted to the study of ancient geography.
24 #
25 # See http://pleiades.stoa.org
26 #
27 # Funding for the creation of this software was provided by a grant from the
28 # U.S. National Endowment for the Humanities (http://www.neh.gov), and
29 # by the Institute for the Study of the Ancient World at New York University
30 # (http://www.nyu.edu/isaw)
31 # ===========================================================================
32 """
33 python pipebiblio.py --directory=/path/to/directory-file --library=/path/to/bibliographic-library-file --destination=/path/to/destination/directory/
34
35 This script extracts bibliographic citations from an HTML version of a Barrington
36 Atlas Map-by-Map Directory and reformats them into MODS XML.
37 """
38
39 from os.path import abspath, normcase, isdir, isfile, splitdrive, splitext, split, join, exists
40 import os
41 import logging
42 import sys
43 import getopt
44
45 import lxml.etree as etree
46
47 import wordhtml2xml
48 import wordstripper
49 import biblioextractor
50 import bibliosaver
51 from etreehelps import getalltext
52 from texthelps import normalizetext
53
class Pipe:
    """A processing pipeline for bibliographic extraction and munging.

    Every stage reads its input from, and writes its output to, the
    ``self.data`` dictionary, so the stages must run in the order used by
    ``cycle``.
    """

    def __init__(self):
        """Configure root logging and remember the current working directory."""
        logging.basicConfig(level=logging.DEBUG, format='%(levelname)-8s %(message)s')
        self.data = {}
        self.data['contextpath'] = os.getcwd()

    def cycle(self, dirfile, biblibfile, bibdestdir):
        """Run the full pipeline: load, clean up, extract, then save.

        dirfile    -- path of the HTML directory file to read
        biblibfile -- path of the bibliographic library file
        bibdestdir -- path of the directory that receives the output
        """
        self.loaddirfile(dirfile)
        self.fixdir()
        self.extractbiblio()
        self.savebiblio(biblibfile, bibdestdir)

    def loaddirfile(self, dirfile):
        """Read ``dirfile`` into ``data['wordhtml']`` and record its base name.

        ``data['filenameroot']`` receives the file name stripped of its
        drive, directory and extension.
        """
        # The context manager closes the handle even when read() raises.
        with open(dirfile) as f:
            self.data['wordhtml'] = f.read()
        dirpath = splitdrive(dirfile)[1]
        filename = split(dirpath)[1]
        self.data['filenameroot'] = splitext(filename)[0]

    def fixdir(self):
        """Convert the loaded HTML with wordhtml2xml, then strip it.

        The converted markup is stored in ``data['wordxml']``; the result of
        ``wordstripper.strip`` (which is also given ``data['contextpath']``)
        is stored in ``data['cleanxml']``.
        """
        self.data['wordxml'] = wordhtml2xml.convert(self.data['wordhtml'])
        self.data['cleanxml'] = wordstripper.strip(self.data['contextpath'], self.data['wordxml'])

    def extractbiblio(self):
        """Store ``biblioextractor.extract(data['cleanxml'])`` in ``data['bibliography']``."""
        self.data['bibliography'] = biblioextractor.extract(self.data['cleanxml'])

    def savebiblio(self,biblibfile,bibdestdir):
        """Record both paths in ``data`` and hand ``data`` to bibliosaver."""
        self.data['bibdestdir'] = bibdestdir
        self.data['biblibfile'] = biblibfile
        bibliosaver.save_biblio_mods(self.data)
87        
def _fail(message, status):
    """Write ``message`` to stderr and exit the process with ``status``."""
    sys.stderr.write(message + "\n")
    sys.exit(status)


def main(argv):
    """Parse command-line options, validate them, and run the pipeline.

    argv -- the argument list without the program name (``sys.argv[1:]``)

    Exits with status 2 on a usage error (unknown option or a missing
    required option), with status 1 when a given path does not exist, and
    with status 0 after printing help for -h/--help.
    """
    dirfile = None
    bibdestdir = None
    biblibfile = None

    try:
        opts, args = getopt.getopt(argv, "s:d:l:h", ["directory=", "destination=", "library=", "help"])
    except getopt.GetoptError:
        print(__doc__)
        sys.exit(2)

    for opt, arg in opts:
        if opt in ("-s", "--directory"):
            dirfile = abspath(normcase(arg))
        elif opt in ("-d", "--destination"):
            bibdestdir = abspath(normcase(arg))
        elif opt in ("-l", "--library"):
            biblibfile = abspath(normcase(arg))
        elif opt in ("-h", "--help"):
            print(__doc__)
            sys.exit(0)

    # A missing required option is a usage error, so it gets the same
    # status (2) as an unrecognised option rather than reporting success.
    if not dirfile:
        _fail('a source directory file must be specified with the -s (--directory) option', 2)
    if not bibdestdir:
        _fail('a destination directory must be specified with the -d (--destination) option', 2)
    if not biblibfile:
        _fail('a bibliographic library file must be specified with the -l (--library) option', 2)

    # The options were well formed but point at something that is not there.
    if not isfile(dirfile):
        _fail("the specified source directory file (%s) is not an existing file" % dirfile, 1)
    if not isdir(bibdestdir):
        _fail("the specified destination directory (%s) is not an existing directory" % bibdestdir, 1)
    if not isfile(biblibfile):
        _fail("the specified bibliographic library file (%s) is not an existing file" % biblibfile, 1)

    p = Pipe()
    p.cycle(dirfile, biblibfile, bibdestdir)
133            
if __name__ == "__main__":
    # Pass everything after the script name to main().
    cli_args = sys.argv[1:]
    main(cli_args)
136    
Note: See TracBrowser for help on using the browser.