root/BADataMunger/trunk/pipeline.py

Revision 862, 6.3 kB (checked in by thomase, 1 year ago)

Handles the new use case that surfaced with Map 38: unlabeled point features on that map that have no corresponding directory entry.

  • Property svn:eol-style set to native
Line 
1 from os.path import normpath, normcase, isdir, isfile, splitdrive, splitext, split, join
2 import os
3 import logging
4
5 import lxml.etree as etree
6
7 import wordhtml2xml
8 import wordstripper
9 import biblioextractor
10 import bibliosaver
11 import tablegroker
12 import tableparser
13 import placesaver
14 import gismixer
15 from etreehelps import getalltext
16 from texthelps import normalizetext
17
18
class Pipe:
    """String together a series of transformations and operations to munge a
    BAtlas directory file into Pleiades-ready data.

    Use like:

        import pipeline
        p = pipeline.Pipe(r'/TomDocs/awmcwork/pleiadesact/svnbox/BADataMunger/config/BATL065_config.xml', r'/badigit/wordhtml/BATL065_.htm', r'/badigit/extraction/ba065points.xml', r'/badigit/bib/awmcbib-mods.xml', r'/badigit/xml4pleiades')
        p.cycle()

    Inputs and every intermediate product are kept in ``self.data`` and are
    read and written with item syntax (``pipe['wordxml']``).
    """

    def __init__(self, configfile, dirfile, gisfile, biblibfile, destdir):
        """Create a pipe object to manage the conversions and load the source
        files.

        Arguments:
            configfile -- path to the XML config file for the map
            dirfile    -- path to the Word-HTML directory file
            gisfile    -- path to the GIS point-extraction file
            biblibfile -- path to the bibliography library file (recorded
                          only; its existence is not checked here)
            destdir    -- existing output directory; it must already contain
                          a 'mods' subdirectory

        Raises:
            IOError -- if the config, directory or GIS file is missing, or if
                       destdir or destdir/mods is not a directory
        """
        logging.basicConfig(level=logging.DEBUG, format='%(levelname)-8s %(message)s')
        logging.info("INITIALIZING: %s", self.__class__)

        self.data = {}
        self['contextpath'] = os.getcwd()
        logging.info("contextpath: %s", self['contextpath'])

        self['configfile'] = normpath(normcase(configfile))
        self['dirfile'] = normpath(normcase(dirfile))
        self['gisfile'] = normpath(normcase(gisfile))
        self['biblibfile'] = normpath(normcase(biblibfile))
        self['destdir'] = normpath(normcase(destdir))

        # Validate inputs up front; each message names the path that is
        # actually missing.
        for key in ('configfile', 'dirfile', 'gisfile'):
            if not isfile(self[key]):
                raise IOError("No file found at path: %s" % self[key])
        if not isdir(self['destdir']):
            raise IOError("No directory found at path: %s" % self['destdir'])

        # save() and save_bibliography() write next to the directory file, so
        # remember its directory (drive included) as well as its bare name.
        self['filepath'], filename = split(self['dirfile'])
        self['filenameroot'] = splitext(filename)[0]

        self['bibdestdir'] = join(self['destdir'], 'mods')
        if not isdir(self['bibdestdir']):
            raise IOError("No directory found at path: %s" % self['bibdestdir'])

        # read and store the directory file
        self['wordhtml'] = self._read_file(self['dirfile'])
        logging.info("read and stored directory file from %s", self['dirfile'])

        # read and store the config file
        config_text = self._read_file(self['configfile'])
        logging.info("read and stored config file from %s", self['configfile'])

        self._load_config(config_text)

        self.mixer = gismixer.gisMixer(self, self['gisfile'])

        logging.info("DONE INITIALIZING: %s\n", self.__class__)

    def _read_file(self, path):
        """Return the full content of the file at path, always closing it."""
        # try/finally rather than 'with': nothing else in this module needs
        # syntax newer than Python 2.4.
        f = open(path)
        try:
            return f.read()
        finally:
            f.close()

    def _load_config(self, config_text):
        """Parse the config XML and pull out map number, credits and rights."""
        self['config'] = etree.XML(config_text)

        # get some essential info from the config file
        try:
            self.map_number = self['config'].xpath("//map_number")[0].text
        except IndexError:
            # no <map_number> element: fall back to a placeholder
            self.map_number = 'XYZ'
        logging.info("map number from config file: %s", self.map_number)

        self.creators = []
        for c in self['config'].xpath("//creator"):
            self.creators.append(normalizetext(getalltext(c)))

        self.contributors = []
        for c in self['config'].xpath("//contributor"):
            self.contributors.append(normalizetext(getalltext(c)))

        self.rights = ''
        try:
            self.rights = self['config'].xpath("//rights")[0].text
        except IndexError:
            # <rights> is optional; keep the empty default
            pass

    def __getitem__(self, key):
        """Return the stored pipeline item for key (KeyError if absent)."""
        return self.data[key]

    def __setitem__(self, key, item):
        """Store item in the pipeline under key."""
        self.data[key] = item

    def cycle(self):
        """Cycle through all steps in the pipeline"""

        logging.info("CYCLING: %s", self.__class__)

        # Each stage consumes what the previous one stored, so order matters.
        self['wordxml'] = wordhtml2xml.convert(self['wordhtml'])
        self['cleanxml'] = wordstripper.strip(self['contextpath'], self['wordxml'])
        self['bibliography'] = biblioextractor.extract(self['cleanxml'])
        bibliosaver.save_biblio_mods(self)
        self['dirtables'] = tablegroker.grok(self['cleanxml'])
        self['places'] = tableparser.parse(self, self['dirtables'], self.map_number)
        self.mixer.mixall(self['config'], self['places'])
        placesaver.save_places_frank(self)

        logging.info("DONE CYCLING: %s\n", self.__class__)

    def _write_bytes(self, destfile, pcontent):
        """Write already-encoded content to destfile beside the directory
        file."""
        # Binary mode: the content is encoded bytes and must not go through
        # newline translation.
        g = open(join(self['filepath'], destfile), 'wb')
        try:
            g.write(pcontent)
        finally:
            g.close()

    def save(self, itemkey, encoding='utf-8'):
        """Basic save-with-encoding function for writing content to an arbitrary
        file.

        The item stored under itemkey is written to
        '<filenameroot>-<itemkey>.xml' in the directory of the directory file.
        The 'bibliography' item is delegated to save_bibliography().
        """
        if itemkey == 'bibliography':
            self.save_bibliography(encoding)
            return

        content = self[itemkey]
        try:
            pcontent = content.encode(encoding)
        except AttributeError:
            # Not a string: fall back to serializing it with
            # etree.tostring().
            pcontent = etree.tostring(content).encode(encoding)
        self._write_bytes("%s-%s.xml" % (self['filenameroot'], itemkey), pcontent)

    def save_bibliography(self, encoding='utf-8'):
        """Save individual bibliography files.

        Each entry of self['bibliography'] is indexed as [0], [1], [2]; it is
        wrapped in a <div> with <head> and <subhead> children and written to
        '<filenameroot>-biblio-<index>.xml'.
        """
        bibs = self['bibliography']
        for i, bib in enumerate(bibs):
            div = etree.Element("div")
            head = etree.Element("head")
            head.text = bib[0]
            div.append(head)
            subhead = etree.Element("subhead")
            subhead.text = bib[1]
            div.append(subhead)
            div.append(bib[2])
            pcontent = etree.tostring(div).encode(encoding)
            self._write_bytes("%s-biblio-%s.xml" % (self['filenameroot'], i), pcontent)
163            
164            
Note: See TracBrowser for help on using the browser.