Changeset 840
- Timestamp:
- 06/14/07 16:41:57 (2 years ago)
- Files:
-
- BADataMunger/trunk/bibliosaver.py (modified) (1 diff)
- BADataMunger/trunk/gismixer.py (modified) (4 diffs)
- BADataMunger/trunk/pipeline.py (modified) (4 diffs)
- BADataMunger/trunk/placesaver.py (modified) (8 diffs)
- BADataMunger/trunk/tableparser.py (modified) (3 diffs)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
BADataMunger/trunk/bibliosaver.py
r823 r840
 29  29   pcontent = etree.tostring(cleantree).encode('utf-8')
 30  30   destfile = "%s-biblio-mods.xml" % self['filenameroot']
 31     - g = open(join(self['filepath'], destfile),'w')
     31 + g = open(join(self['bibdestdir'], destfile),'w')
 32  32   g.write(pcontent)
 33  33   g.close()
BADataMunger/trunk/gismixer.py
r826 r840
 23  23   self.labeli = self.fieldnames.index('Label')
 24  24   self.disambiguatori = self.fieldnames.index('disambiguator')
     25 + self.shapei = self.fieldnames.index('SHAPE')
     26 + self.typei = self.fieldnames.index('Type')
     27 + self.approxi = self.fieldnames.index('Approximate')
 25  28
 26  29   # get the data and store it for later use
  …   …
 40  43
 41  44   def mixin(self, config, place):
 42     - label = place.namestring.replace("/ ", "/")
 43     - diamondi = label.find(u'\xa7')
 44     - if diamondi > 0:
 45     - label = label[:diamondi-1]
 46     - label = label.strip()
     45 +
     46 +
     47 + label = self.clean_label(place.namestring)
 47  48   grid = place.grid
 48  49   try:
  …   …
 53  54   r = self.get_record(label, grid)
 54  55   if r:
 55     - pass
     56 +
     57 + place.matched = True
     58 + # mix the data
     59 +
     60 + place.shape = r[self.shapei]
     61 + place.approximate = r[self.approxi]
     62 + rawtype = r[self.typei].lower()
     63 + if rawtype == 'road station':
     64 + place.type = 'station'
     65 + elif rawtype == 'water wheel':
     66 + place.type = 'wheel'
     67 + elif rawtype == 'mountain pass':
     68 + place.type = 'pass'
     69 + elif rawtype == 'inland water':
     70 + place.type = 'water-inland'
     71 + elif rawtype == 'open water':
     72 + place.type = 'water-open'
     73 + elif rawtype == 'salt marsh':
     74 + place.type = 'salt-marsh'
     75 + else:
     76 + place.type = rawtype
     77 +
 56  78   else:
 57  79   print "mixin failed to match %s: '%s' (%s - %s), rowi=%s" % (place.type, label.encode('ascii', 'backslashreplace'), place.locdesc.encode('ascii', 'backslashreplace'), place.itinraw.encode('ascii', 'backslashreplace'), place.rowi)
  …   …
 93 115   return None
 94 116
 95     -
    117 + def clean_label(self, raw_label):
    118 + clabel = raw_label.replace("/ ", "/")
    119 + diamondi = clabel.find(u'\xa7')
    120 + if diamondi > 0:
    121 + clabel = clabel[:diamondi-1]
    122 + clabel = clabel.replace(", Mon.", "")
    123 + clabel = clabel.replace(", T.", "")
    124 + clabel = clabel.replace(u"\u2018", u"'")
    125 + clabel = clabel.replace(u"\u2019", u"'")
    126 + #print "'%s'" % clabel.encode('ascii', 'backslashreplace')
    127 + clabel = clabel.strip()
    128 + return clabel
 96 129
 97 130
BADataMunger/trunk/pipeline.py
r826 r840
  1     - from os.path import normpath, normcase, isfile, splitdrive, splitext, split, join
      1 + from os.path import normpath, normcase, isdir, isfile, splitdrive, splitext, split, join
      2 + import os
  2   3
  3   4   import lxml.etree as etree
  …   …
 22  23
 23  24   import pipeline
 24     - p = pipeline.Pipe(r'C:\TomDocs\awmcwork\pleiadesact\svnbox\BADataMunger\wordhtml\BATL065_.htm', r'C:\TomDocs\awmcwork\pleiadesact\svnbox\BADataMunger\wordhtml\BATL065_config.xml', r'C:\TomDocs\awmcwork\pleiadesact\svnbox\system\data\geoentities\arcgis\ba065points.xml')
     25 + p = pipeline.Pipe(r'/TomDocs/awmcwork/pleiadesact/svnbox/BADataMunger/config/BATL065_config.xml', r'/badigit/wordhtml/BATL065_.htm', r'/badigit/extraction/ba065points.xml', r'/badigit/xml4pleiades')
 25  26   p.cycle()
 26  27
  …   …
 28  29   """
 29  30
 30     - def __init__(self, source, source_config=None, source_gis=None):
     31 + def __init__(self, configfile, dirfile, gisfile, destdir):
     32 + # source, source_config=None, source_gis=None):
 31  33   """Create a pipe object to manage the conversions and open the source
 32  34   file
 33  35   """
 34  36
     37 +
     38 + self.data = {}
     39 + self['contextpath'] = os.getcwd()
     40 + print "contextpath: %s" % self['contextpath']
 35  41
 36     - self.data = {}
 37     - self['path'] = normpath(normcase(source))
 38     - if not isfile(self['path']):
 39     - raise Error, "No file found at path: %s" % self['path']
     42 + self['configfile'] = normpath(normcase(configfile))
     43 + self['dirfile'] = normpath(normcase(dirfile))
     44 + self['gisfile'] = normpath(normcase(gisfile))
     45 + self['destdir'] = normpath(normcase(destdir))
     46 +
     47 + if not isfile(self['configfile']):
     48 + raise Error, "No file found at path: %s" % self['configfile']
     49 + elif not isfile(self['dirfile']):
     50 + raise Error, "No file found at path: %s" % self['dirfile']
     51 + elif not isfile(self['gisfile']):
     52 + raise Error, "No file found at path: %s" % self['dirfile']
     53 + elif not isdir(self['destdir']):
     54 + raise Error, "No directory found at path: %s" % self['destdir']
 40  55   else:
 41     - self['drive'], dirpath = splitdrive(self['path'])
 42     - self['filepath'], self['filename'] = split(dirpath)
 43     - self['contextpath'], filedir = split(self['filepath'])
 44     - self['filenameroot'], extension = splitext(self['filename'])
 45     - f = open(self['path'])
 46     - self['wordhtml'] = f.read()
 47     - f.close()
 48     - if source_config:
 49     - self['config_path'] = normpath(normcase(source_config))
 50     - if not isfile(self['config_path']):
 51     - raise Error, "No file found at path: %s" % self['config_path']
     56 +
     57 + drive, dirpath = splitdrive(self['dirfile'])
     58 + filepath, filename = split(dirpath)
     59 + self['filenameroot'], extension = splitext(filename)
     60 + self['bibdestdir'] = join(self['destdir'], 'mods')
     61 +
     62 + if not isdir(self['bibdestdir']):
     63 + raise Error, "No directory found at path: %s" % self['bibdestdir']
 52  64   else:
 53     - f = open(self['config_path'])
     65 +
     66 + # read and store the directory file
     67 + f = open(self['dirfile'])
     68 + self['wordhtml'] = f.read()
     69 + f.close()
     70 +
     71 + # read and store the config file
     72 + f = open(self['configfile'])
 54  73   config_text = f.read()
 55  74   f.close()
     75 +
     76 + # get some essential info from the config file
 56  77   self['config'] = etree.XML(config_text)
 57     - try:
 58     - self.map_number = self['config'].xpath("//map_number")[0].text
 59     - except:
 60     - self.map_number = 'XYZ'
 61     - self.creators = []
 62     - self.contributors = []
 63     - self.rights = ''
 64     - for c in self['config'].xpath("//creator"):
 65     - self.creators.append(normalizetext(getalltext(c)))
 66     - for c in self['config'].xpath("//contributor"):
 67     - self.contributors.append(normalizetext(getalltext(c)))
 68     - try:
 69     - self.rights = self['config'].xpath("//rights")[0].text
 70     - except:
 71     - pass
 72     -
 73     - if source_gis:
 74     - self.mixer = gismixer.gisMixer(normpath(normcase(source_gis)))
     78 + try:
     79 + self.map_number = self['config'].xpath("//map_number")[0].text
     80 + except:
     81 + self.map_number = 'XYZ'
     82 + self.creators = []
     83 + self.contributors = []
     84 + self.rights = ''
     85 + for c in self['config'].xpath("//creator"):
     86 + self.creators.append(normalizetext(getalltext(c)))
     87 + for c in self['config'].xpath("//contributor"):
     88 + self.contributors.append(normalizetext(getalltext(c)))
     89 + try:
     90 + self.rights = self['config'].xpath("//rights")[0].text
     91 + except:
     92 + pass
     93 +
     94 + self.mixer = gismixer.gisMixer(self['gisfile'])
 75  95
 76  96
  …   …
 92 112   self.mixer.mixin(self['config'], place)
 93 113
 94     - placesaver.save_places_tei(self)
    114 + #placesaver.save_places_tei(self)
 95 115   placesaver.save_places_frank(self)
 96 116
BADataMunger/trunk/placesaver.py
r837 r840
 26  26
 27  27   NSCLEANUPXSLT = 'nscleanup.xsl'
     28 + DESCXSLT = 'calc_Description.xsl'
 28  29
 29  30   def save_places_frank(self):
  …   …
 32  33   # iterate through the places list, creating a corresponding frankenfile for each place
 33  34   places = self['places']
 34     - for i, place in enumerate(places):
     35 + for i, place in enumerate([place for place in places if place.matched]):
     36 +
 35  37
 36  38   ge = etree.Element("{%s}geoEntity" % AWMC)
  …   …
 58  60   tag_tp.append(tag_tpn)
 59  61   ge.append(tag_tp)
     62 +
     63 + # classification for the place
     64 + tag_cs = etree.Element("{%s}classificationSection" % ADLGAZ)
     65 + tag_ct = etree.Element("{%s}classificationTerm" % ADLGAZ)
     66 + tag_ct.text = place.type.lower()
     67 + tag_cs.append(tag_ct)
     68 + tag_css = etree.Element("{%s}classificationScheme" % ADLGAZ)
     69 + tag_csn = etree.Element("{%s}schemeName" % ADLGAZ)
     70 + tag_csn.text = "geoEntityType"
     71 + tag_css.append(tag_csn)
     72 + tag_cs.append(tag_css)
     73 +
     74 + #tag_nassoc = etree.Element("{%s}nameAssociation" % AWMC)
     75 + #tag_nassoc.attrib['ref'] = pn.certainty
     76 + #tag_cs.append(tag_nassoc)
     77 + ge.append(tag_cs)
     78 +
 60  79
 61  80   # secondaryReferences
  …   …
 75  94   ge.append(tag_refs)
 76  95
     96 + # spatial location - this only works for points!
     97 + coords = []
     98 + try:
     99 + coords= place.shape.split()
    100 + except:
    101 + pass
    102 + if len(coords) > 0:
    103 + coordvals = []
    104 + for coord in coords:
    105 + coordvals.append(round(eval(coord),4))
    106 +
    107 + tag_sl = etree.Element("{%s}spatialLocation" % ADLGAZ)
    108 + tag_pt = etree.Element("{%s}point" % GEORSS)
    109 + try:
    110 + # nb: georss is y x
    111 + tag_pt.text = "%s %s" % (coordvals[1], coordvals[0])
    112 + except:
    113 + print coordvals
    114 + tag_sl.append(tag_pt)
    115 + ge.append(tag_sl)
    116 +
    117 +
    118 +
    119 +
 77 120   # featureNames
 78 121   for pn in place.placenames:
  …   …
 86 129
 87 130   # classificationSection
    131 + nametype = "geographic"
    132 + if place.type == 'people': \
    133 + nametype = 'ethnic'
 88 134   tag_cs = etree.Element("{%s}classificationSection" % ADLGAZ)
 89 135   tag_ct = etree.Element("{%s}classificationTerm" % ADLGAZ)
 90     - tag_ct.text = "unclassified"
    136 + tag_ct.text = nametype
 91 137   tag_cs.append(tag_ct)
 92 138   tag_css = etree.Element("{%s}classificationScheme" % ADLGAZ)
  …   …
 97 143   #if pn.certainty != "certain":
 98 144   tag_nassoc = etree.Element("{%s}nameAssociation" % AWMC)
 99     - tag_nassoc.text= pn.certainty
    145 + tag_nassoc.attrib['ref'] = pn.certainty
100 146   tag_cs.append(tag_nassoc)
101 147   tag_fn.append(tag_cs)
  …   …
134 180   #print ge
135 181   cleantree = do_nscleanup(self['contextpath'], ge)
136     - #cleantree = ge
    182 +
    183 + # calculate a description
    184 + cleantree = do_description(self['contextpath'], cleantree)
137 185
138 186   # encode the content as utf8 and save to file
  …   …
273 321
274 322   def do_nscleanup(contextpath, source):
275     - xslt_doc = etree.parse(join(contextpath, NSCLEANUPXSLT))
    323 + xslpath = join(contextpath, NSCLEANUPXSLT)
    324 + result = do_transform(xslpath, source)
    325 + return result
    326 +
    327 + def do_description(contextpath, source):
    328 + xslpath = join(contextpath, DESCXSLT)
    329 + result = do_transform(xslpath, source)
    330 + return result
    331 +
    332 + def do_transform(xslpath, source):
    333 + xslt_doc = etree.parse(xslpath)
276 334   transform = etree.XSLT(xslt_doc)
277 335   tsource = transform(source)
BADataMunger/trunk/tableparser.py
r826 r840
 72  72   configs = pparser_config = context['config'].xpath('//%s' % tabletype)
 73  73   if len(configs) == 1:
 74     - print "SPECIAL"
 75     - print etree.tostring(configs[0])
     74 + #print "SPECIAL"
     75 + #print etree.tostring(configs[0])
 76  76   p.parse_special(rowcells, configs[0])
 77  77   elif len(configs) == 0:
  …   …
138 138   self.references=[]
139 139   self.type=''
    140 + self.matched = False
140 141
141 142
  …   …
176 177   # print "SPECIAL PARSING: %s" % etree.tostring(config)
177 178   directives = config.xpath("./*")
178     - print "%s special directives:" % len(directives)
    179 + #print "%s special directives:" % len(directives)
179 180   for d in directives:
180 181   self.parse_special_directive(d, cells)
