root/BADataMunger/trunk/dir2xml.py

Revision 1444, 16.4 kB (checked in by thomase, 2 months ago)

code and changes for converting raw directory html into well-formed xhtml (cleaned up) and a semantic xml that combines elements of our old frankenformat and the more recent batlas id schema

Line 
1 """
2 dir2xml.py
3
4 get all the batlas dirs from word-export html to xml once and for all
5 """
6
7 import os
8 import re
9 import logging
10 import datetime as dt
11
12 import lxml.etree as etree
13
14 import placesaver
15 import batlaspipe as bp
16
17 from placesaver import periods, AWMC, ADLGAZ, DC, XML, TEI, refmagic, do_nscleanup
18 from bidmaker import DIAMOND_STOP_REGEX
19 from batlaspipe import SLASH_REGEX, NAMESPACE, XMLDECL, DASHNUM_END_REGEX, PREPARER
20
# Namespace URI used (in '{uri}tag' form) for the elements this module emits.
PLEIADES = 'http://atlantides.org/batlas/'

# Matches config file names: any eight characters (captured as the map key)
# followed by "config.xml".  Written as a raw string so the backslash reaches
# the regex engine untouched instead of relying on an unknown string escape.
BAD_REGEX = re.compile(r'(........)config\.xml')


# Text file read by xmldump(); its content is %-formatted with four values
# and written near the top of every output file.
COMMENTFILE = r'./etc/dirxmlcomment.txt'

# Whitespace-stripped citation strings already written by writecit().
priorcitations = []
29
def htmldump(map, pipe, destdir):
    """Write the cleaned-up tree held by a pipe to ``<destdir>/<map>.html``.

    map     -- map key; becomes the base name of the output file
    pipe    -- object whose 'cleanxml' item is the tree to serialize
    destdir -- directory that receives the file
    """
    # The tree lives on the ``pipe`` argument; the previous body looked it up
    # on an undefined name ``p`` and so could never run.
    tree = pipe['cleanxml']
    pcontent = etree.tostring(tree).encode('utf-8')
    dest = os.path.join(destdir, ''.join((map, '.html')))
    g = open(dest, 'w')
    try:
        g.write(pcontent)
    finally:
        # release the handle even when the write fails
        g.close()
38
39
def htmlconvert(configfile, sourcefile, destdir):
    """Create a batlaspipe Pipe from the three arguments, call its cycle()
    method, and return the pipe.
    """
    pipe = bp.Pipe(configfile, sourcefile, destdir)
    pipe.cycle()
    return pipe
44
45
def htmlconvertall(configdir, sourcedir, destdir):
    """Run htmlconvert() and htmldump() for every map configured in configdir.

    configdir -- directory scanned for files matching BAD_REGEX
                 (``<mapkey>config.xml``)
    sourcedir -- directory holding the matching ``<mapkey>.htm`` sources
    destdir   -- directory handed to htmlconvert() and htmldump()
    """
    # Collect the map keys first, then process them, as before.
    mapkeys = []
    for config in os.listdir(configdir):
        m = BAD_REGEX.match(config)
        if m:
            mapkeys.append(m.group(1))

    # ``mapkey`` rather than ``map`` so the builtin is not shadowed.
    for mapkey in mapkeys:
        config = os.path.join(configdir, ''.join((mapkey, 'config', '.xml')))
        source = os.path.join(sourcedir, ''.join((mapkey, '.htm')))
        p = htmlconvert(config, source, destdir)
        # htmldump() takes (map, pipe, destdir); the map key used to be
        # left out of this call, which raised TypeError.
        htmldump(mapkey, p, destdir)
61        
def xmlconvert(configfile, sourcefile, destdir):
    """Run htmlconvert() on the arguments, then call idit() on the resulting
    pipe and return it.
    """
    pipe = htmlconvert(configfile, sourcefile, destdir)
    pipe.idit()
    return pipe
67    
68    
def xmlconvertall(configdir, sourcedir, destdir):
    """Run xmlconvert() and xmldump() for every map configured in configdir.

    A file in configdir counts as a map config when BAD_REGEX matches its
    name; the captured group is the map key used to build both the config
    path and the ``<key>.htm`` source path.
    """
    matches = [BAD_REGEX.match(name) for name in os.listdir(configdir)]
    mapkeys = [found.group(1) for found in matches if found]

    for key in mapkeys:
        configpath = os.path.join(configdir, ''.join((key, 'config', '.xml')))
        sourcepath = os.path.join(sourcedir, ''.join((key, '.htm')))
        converted = xmlconvert(configpath, sourcepath, destdir)
        xmldump(converted, destdir)
        # progress marker, one per map
        print('wahoo')
85        
86
87    
def encodeids(place, parent):
    """Add one baid element per id in place.batlasids and flag the primary.

    The first id is always written; later ids are written only when they
    differ from the first.  The first baid child of parent is then given
    primary="yes".  Nothing happens when the place has no ids.
    """
    ids = place.batlasids
    if len(ids) == 0:
        return

    for position, bid in enumerate(ids):
        # a later repeat of the first id is not written again
        if position != 0 and bid == ids[0]:
            continue
        etree.SubElement(parent, '{%s}baid' % PLEIADES, id=bid)

    first_baid = parent.xpath("*[local-name() = 'baid']")[0]
    first_baid.attrib['primary'] = 'yes'
94        
def encodetypes(place, parent):
    """Add type, featurecount and subtype children describing the place.

    The type text is a fixed phrase for the four known dirtype values and
    the raw dirtype otherwise.  A featurecount child is added when 'group'
    occurs in the dirtype.  Each entry of place.types yields a subtype
    child unless it equals the entry immediately before it.
    """
    phrases = {
        'name': 'labeled feature',
        'numbered': 'numbered feature',
        'unlocated': 'unlocated toponym',
        'false': 'false toponym',
    }
    dirtype = place.dirtype
    type_el = etree.SubElement(parent, '{%s}type' % PLEIADES)
    type_el.text = phrases.get(dirtype, dirtype)

    if 'group' in dirtype:
        count_el = etree.SubElement(parent, 'featurecount')
        count_el.text = "%s" % place.featurecount

    # additional feature types
    subtypes = place.types
    for position, subtype in enumerate(subtypes):
        if position != 0 and subtype == subtypes[position - 1]:
            # same as the previous entry: already represented
            continue
        etree.SubElement(parent, 'subtype').text = subtype
121
122
def encodegrid(place, parent):
    """Add a gridsquare child holding place.grid, unless the grid is empty."""
    grid = place.grid
    if len(grid) == 0:
        return
    etree.SubElement(parent, '{%s}gridsquare' % PLEIADES).text = grid
128        
129
def encodemaplabels(place, parent):
    """Add a label child (context="map") built from place.namestring.

    Skipped when the name string is empty or the dirtype is 'unlocated' or
    'false'.  The label text is the name string with SLASH_REGEX matches
    replaced by '/', DIAMOND_STOP_REGEX matches removed, and surrounding
    whitespace stripped.
    """
    if len(place.namestring) == 0:
        return
    if place.dirtype in ['unlocated', 'false']:
        return

    label = etree.SubElement(parent, '{%s}label' % PLEIADES, context='map')
    slashed = SLASH_REGEX.sub('/', place.namestring)
    label.text = DIAMOND_STOP_REGEX.sub('', slashed).strip()
138
139
def encodecitations(place, parent, mapnum):
    """Write the main citation for a place and return the name it cites.

    The citation wording depends on place.dirtype and on which of
    namestring, locdesc and placenames are populated.  The citation is
    written through writecit() with the place's first batlas id, or with
    an empty id when the place has none.

    Returns the cited name (citname).
    """
    dirtype = place.dirtype
    citcontent = u''

    if dirtype == 'unlocated':
        citname = SLASH_REGEX.sub('/', place.namestring.strip())
        citcontent = "BAtlas %s unlocated %s" % (mapnum, citname)
    elif dirtype == 'false':
        citname = SLASH_REGEX.sub('/', place.namestring.strip())
        citcontent = "BAtlas %s false name %s" % (mapnum, citname)
    elif dirtype == 'numbered':
        # a numbered feature is cited by its first name when it has one,
        # otherwise by its location description
        if len(place.placenames) == 0:
            rawname = place.locdesc
        else:
            rawname = place.placenames[0].name
        citname = SLASH_REGEX.sub('/', rawname.strip())
        citcontent = "BAtlas %s %s no. %s (%s)" % (mapnum, place.grid, place.namestring, citname)
    elif len(place.namestring) == 0:
        citname = SLASH_REGEX.sub('/', place.locdesc.strip())
        if dirtype == 'name':
            citcontent = "BAtlas %s %s %s" % (mapnum, place.grid, citname)
        else:
            citcontent = "BAtlas %s %s unnamed %s (%s)" % (mapnum, place.grid, dirtype.replace('-', ' '), citname)
    else:
        cleaned = SLASH_REGEX.sub('/', place.namestring)
        cleaned = DIAMOND_STOP_REGEX.sub('', cleaned)
        citname = cleaned.strip()
        citcontent = "BAtlas %s %s %s" % (mapnum, place.grid, citname)

    if len(citcontent) > 0:
        primeid = ''
        if len(place.batlasids) > 0:
            primeid = place.batlasids[0]
        writecit(parent, citcontent, primeid)
    return citname
172
def encodeplacenames(place, parent, mapnum, citname):
    """Add a geogname child for each name of the place, plus extra citations.

    place   -- object providing placenames, dirtype, grid, types, batlasids
    parent  -- element that receives the geogname (and citation) children
    mapnum  -- map number used in the citation text
    citname -- name already cited by encodecitations(); a name whose text
               equals it is not cited a second time

    Name qualities (variant / minor alternate, completeness, accuracy,
    inferred, certainty) are written as attributes only when they differ
    from the default value.  When the place has more than one name, every
    name other than citname also gets a citation, written through
    writecit() and decorated to reflect those qualities.
    """
    # Use an empty id when the place has none, as encodecitations() and
    # encodeitineraries() do, instead of failing with IndexError.
    if len(place.batlasids) > 0:
        primeid = place.batlasids[0]
    else:
        primeid = ''
    several = len(place.placenames) > 1

    for n in place.placenames:
        q = etree.SubElement(parent, '{%s}geogname' % PLEIADES)
        q.text = n.name.strip()
        if n.variant:
            q.attrib['type'] = 'variant'
        elif n.minorAlternative:
            q.attrib['type'] = 'minor-alternate'
        if n.completeness != 'complete':
            q.attrib['completeness'] = n.completeness
        if n.accuracy != 'accurate':
            q.attrib['accuracy'] = n.accuracy
        if n.inferred:
            q.attrib['inferred'] = 'yes'
        if n.certainty != 'certain':
            q.attrib['certainty'] = n.certainty

        if not several or citname == q.text:
            # single-name places and the already-cited name get no
            # additional citation
            continue

        # decorate the name: * reconstructed, quotes inaccurate,
        # [] inferred, ? uncertain, section sign minor alternative
        txt = q.text
        if n.completeness == 'reconstructable':
            txt = u"*%s" % txt
        if n.accuracy == 'inaccurate':
            txt = u"\u2018%s\u2019" % txt
        if n.inferred == True:
            txt = u"[%s]" % txt
        if n.certainty != 'certain':
            txt = u"%s?" % txt
        if n.minorAlternative:
            txt = u"\u00A7%s" % txt

        if place.dirtype == 'unlocated':
            txt = "BAtlas %s unlocated %s" % (mapnum, txt)
        elif place.dirtype == 'false':
            txt = "BAtlas %s false %s" % (mapnum, txt)
        else:
            txt = "BAtlas %s %s %s" % (mapnum, place.grid, txt)

        # txt is never empty here: every branch above adds the BAtlas prefix
        if 'island-group' in place.types:
            txt = "%s Inss." % txt
        elif 'island' in place.types:
            txt = "%s Ins." % txt
        elif 'river' in place.types:
            txt = "%s fl." % txt
        writecit(parent, txt, primeid)
219                
def encodelocdesc(place, parent):
    """Add a location child holding the place's location description.

    The text is place.locdesc stripped of surrounding whitespace with
    SLASH_REGEX matches replaced by '/'.  Nothing is added when the
    description is empty.
    """
    if len(place.locdesc) == 0:
        return
    loc = etree.SubElement(parent, 'location')
    loc.text = SLASH_REGEX.sub('/', place.locdesc.strip())
225
226
def encodenotes(place, parent):
    """Add a note child holding place.note, unless the note is empty."""
    note = place.note
    if len(note) == 0:
        return
    etree.SubElement(parent, 'note').text = note
231
def encodeitineraries(place, parent, mapnum):
    """Add an itinerary child and a matching citation for the place.

    place  -- object providing itinraw, dirtype and batlasids
    parent -- element that receives the itinerary and citation children
    mapnum -- map number used in the citation text

    Nothing is written when place.itinraw is empty.  The citation goes
    through writecit() with the place's first batlas id, or an empty id
    when the place has none.
    """
    if len(place.itinraw) == 0:
        return

    itinerary = place.itinraw.strip()
    q = etree.SubElement(parent, 'itinerary')
    q.text = itinerary

    # "BAtlas", spelled as in every other citation this module writes
    # (this one used to read "BATlas").
    citcontent = "BAtlas %s %s (%s)" % (mapnum, place.dirtype, itinerary)
    if len(place.batlasids) > 0:
        writecit(parent, citcontent, place.batlasids[0])
    else:
        writecit(parent, citcontent, '')
241
def oldschoolFeatureID(place, parent, mapnum):
    """Append a featureID element carrying the old-style place identifier.

    A place whose tablei and rowi are both -1 is identified by its
    anonsequence; any other place by its table and row indexes, each
    incremented by one.
    """
    anonymous = place.tablei == -1 and place.rowi == -1
    if anonymous:
        placeid = "batlas-%s-anon-%s" % (mapnum, place.anonsequence)
    else:
        placeid = "batlas-%s-%s-%s" % (mapnum, place.tablei+1, place.rowi+1)

    fid_el = etree.Element("{%s}featureID" % ADLGAZ)
    fid_el.text = placeid
    parent.append(fid_el)
250    
def oldschoolTimePeriodNames(place, parent):
    """Append one timePeriod/timePeriodName pair per entry of place.periods.

    Each entry is indexed as (period key, confidence); the key is looked up
    in the ``periods`` table and a "?" is appended when the confidence is
    'less-confident'.
    """
    for entry in place.periods:
        period_el = etree.Element("{%s}timePeriod" % ADLGAZ)
        name_el = etree.Element("{%s}timePeriodName" % ADLGAZ)

        label = periods[entry[0]]
        if entry[1] == 'less-confident':
            label = label + "?"
        name_el.text = label

        period_el.append(name_el)
        parent.append(period_el)
261
262    
def oldschoolAttribution(pipe, parent):
    """Append a creator element for each of pipe.creators, then a
    contributor element for each of pipe.contributors.
    """
    def attach(tag, names):
        # one element per name, in the order given
        for who in names:
            el = etree.Element(tag)
            el.text = who
            parent.append(el)

    attach("{%s}creator" % DC, pipe.creators)
    attach("{%s}contributor" % DC, pipe.contributors)
272
def oldschoolFeatureNames(place, parent, magicrefs):
    """Append one featureName element per name of the place.

    place     -- object providing placenames, types and namestring
    parent    -- element that receives the featureName children
    magicrefs -- mapping from reference key to the markup placed inside
                 that reference's tei:bibl element

    Each featureName holds the transliteration, a classificationSection
    (name type, scheme, name aspects and association), the name's time
    periods and, when the name has references, a secondaryReferences
    element.  A KeyError is raised if a name reference is missing from
    magicrefs.
    """
    # The name type depends only on the place, so settle it once.  The
    # earlier list.index() test left nametype unbound whenever 'people'
    # was the first entry of place.types.
    if 'people' in place.types:
        nametype = 'ethnic'
    else:
        nametype = 'geographic'

    for pn in place.placenames:

        tag_fn = etree.Element("{%s}featureName" % ADLGAZ)

        # transliteration
        tag_translit = etree.Element("{%s}transliteration" % AWMC)
        tag_translit.text = pn.name
        tag_fn.append(tag_translit)

        # classificationSection
        tag_cs = etree.Element("{%s}classificationSection" % ADLGAZ)
        tag_ct = etree.Element("{%s}classificationTerm" % ADLGAZ)
        tag_ct.text = nametype
        tag_cs.append(tag_ct)
        tag_css = etree.Element("{%s}classificationScheme" % ADLGAZ)
        tag_csn = etree.Element("{%s}schemeName" % ADLGAZ)
        tag_csn.text = "geoNameType"
        tag_css.append(tag_csn)
        tag_cs.append(tag_css)
        if pn.inferred:
            tag_naspect = etree.Element("{%s}nameAspect" % AWMC)
            tag_naspect.attrib['ref'] = 'na-inferred'
            tag_cs.append(tag_naspect)
        if pn.completeness != 'complete':
            tag_naspect = etree.Element("{%s}nameAspect" % AWMC)
            tag_naspect.attrib['ref'] = 'na-reconstructed'
            tag_cs.append(tag_naspect)
        if pn.accuracy != 'accurate':
            tag_naspect = etree.Element("{%s}nameAspect" % AWMC)
            tag_naspect.attrib['ref'] = 'na-inaccurate'
            tag_cs.append(tag_naspect)
        tag_nassoc = etree.Element("{%s}nameAssociation" % AWMC)
        tag_nassoc.attrib['ref'] = pn.certainty
        tag_cs.append(tag_nassoc)
        tag_fn.append(tag_cs)

        # timePeriods for the name
        for tp in pn.periods:
            tag_tp = etree.Element("{%s}timePeriod" % ADLGAZ)
            tag_tpn = etree.Element("{%s}timePeriodName" % ADLGAZ)
            tpstring = periods[tp[0]]
            if tp[1] == 'less-confident':
                tpstring += "?"
            tag_tpn.text = tpstring
            tag_tp.append(tag_tpn)
            tag_fn.append(tag_tp)

        # secondary references for the name
        if len(pn.references) > 0:
            tag_refs = etree.Element("{%s}secondaryReferences" % AWMC)
            for ref in pn.references:
                tag_bibl_xml = "<tei:bibl xmlns='%s' xmlns:tei='%s'>%s</tei:bibl>" % (TEI, TEI, magicrefs[ref])
                if "tei:title" not in tag_bibl_xml:
                    # report references that came out without a title
                    print(place.namestring.encode('ascii', 'xmlcharrefreplace'))
                    print(">>>> no title in: '%s'" % tag_bibl_xml.encode('ascii', 'xmlcharrefreplace'))

                tag_refs.append(etree.XML(tag_bibl_xml))

            tag_fn.append(tag_refs)

        parent.append(tag_fn)
343        
def oldschoolReferences(place, parent, mapnum):
    """Append a secondaryReferences element for the place; return its refs.

    place  -- object providing namestring, grid, placenames and references
    parent -- element that receives the secondaryReferences child
    mapnum -- map number used in the BAtlas biblScope

    The first bibl always cites the Barrington Atlas itself (map, grid and,
    when available, the label up to any section sign).  One bibl per entry
    of place.references follows.

    Returns the dict mapping each reference key to the markup produced for
    it by refmagic(); it also covers the references of individual names
    when the place has more than one name.
    """
    # first, recall our origins in the barrington atlas
    tag_refs = etree.Element("{%s}secondaryReferences" % AWMC)
    try:
        thislabel = place.namestring.replace("/ ", "/")
    except AttributeError:
        # no usable name string: cite map and grid only
        thislabel = ''
    if len(thislabel) > 0:
        findi = thislabel.find(u'\xA7')
        if findi > -1:
            # Cut before the section sign and the character in front of it.
            # Clamp at 0 so a sign in first position yields an empty label
            # instead of a slice counted from the end of the string.
            thislabel = thislabel[:max(findi - 1, 0)].strip()
        tag_bibl_xml = "<tei:bibl xmlns='%s' xmlns:tei='%s'><title>BAtlas</title> <biblScope>%s %s %s</biblScope></tei:bibl>" % (TEI, TEI, mapnum, place.grid, thislabel)
    else:
        tag_bibl_xml = "<tei:bibl xmlns='%s' xmlns:tei='%s'><title>BAtlas</title> <biblScope>%s %s</biblScope></tei:bibl>" % (TEI, TEI, mapnum, place.grid)

    tag_refs.append(etree.XML(tag_bibl_xml))

    # now, any other references
    magicrefs = {}
    if len(place.placenames) == 1:
        for ref in place.references:
            magicrefs[ref] = refmagic(ref, place.placenames[0].name)
    elif len(place.placenames) > 1:
        for ref in place.references:
            magicrefs[ref] = refmagic(ref, place.namestring)
        for pn in place.placenames:
            for ref in pn.references:
                magicrefs[ref] = refmagic(ref, pn.name)

    if len(magicrefs) == 0 and len(place.references) > 0:
        for ref in place.references:
            magicrefs[ref] = refmagic(ref, "")

    for ref in place.references:
        try:
            refmarkup = magicrefs[ref]
        except KeyError:
            print('KeyError %s' % ref.encode('ascii', 'backslashreplace'))
            print(magicrefs)
            # Nothing to cite for this key.  Skipping it avoids appending
            # the previous bibl a second time.
            continue
        # need to fix this usage now that we have internal tagging
        tag_bibl_xml = "<tei:bibl xmlns='%s' xmlns:tei='%s'>%s</tei:bibl>" % (TEI, TEI, refmarkup)
        if "tei:title" not in tag_bibl_xml:
            print(place.namestring.encode('ascii', 'xmlcharrefreplace'))
            print(">>>> no title in: '%s'" % tag_bibl_xml.encode('ascii', 'xmlcharrefreplace'))
        tag_refs.append(etree.XML(tag_bibl_xml))
    parent.append(tag_refs)
    return magicrefs
394
def encodeplace(place, parent, mapnum, pipe):
    """Add a place element under parent and fill it from the place object.

    The helper calls below run in a fixed sequence, which also fixes the
    order of the child elements they add.
    """
    node = etree.SubElement(parent, '{%s}place' % PLEIADES)

    oldschoolFeatureID(place, node, mapnum)
    encodeids(place, node)
    encodetypes(place, node)
    encodegrid(place, node)
    encodemaplabels(place, node)

    # the per-name citations need the name used by the main citation
    cited = encodecitations(place, node, mapnum)
    encodeplacenames(place, node, mapnum, cited)

    encodeitineraries(place, node, mapnum)
    encodelocdesc(place, node)
    encodenotes(place, node)
    oldschoolTimePeriodNames(place, node)
    oldschoolAttribution(pipe, node)

    # the per-name references reuse the markup built for the place
    built_refs = oldschoolReferences(place, node, mapnum)
    oldschoolFeatureNames(place, node, built_refs)
411    
412
def xmldump(pipe, destdir):
    """Output an xml file containing the ids, along with some descriptive info

    pipe    -- object providing the 'places' and 'contextpath' items and
               the map_number attribute
    destdir -- directory that receives ``map<map_number>.xml``

    The file starts with XMLDECL, followed by the text of COMMENTFILE
    %-formatted with the file name, map number, UTC timestamp and PREPARER,
    followed by the serialized feature list.
    """
    # Forget citations recorded for earlier maps.  The list is emptied in
    # place because writecit() reads the module-level list; assigning a
    # fresh list to a local name here had no effect on it.
    del priorcitations[:]

    # get all the places
    places = pipe['places']

    # serialize to xml
    d = etree.Element('{%s}featurelist' % PLEIADES)
    d.attrib['mapnum'] = pipe.map_number
    q = etree.SubElement(d, '{%s}uribase' % PLEIADES)
    q.text = "%s/" % NAMESPACE
    for p in places:
        encodeplace(p, d, pipe.map_number, pipe)

    cleantree = do_nscleanup(pipe['contextpath'], d)

    # write to file, prepending explanatory comment text, date etc.
    cmntf = open(COMMENTFILE)
    try:
        cmnt = cmntf.read()
    finally:
        cmntf.close()

    dtstamp = dt.datetime.utcnow().isoformat()

    fn = "map%s.xml" % pipe.map_number
    fpath = os.path.join(destdir, fn)
    # fill in the comment before opening the output, so a bad template
    # does not leave a partial file behind
    cmnt = cmnt % (fn, pipe.map_number, dtstamp, PREPARER)
    f = open(fpath, 'w')
    try:
        f.write(XMLDECL)
        f.write(cmnt)
        etree.ElementTree(cleantree).write(f)
    finally:
        # close the output even if serialization fails
        f.close()
    logging.info("wrote output result xml file on %s" % fpath)
450    
def writecit(parent, tcontent, primeid):
    """Add a citation child with text tcontent unless it is already in use.

    Citations are compared with all whitespace removed.  For a citation
    already recorded in priorcitations: when DASHNUM_END_REGEX matches
    primeid but not tcontent, the function retries with the captured group
    appended in parentheses; otherwise it logs a warning and writes
    nothing.
    """
    squeezed = u''.join(tcontent.split())

    if squeezed not in priorcitations:
        citation = etree.SubElement(parent, '{%s}citation' % PLEIADES)
        citation.text = tcontent
        priorcitations.append(squeezed)
        return

    # if primeid ends in dash-number then postfix the number onto the citation
    id_match = DASHNUM_END_REGEX.search(primeid)
    text_match = DASHNUM_END_REGEX.search(tcontent)
    if id_match and not text_match:
        writecit(parent, "%s (%s)" % (tcontent, id_match.group(1)), primeid)
        return

    logging.warning("Suppressed writing of citation '%s' for primary id '%s' because that citation is already in use" % (tcontent, primeid))
466        
Note: See TracBrowser for help on using the browser.