Changeset 777
- Timestamp: 04/12/07 15:00:25 (2 years ago)
- Files:
  - BADataMunger/trunk/pipeline.py (modified) (3 diffs)
  - BADataMunger/trunk/wordhtml2xml.py (modified) (1 diff)
  - BADataMunger/trunk/wordstripper.xsl (added)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
BADataMunger/trunk/pipeline.py
r772 r777
@@ -1,5 +1,8 @@
-from os.path import normpath, normcase, isfile, splitdrive, splitext, split
+from os.path import normpath, normcase, isfile, splitdrive, splitext, split, join
 
 import wordhtml2xml
+import wordstripper
+import lxml.etree as etree
+
 
 class Pipe:
…
@@ -11,4 +14,6 @@
     import pipeline
     p = pipeline.Pipe(r'C:\TomDocs\awmcwork\pleiadesact\svnbox\BADataMunger\wordhtml\BATL038_.htm')
+    p.cycle()
+    p.save(p.cleanxml, p.filenameroot + 'clean' + '.xml')
 
     """
…
@@ -19,30 +24,38 @@
         """
 
-        self.path = normpath(normcase(source))
-        if not isfile(self.path):
-            print "crap"
+        self.data = {}
+        self['path'] = normpath(normcase(source))
+        if not isfile(self['path']):
+            raise Error, "No file found at path: %s" % self['path']
         else:
-            self.drive, dirpath = splitdrive(self.path)
-            self.filepath, self.filename = split(dirpath)
-            self.filenameroot, extension = splitext(self.filename)
-            f = open(self.path)
-            self.wordhtml = f.read()
+            self['drive'], dirpath = splitdrive(self['path'])
+            self['filepath'], self['filename'] = split(dirpath)
+            self['contextpath'], filedir = split(self['filepath'])
+            self['filenameroot'], extension = splitext(self['filename'])
+            f = open(self['path'])
+            self['wordhtml'] = f.read()
             f.close()
+
+    def __getitem__(self, key): return self.data[key]
+
+    def __setitem__(self, key, item): self.data[key] = item
 
     def cycle(self):
         """Cycle through all steps in the pipeline"""
 
-        self.wordxml = wordhtml2xml.convert(self.wordhtml)
+        self['wordxml'] = wordhtml2xml.convert(self['wordhtml'])
+        self['cleanxml'] = wordstripper.strip(self['contextpath'], self['wordxml'])
 
-
-    def save(self, content, destination, encoding=None):
+    def save(self, itemkey, encoding='utf-8'):
         """Basic save-with-encoding function for writing content to an arbitrary
         file"""
 
-        if encoding:
+        content = self[itemkey]
+        try:
             pcontent = content.encode(encoding)
-        else:
-            pcontent = content
-        g = open(destination)
-        g.writelines(pcontent)
+        except:
+            pcontent = etree.tostring(content).encode(encoding)
+        destfile = "%s-%s.xml" % (self['filenameroot'], itemkey)
+        g = open(join(self['filepath'], destfile),'w')
+        g.write(pcontent)
         g.close()
BADataMunger/trunk/wordhtml2xml.py
r767 r777
@@ -1,13 +1,16 @@
 from BeautifulSoup import BeautifulSoup
+import lxml.etree as etree
 
-XMLDECL = u'<?xml version="1.0" encoding="UTF-8"?>\n'
 XMLDOCTYPE = u'<!DOCTYPE html SYSTEM "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">\n'
 
 def convert(source):
+    """Make html exported from Microsoft Word both well-formed and valid."""
     soup = BeautifulSoup(source)
     html = soup.findAll('html')[0]
     html['xmlns'] = u'http://www.w3.org/1999/xhtml'
-    return XMLDECL + XMLDOCTYPE + unicode(soup)
+    style = soup.findAll('style')[0]
+    style['type'] = u'text/css'
+    # need to kill off weird ms characters?
+    return etree.XML(XMLDOCTYPE + unicode(soup))
 
 
-
