Changeset 777

Show
Ignore:
Timestamp:
04/12/07 15:00:25 (2 years ago)
Author:
thomase
Message:

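Strip all stylistic components and suppress some pointless elements to get "clean" XHTML. Store the intermediate XML on the Pipe object as lxml etree objects. Make Pipe serve up its attributes like a dictionary (via __getitem__/__setitem__). Save output more sensibly: save() now takes an item key and writes "<filenameroot>-<itemkey>.xml" under the source file's directory path.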

Files:

Legend:

Unmodified
Added
Removed
Modified
Copied
Moved
  • BADataMunger/trunk/pipeline.py

    r772 r777  
    1 from os.path import normpath, normcase, isfile, splitdrive, splitext, split 
     1from os.path import normpath, normcase, isfile, splitdrive, splitext, split, join 
    22 
    33import wordhtml2xml 
     4import wordstripper 
     5import lxml.etree as etree 
     6 
    47 
    58class Pipe: 
     
    1114        import pipeline 
    1215        p = pipeline.Pipe(r'C:\TomDocs\awmcwork\pleiadesact\svnbox\BADataMunger\wordhtml\BATL038_.htm') 
     16        p.cycle() 
     17        p.save(p.cleanxml, p.filenameroot + 'clean' + '.xml') 
    1318             
    1419    """ 
     
    1924        """ 
    2025         
    21         self.path = normpath(normcase(source)) 
    22         if not isfile(self.path): 
    23             print "crap" 
     26        self.data = {} 
     27        self['path'] = normpath(normcase(source)) 
     28        if not isfile(self['path']): 
     29            raise Error, "No file found at path: %s" % self['path'] 
    2430        else: 
    25             self.drive, dirpath = splitdrive(self.path) 
    26             self.filepath, self.filename = split(dirpath) 
    27             self.filenameroot, extension = splitext(self.filename) 
    28             f = open(self.path) 
    29             self.wordhtml = f.read() 
     31            self['drive'], dirpath = splitdrive(self['path']) 
     32            self['filepath'], self['filename'] = split(dirpath) 
     33            self['contextpath'], filedir = split(self['filepath']) 
     34            self['filenameroot'], extension = splitext(self['filename']) 
     35            f = open(self['path']) 
     36            self['wordhtml'] = f.read() 
    3037            f.close() 
     38             
     39    def __getitem__(self, key): return self.data[key] 
     40         
     41    def __setitem__(self, key, item): self.data[key] = item 
    3142         
    3243    def cycle(self): 
    3344        """Cycle through all steps in the pipeline""" 
    3445         
    35         self.wordxml = wordhtml2xml.convert(self.wordhtml) 
     46        self['wordxml'] = wordhtml2xml.convert(self['wordhtml']) 
     47        self['cleanxml'] = wordstripper.strip(self['contextpath'], self['wordxml']) 
    3648         
    37          
    38     def save(self, content, destination, encoding=None): 
     49    def save(self, itemkey, encoding='utf-8'): 
    3950        """Basic save-with-encoding function for writing content to an arbitrary  
    4051        file""" 
    4152         
    42         if encoding: 
     53        content = self[itemkey] 
     54        try: 
    4355            pcontent = content.encode(encoding) 
    44         else: 
    45             pcontent = content 
    46         g = open(destination) 
    47         g.writelines(pcontent) 
     56        except: 
     57            pcontent = etree.tostring(content).encode(encoding) 
     58        destfile = "%s-%s.xml" % (self['filenameroot'], itemkey) 
     59        g = open(join(self['filepath'], destfile),'w') 
     60        g.write(pcontent) 
    4861        g.close() 
  • BADataMunger/trunk/wordhtml2xml.py

    r767 r777  
    11from BeautifulSoup import BeautifulSoup 
     2import lxml.etree as etree 
    23 
    3 XMLDECL = u'<?xml version="1.0" encoding="UTF-8"?>\n' 
    44XMLDOCTYPE = u'<!DOCTYPE html SYSTEM "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">\n' 
    55 
    66def convert(source): 
     7    """Make html exported from Microsoft Word both well-formed and valid.""" 
    78    soup = BeautifulSoup(source) 
    89    html = soup.findAll('html')[0] 
    910    html['xmlns'] = u'http://www.w3.org/1999/xhtml' 
    10     return XMLDECL + XMLDOCTYPE + unicode(soup) 
     11    style = soup.findAll('style')[0] 
     12    style['type'] = u'text/css' 
     13    # need to kill off weird ms characters? 
     14    return etree.XML(XMLDOCTYPE + unicode(soup)) 
    1115     
    1216     
    13