root/BADataMunger/trunk/wordstripper.py

Revision 1303, 2.2 kB (checked in by thomase, 7 months ago)

make validation benchmark explicit

  • Property svn:eol-style set to native
Line 
1 # ===========================================================================
2 # Copyright (C) 2006-2008 Ancient World Mapping Center (UNC-CH) and the
3 # Institute for the Study of the Ancient World (NYU)
4 #
5 # This program is free software; you can redistribute it and/or modify
6 # it under the terms of the GNU General Public License as published by
7 # the Free Software Foundation; either version 2 of the License, or
8 # (at your option) any later version.
9 #
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 # GNU General Public License for more details.
14 #
15 # You should have received a copy of the GNU General Public License along
16 # with this program; if not, write to the Free Software Foundation, Inc.,
17 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
18 #
19 # About Pleiades
20 # --------------
21 #
22 # Pleiades is an international research network and associated web portal and
23 # content management system devoted to the study of ancient geography.
24 #
25 # See http://pleiades.stoa.org
26 #
27 # Funding for the creation of this software was provided by a grant from the
28 # U.S. National Endowment for the Humanities (http://www.neh.gov), and
29 # by the Institute for the Study of the Ancient World at New York University
30 # (http://www.nyu.edu/isaw)
31 # ===========================================================================
32
33 import logging
34 from os.path import join
35
36 import lxml.etree as etree
37
38 XSLTFILE = 'wordstripper.xsl'
39
def strip(contextpath, source):
    """Strip unneeded formatting inherited from MSWord.

    The external XSLT stylesheet named by XSLTFILE is loaded from the
    directory ``contextpath`` and applied to ``source`` using lxml.  The
    transformation result is serialised to a unicode string and parsed
    again, so the caller receives a freshly parsed element.

    contextpath -- directory that contains the stylesheet file
    source -- the XML to transform (presumably an lxml element or tree;
              confirm against callers)

    Returns the element produced by re-parsing the transformed output.
    """
    logging.info("BEGIN attempt to strip unneeded formatting inherited from MSWord using %s: wordstripper.strip()" % XSLTFILE)
    stylesheet_path = join(contextpath, XSLTFILE)
    stylesheet = etree.parse(stylesheet_path)
    apply_stylesheet = etree.XSLT(stylesheet)
    transformed = apply_stylesheet(source)
    # Serialise the XSLT result and parse it back into an element.
    serialized = unicode(transformed)
    stripped = etree.XML(serialized)
    logging.info("DONE with wordstripper.strip()")
    return stripped
48    
def _test():
    """Run the doctests in this module and in its external doctest files."""
    import doctest
    doctest.testmod()
    # invoke additional doctest files by adding them to this tuple
    doctest_files = ('tests/wordstripper.txt',)
    for doctest_file in doctest_files:
        doctest.testfile(doctest_file)
54
# When this module is executed as a script, run its doctest suite.
if __name__ == "__main__":
    _test()
57    
58    
Note: See TracBrowser for help on using the browser.