root/BADataMunger/trunk/texthelps.py

Revision 1301, 2.0 kB (checked in by thomase, 8 months ago)

added header, docstring and tests, and changed code to use unicode-aware regular expression substitution

  • Property svn:eol-style set to native
Line 
1 # ===========================================================================
2 # Copyright (C) 2006-2008 Ancient World Mapping Center (UNC-CH) and the
3 # Institute for the Study of the Ancient World (NYU)
4 #
5 # This program is free software; you can redistribute it and/or modify
6 # it under the terms of the GNU General Public License as published by
7 # the Free Software Foundation; either version 2 of the License, or
8 # (at your option) any later version.
9 #
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 # GNU General Public License for more details.
14 #
15 # You should have received a copy of the GNU General Public License along
16 # with this program; if not, write to the Free Software Foundation, Inc.,
17 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
18 #
19 # About Pleiades
20 # --------------
21 #
22 # Pleiades is an international research network and associated web portal and
23 # content management system devoted to the study of ancient geography.
24 #
25 # See http://pleiades.stoa.org
26 #
27 # Funding for the creation of this software was provided by a grant from the
28 # U.S. National Endowment for the Humanities (http://www.neh.gov), and
29 # by the Institute for the Study of the Ancient World at New York University
30 # (http://www.nyu.edu/isaw)
31 # ===========================================================================
32
33 import re
34
# Regular expression source matching one or more whitespace characters.
# The backslash is doubled so the literal has the same value on Python 2 and
# Python 3 without relying on an unrecognised escape sequence in a non-raw
# string.
SPACEPATTERN = u'\\s+'

# Compiled once at import time instead of on every call. re.UNICODE asks the
# regex engine to apply Unicode whitespace rules to \s.
_SPACE_REX = re.compile(SPACEPATTERN, re.UNICODE)


def normalizetext(source):
    r"""Collapse every run of whitespace in ``source`` to a single space.

    Any span matched by ``SPACEPATTERN`` (spaces, tabs, newlines and other
    whitespace) is replaced by one space, and leading and trailing
    whitespace is then stripped from the result.

    Args:
        source: the unicode string to normalize.

    Returns:
        The normalized unicode string.

    >>> normalizetext(u'  alpha \n\t beta  ') == u'alpha beta'
    True
    >>> normalizetext(u' \n ') == u''
    True
    """
    return _SPACE_REX.sub(u' ', source).strip()
43    
def _test():
    """Run this module's doctests, then those in ``tests/texthelps.txt``."""
    from doctest import testfile, testmod

    # Docstring examples embedded in the module itself.
    testmod()
    # Examples kept in the separate doctest file.
    testfile('tests/texthelps.txt')
    # invoke additional doctest files here
49
# Run the doctest suite when this file is executed directly as a script.
if __name__ == '__main__':
    _test()
52    
Note: See TracBrowser for help on using the browser.