| 1 |
# =========================================================================== |
|---|
| 2 |
# Copyright (C) 2006-2008 Ancient World Mapping Center (UNC-CH) and the |
|---|
| 3 |
# Institute for the Study of the Ancient World (NYU) |
|---|
| 4 |
# |
|---|
| 5 |
# This program is free software; you can redistribute it and/or modify |
|---|
| 6 |
# it under the terms of the GNU General Public License as published by |
|---|
| 7 |
# the Free Software Foundation; either version 2 of the License, or |
|---|
| 8 |
# (at your option) any later version. |
|---|
| 9 |
# |
|---|
| 10 |
# This program is distributed in the hope that it will be useful, |
|---|
| 11 |
# but WITHOUT ANY WARRANTY; without even the implied warranty of |
|---|
| 12 |
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|---|
| 13 |
# GNU General Public License for more details. |
|---|
| 14 |
# |
|---|
| 15 |
# You should have received a copy of the GNU General Public License along |
|---|
| 16 |
# with this program; if not, write to the Free Software Foundation, Inc., |
|---|
| 17 |
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
|---|
| 18 |
# |
|---|
| 19 |
# About Pleiades |
|---|
| 20 |
# -------------- |
|---|
| 21 |
# |
|---|
| 22 |
# Pleiades is an international research network and associated web portal and |
|---|
| 23 |
# content management system devoted to the study of ancient geography. |
|---|
| 24 |
# |
|---|
| 25 |
# See http://pleiades.stoa.org |
|---|
| 26 |
# |
|---|
| 27 |
# Funding for the creation of this software was provided by a grant from the |
|---|
| 28 |
# U.S. National Endowment for the Humanities (http://www.neh.gov), and |
|---|
| 29 |
# by the Institute for the Study of the Ancient World at New York University |
|---|
| 30 |
# (http://www.nyu.edu/isaw) |
|---|
| 31 |
# =========================================================================== |
|---|
| 32 |
import lxml.etree as etree |
|---|
| 33 |
|
|---|
| 34 |
# (old, new) replacement pairs; normalize() applies them in list order.
# Code points are written as escapes so the invisible characters are
# unambiguous in the source.
norms = [
    ('\u2011', '-'),      # non-breaking hyphen (U+2011)
    ('\u00a0', ' '),      # non-breaking space (U+00A0)
    # NOTE(review): appears identical to the previous entry -- confirm
    # whether a different code point was intended here.
    ('\u00a0', ' '),      # non-breaking space bis
    # U+2026, not U+07EA: 2026 is the hexadecimal code point of the
    # ellipsis, whereas decimal 2026 is an unrelated N'Ko letter.
    ('\u2026', '...'),    # horizontal ellipsis (U+2026)
    # NOTE(review): the trailing ';' is part of the match string and looks
    # unintended -- confirm before changing it.
    ('\xc2\xa0;', ' '),   # non-breaking space in utf-8
]
|---|
| 41 |
|
|---|
| 42 |
def normalize(source):
    """Replace undesirable characters throughout an XML element tree.

    The tree is serialized to text, every ``(old, new)`` pair in the
    module-level ``norms`` list is applied in order with ``str.replace``,
    and the result is parsed again.  Because the substitution runs over the
    whole serialized document, it affects attribute values and markup as
    well as text nodes.

    :param source: lxml element (or element tree) to normalize; it is
        serialized, not modified.
    :returns: the root element parsed from the normalized serialization.
    """
    # Serialize to text rather than bytes: the entries in ``norms`` are text
    # strings, and bytes.replace() rejects str arguments on Python 3.  Text
    # output also keeps non-ASCII characters literal instead of escaping them
    # as numeric character references, so the entries can actually match.
    serialized = etree.tostring(source, encoding="unicode")
    for old, new in norms:
        serialized = serialized.replace(old, new)
    return etree.XML(serialized)
|---|
| 48 |
|
|---|
| 49 |
def _test():
    """Run the doctests of the running module and of the companion test file."""
    from doctest import testfile, testmod

    testmod()
    testfile('tests/wordnormalizer.txt')
    # invoke additional doctest files here
|---|
| 54 |
|
|---|
| 55 |
# Run the doctest suite when this file is executed as a script.
if __name__ == "__main__":
    _test()
|---|