| 1 |
# =========================================================================== |
|---|
| 2 |
# Copyright (C) 2006-2008 Ancient World Mapping Center (UNC-CH) and the |
|---|
| 3 |
# Institute for the Study of the Ancient World (NYU) |
|---|
| 4 |
# |
|---|
| 5 |
# This program is free software; you can redistribute it and/or modify |
|---|
| 6 |
# it under the terms of the GNU General Public License as published by |
|---|
| 7 |
# the Free Software Foundation; either version 2 of the License, or |
|---|
| 8 |
# (at your option) any later version. |
|---|
| 9 |
# |
|---|
| 10 |
# This program is distributed in the hope that it will be useful, |
|---|
| 11 |
# but WITHOUT ANY WARRANTY; without even the implied warranty of |
|---|
| 12 |
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|---|
| 13 |
# GNU General Public License for more details. |
|---|
| 14 |
# |
|---|
| 15 |
# You should have received a copy of the GNU General Public License along |
|---|
| 16 |
# with this program; if not, write to the Free Software Foundation, Inc., |
|---|
| 17 |
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
|---|
| 18 |
# |
|---|
| 19 |
# About Pleiades |
|---|
| 20 |
# -------------- |
|---|
| 21 |
# |
|---|
| 22 |
# Pleiades is an international research network and associated web portal and |
|---|
| 23 |
# content management system devoted to the study of ancient geography. |
|---|
| 24 |
# |
|---|
| 25 |
# See http://pleiades.stoa.org |
|---|
| 26 |
# |
|---|
| 27 |
# Funding for the creation of this software was provided by a grant from the |
|---|
| 28 |
# U.S. National Endowment for the Humanities (http://www.neh.gov), and |
|---|
| 29 |
# by the Institute for the Study of the Ancient World at New York University |
|---|
| 30 |
# (http://www.nyu.edu/isaw) |
|---|
| 31 |
# =========================================================================== |
|---|
| 32 |
|
|---|
| 33 |
import re |
|---|
| 34 |
import logging |
|---|
| 35 |
|
|---|
| 36 |
import lxml.etree as etree |
|---|
| 37 |
|
|---|
| 38 |
from etreehelps import getalltext |
|---|
| 39 |
from texthelps import normalizetext |
|---|
| 40 |
|
|---|
| 41 |
# Regular expression source used by extract_title() to find an article title:
# captures everything between a left double quotation mark (U+201C) and a
# right double quotation mark (U+201D). The group is greedy, so when a
# citation contains several quoted spans the capture runs from the first
# opening quote to the last closing quote.
REANATITLE = u'\u201C(.*)\u201D'
|---|
| 42 |
|
|---|
| 43 |
def extract(source):
    """Extract bibliography entries from cleaned-up Word XML/HTML.

    Locates the div that follows the div whose entire text is 'Bibliography'
    and pairs each short-title paragraph in it with the citation paragraph(s)
    that follow. Also looks for a div mentioning 'Abbreviation' and, if one is
    found, appends the entries of the abbreviations table it contains.

    Expects as input an lxml etree. Returns a python list of tuples like
    (shorttitle, fulltitle, fullcitation), where shorttitle and fulltitle are
    the text values produced by normalizetext()/extract_title() and
    fullcitation is the lxml etree Element (an html p tag) containing the full
    citation. Bibliography entries come first, abbreviation entries after.

    A missing bibliography div or abbreviations heading is logged as a warning
    and skipped rather than raising.

    Raises IndexError if extract_title() cannot find a title in a citation, or
    if an abbreviations table row lacks the expected td/p cells.
    """
    logging.info("BEGIN extracting bibliography from cleaned-up word html")

    # determine which divs contain the bibliography and the abbreviations
    biblist = []
    divs = source.xpath("descendant::*[local-name()='div']")
    bibdiv = None
    dirlistdiv = None
    for i, div in enumerate(divs):
        text = getalltext(div).strip()
        if text == 'Bibliography':
            # The matching div only holds the heading; the entries live in
            # the next div in document order.
            if i + 1 < len(divs):
                bibdiv = divs[i + 1]
                logging.info("bibliography div is div %s", i + 1)
            else:
                logging.warning(
                    "'Bibliography' heading is the last div; no bibliography div follows it")
        elif text.find('Abbreviation') > 0:
            # NOTE(review): '> 0' skips a div whose text *starts* with
            # 'Abbreviation'; kept as in the original -- confirm whether that
            # exclusion is intentional. If several divs match, the last wins.
            dirlistdiv = div
            logging.info("abbreviation div is div %s", i)

    # parse the bibliography div
    if bibdiv is None:
        logging.warning("no bibliography div found; skipping bibliography paragraphs")
    else:
        biblist.extend(_bibliography_entries(bibdiv))

    # parse the list of abbreviations out of the directory listing div
    # (explicit None test: the truth value of an lxml element reflects whether
    # it has children, not whether it exists)
    if dirlistdiv is not None:
        biblist.extend(_abbreviation_entries(dirlistdiv))

    logging.info("DONE extracting bibliography from cleaned-up word html")

    return biblist


def _bibliography_entries(bibdiv):
    """Return (shorttitle, fulltitle, citation) tuples from the bibliography div's p children."""
    entries = []
    paras = bibdiv.xpath("*[local-name()='p']")
    logging.info("the bibliography div contains %s paragraphs", len(paras))

    have_shorttitle = False
    shorttitle = None
    atitle = None
    citation = None

    for p in paras:
        text = normalizetext(p.text)
        alltext = normalizetext(getalltext(p))
        if text == alltext:
            # All of the paragraph's text is its own leading text (no text in
            # child elements): this paragraph contains a short title. Emit the
            # entry collected so far before starting the next one.
            if have_shorttitle:
                if citation is not None:
                    entries.append((shorttitle, atitle, citation))
                else:
                    logging.warning(
                        "short title %r has no citation; skipping it", shorttitle)
            shorttitle = text
            have_shorttitle = True
            # Reset so a citation is never carried over to another short title.
            atitle = None
            citation = None
        else:
            # this paragraph contains a citation of a work: try to extract
            # title, first as from an article and, failing that, as if from
            # a book. If several citation paragraphs follow one short title,
            # the last one is kept.
            citation = p
            atitle = extract_title(citation)

    # The final entry has no following short title to trigger its emission.
    if have_shorttitle and citation is not None:
        entries.append((shorttitle, atitle, citation))

    return entries


def _abbreviation_entries(dirlistdiv):
    """Return (shorttitle, fulltitle, citation) tuples from the abbreviations table in dirlistdiv."""
    entries = []
    headings = dirlistdiv.xpath(
        "descendant::*[local-name()='p' and contains(., 'Abbreviation')]")
    if not headings:
        logging.warning(
            "abbreviation div has no paragraph containing 'Abbreviation'; skipping it")
        return entries

    # The table is the first table sibling after the heading paragraph.
    rows = headings[0].xpath(
        "following-sibling::*[local-name()='table'][1]/*[local-name()='tr']")
    logging.info("the abbreviations table has %s rows", len(rows))
    for row in rows:
        # first cell: short title; second cell: paragraph with the citation
        shorttitle = normalizetext(getalltext(row.xpath("*[local-name()='td']")[0]))
        citation = row.xpath("*[local-name()='td'][2]/*[local-name()='p']")[0]
        atitle = extract_title(citation)
        entries.append((shorttitle, atitle, citation))
    return entries
|---|
| 102 |
|
|---|
| 103 |
|
|---|
| 104 |
def extract_title(source):
    """Parse a title out of a bibliographic citation element.

    Given an lxml etree Element, interpret its text content as a bibliographic
    citation and attempt to parse from it an article title: the text matched
    by REANATITLE. If that fails, fall back to a book title: the normalized
    text of the first descendant 'i' element. A single trailing comma is
    removed from the result.

    Returns the title text.

    Raises IndexError if the citation contains neither a REANATITLE match nor
    an 'i' element.
    """
    m = re.search(REANATITLE, normalizetext(getalltext(source)))

    if m is not None:
        atitle = m.group(1)
    else:
        # No quoted article title: treat the first italicized span as the
        # (book) title. Indexing [0] deliberately raises if there is none.
        atitle = normalizetext(getalltext(source.xpath("descendant::*[local-name()='i']")[0]))

    # Guard against an empty (or None) title before looking at its last char.
    if atitle and atitle[-1] == ',':
        atitle = atitle[:-1]
    return atitle
|---|
| 122 |
|
|---|
| 123 |
|
|---|
| 124 |
def _test():
    """Run this module's doctests, then those in tests/biblioextractor.txt."""
    from doctest import testfile, testmod

    testmod()
    testfile('tests/biblioextractor.txt')
    # invoke additional doctest files here
|---|
| 129 |
|
|---|
| 130 |
# Run the doctest suite when this module is executed as a script.
if __name__ == "__main__":
    _test()
|---|