root/BADataMunger/trunk/biblioextractor.py

Revision 1306, 5.1 kB (checked in by thomase, 7 months ago)

added header, improved docstring and added working tests; also changed method of whitespace normalization to use texthelps.normalizespace so as to get better unicode handling and more uniform behavior

  • Property svn:eol-style set to native
Line 
1 # ===========================================================================
2 # Copyright (C) 2006-2008 Ancient World Mapping Center (UNC-CH) and the
3 # Institute for the Study of the Ancient World (NYU)
4 #
5 # This program is free software; you can redistribute it and/or modify
6 # it under the terms of the GNU General Public License as published by
7 # the Free Software Foundation; either version 2 of the License, or
8 # (at your option) any later version.
9 #
10 # This program is distributed in the hope that it will be useful,
11 # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 # GNU General Public License for more details.
14 #
15 # You should have received a copy of the GNU General Public License along
16 # with this program; if not, write to the Free Software Foundation, Inc.,
17 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
18 #
19 # About Pleiades
20 # --------------
21 #
22 # Pleiades is an international research network and associated web portal and
23 # content management system devoted to the study of ancient geography.
24 #
25 # See http://pleiades.stoa.org
26 #
27 # Funding for the creation of this software was provided by a grant from the
28 # U.S. National Endowment for the Humanities (http://www.neh.gov), and
29 # by the Institute for the Study of the Ancient World at New York University
30 # (http://www.nyu.edu/isaw)
31 # ===========================================================================
32
33 import re
34 import logging
35
36 import lxml.etree as etree
37
38 from etreehelps import getalltext
39 from texthelps import normalizetext
40
41 REANATITLE = u'\u201C(.*)\u201D'  # article-title pattern: text between curly double quotes U+201C and U+201D; greedy, so it runs to the last closing quote
42
def extract(source):
    """Extract bibliography entries from cleaned-up Word XML/HTML.

    Finds the bibliography div and pulls the short titles and citations out
    of it. Also looks for an abbreviations table in the directory listing
    div and appends any titles and citations found there.

    source: an lxml etree (anything offering ``xpath``).

    Returns a python list of tuples like (shorttitle, fulltitle,
    fullcitation), where shorttitle and fulltitle are text as returned by
    ``normalizetext``/``extract_title`` and fullcitation is the lxml etree
    Element (an html p tag) holding the full citation.

    If no bibliography div can be located a warning is logged and only the
    abbreviation entries (if any) are returned. Exceptions raised by
    ``extract_title`` (e.g. a citation with neither a quoted title nor an
    italic element) are not caught here.
    """
    logging.info("BEGIN extracting bibliography from cleaned-up word html")

    bibdiv, dirlistdiv = _locate_divs(source)

    biblist = []
    if bibdiv is None:
        logging.warning("no bibliography div found; skipping bibliography paragraphs")
    else:
        biblist.extend(_bibliography_entries(bibdiv))

    # Compare against None explicitly: the truth value of an lxml element
    # reflects whether it has children, not whether it was found.
    if dirlistdiv is not None:
        biblist.extend(_abbreviation_entries(dirlistdiv))

    logging.info("DONE extracting bibliography from cleaned-up word html")

    return biblist


def _locate_divs(source):
    """Return (bibdiv, dirlistdiv); either may be None when not found."""
    divs = source.xpath("descendant::*[local-name()='div']")
    bibdiv = None
    dirlistdiv = None
    for i, div in enumerate(divs):
        text = getalltext(div).strip()
        if text == 'Bibliography':
            # The heading sits in its own div; the entries are in the div
            # that follows it in document order.
            if i + 1 < len(divs):
                bibdiv = divs[i + 1]
                logging.info("bibliography div is div %s", i + 1)
            else:
                logging.warning("'Bibliography' heading is the last div; nothing follows it")
        # NOTE(review): '> 0' does not match a div whose text *starts* with
        # 'Abbreviation'; kept unchanged -- confirm whether that is intended.
        elif text.find('Abbreviation') > 0:
            dirlistdiv = div
            logging.info("abbreviation div is div %s", i)
    return bibdiv, dirlistdiv


def _bibliography_entries(bibdiv):
    """Pair each short-title paragraph in bibdiv with the citation following it."""
    entries = []
    paras = bibdiv.xpath("*[local-name()='p']")
    logging.info("the bibliography div contains %s paragraphs", len(paras))

    have_shorttitle = False
    shorttitle = None
    atitle = None
    citation = None  # citation collected for the *current* short title, if any

    for p in paras:
        text = normalizetext(p.text)
        alltext = normalizetext(getalltext(p))
        if text == alltext:
            # A paragraph with no child-element text is a short title. Emit
            # the previous entry, but only if it actually received a
            # citation, so stale citations are never paired with a new title.
            if have_shorttitle and citation is not None:
                entries.append((shorttitle, atitle, citation))
            shorttitle = text
            have_shorttitle = True
            atitle = None
            citation = None
        else:
            # A citation of a work: try to extract its title, first as from
            # an article and, failing that, as if from a book.
            citation = p
            atitle = extract_title(citation)

    # Flush the final entry; there is no following short title to trigger it.
    if have_shorttitle and citation is not None:
        entries.append((shorttitle, atitle, citation))
    return entries


def _abbreviation_entries(dirlistdiv):
    """Read (shorttitle, title, citation) tuples from the abbreviations table."""
    entries = []
    headings = dirlistdiv.xpath("descendant::*[local-name()='p' and contains(., 'Abbreviation')]")
    if not headings:
        logging.warning("abbreviation div has no 'Abbreviation' paragraph; skipping it")
        return entries

    # The table is the first table sibling after the heading paragraph.
    rows = headings[0].xpath("following-sibling::*[local-name()='table'][1]/*[local-name()='tr']")
    logging.info("the abbreviations table has %s rows", len(rows))
    for row in rows:
        cells = row.xpath("*[local-name()='td']")
        citations = row.xpath("*[local-name()='td'][2]/*[local-name()='p']")
        if not cells or not citations:
            logging.warning("skipping abbreviations row lacking a short title cell or citation paragraph")
            continue
        shorttitle = normalizetext(getalltext(cells[0]))
        citation = citations[0]
        atitle = extract_title(citation)
        entries.append((shorttitle, atitle, citation))
    return entries
102    
103    
def extract_title(source):
    """Parse a title out of a bibliographic citation element.

    source: an lxml etree Element whose text content is a citation.

    The normalized text is first searched for an article title, i.e. text
    enclosed in curly double quotes (see REANATITLE). If none is found, the
    text of the first italic (``i``) descendant is taken as a book title.
    A single trailing comma is removed from the result.

    Returns the title text. Raises IndexError when there is no quoted title
    and source has no ``i`` descendant.
    """
    m = re.search(REANATITLE, normalizetext(getalltext(source)))

    if m is not None:
        atitle = m.group(1)
    else:
        # No article title: fall back to the first italicised run.
        italics = source.xpath("descendant::*[local-name()='i']")
        atitle = normalizetext(getalltext(italics[0]))

    # Guard against an empty (or None) title before looking at its end.
    if atitle and atitle.endswith(','):
        atitle = atitle[:-1]
    return atitle
122    
123    
def _test():
    """Run the doctests: first doctest.testmod(), then the external doctest file."""
    from doctest import testfile, testmod

    testmod()
    testfile('tests/biblioextractor.txt')
    # invoke additional doctest files here
129
130 if __name__ == "__main__":  # run the doctests only when executed as a script, not on import
131     _test()
Note: See TracBrowser for help on using the browser.