| 34 | | |
|---|
def parse_name_variants(placename):
    """Expand a BAtlas placename with optional parenthetical parts.

    BAtlas placenames use internal parentheses to bundle variants into one
    string: ``Ab(c)d`` stands for both ``Abcd`` and ``Abd``.  Each
    parenthetical is treated as independently optional, so a name with *n*
    parentheticals expands to 2**n strings.

    Only complete ``(...)`` groups containing at least one character count as
    optional parts; a stray or unbalanced parenthesis stays in the name
    verbatim instead of producing an empty variant.

    Returns a list of variant strings: first those that include exactly one
    optional part, then two, and so on; the name without any optional part
    comes last.  A name with no parentheticals yields a one-element list.
    """
    # Function-scope import so this block does not depend on the module's
    # import section.
    import itertools

    # re.split() with a capturing group alternates plain text (even
    # positions) with the captured parentheticals (odd positions), so the
    # position tells us reliably which chunks are optional.
    chunks = re.split(r'(\([^\)]+\))', placename)

    # (text, required) pairs in their original left-to-right order
    components = []
    for position, chunk in enumerate(chunks):
        if chunk == u'':
            continue
        if position % 2:
            components.append((re.sub(r'[\(\)]', u'', chunk), False))
        else:
            components.append((chunk, True))

    optional = [i for i, (text, required) in enumerate(components)
                if not required]

    # every non-empty selection of optional parts, smallest selections first
    variants = []
    for size in range(1, len(optional) + 1):
        for chosen in itertools.combinations(optional, size):
            variants.append(u''.join(
                text for i, (text, required) in enumerate(components)
                if required or i in chosen))

    # don't forget the variant that involves none of the optional parts
    variants.append(u''.join(text for text, required in components if required))

    return variants
|---|
| | 98 | |
|---|
class Place(object):
    """One feature parsed from a row of a BAtlas directory table.

    The ``add_*`` methods fill the instance from the cells of a table row and
    set ``type`` to one of 'aqueduct', 'name', 'numbered', 'road' or
    'unlocated'.  Fields that a given row layout does not provide keep the
    empty defaults assigned in ``__init__``.
    """

    def __init__(self):
        self.grid = ''          # grid reference text
        self.locdesc = u''      # location description text
        self.itinerary = []     # place references along a road
        self.placenames = []    # Placename objects
        self.number = ''        # feature number (numbered features only)
        self.periods = []       # (period code, confidence) tuples
        self.references = []    # bibliographic reference strings
        self.type = ''          # set by the add_* methods

    @staticmethod
    def _ascii(text):
        """Return *text* as a native ASCII string with backslash escapes."""
        encoded = text.encode('ascii', 'backslashreplace')
        if not isinstance(encoded, str):
            # Python 3: encode() returns bytes, which cannot be concatenated
            # with (or cleanly %-formatted into) str.
            encoded = encoded.decode('ascii')
        return encoded

    def __str__(self):
        """Return a multi-line, ASCII-safe dump of the non-empty fields."""
        lines = ['----------', '%s' % self.type]
        if self.grid:
            lines.append('grid: %s' % self.grid)
        if self.locdesc:
            lines.append('locdesc: %s' % self._ascii(self.locdesc))
        if self.itinerary:
            lines.append('itinerary:' + ''.join(
                ' :: %s' % self._ascii(itin) for itin in self.itinerary))
        for placename in self.placenames:
            lines.append('placename: %s' % self._ascii(placename.name))
        if self.number:
            lines.append('number: %s' % self.number)
        if self.periods:
            # only emitted when there is something to show, so an empty
            # period list no longer runs into the following line
            lines.append('periods: ' + ''.join(
                '%s (%s); ' % period for period in self.periods))
        if self.references:
            lines.append('references: ' + ''.join(
                self._ascii(ref) + '; ' for ref in self.references))
        lines.append('----------')
        return '\n'.join(lines)

    def add_aqueduct(self, cells):
        """Populate from an aqueduct row."""
        # grid | locdesc | periods | references
        self.type = 'aqueduct'
        self.grid = normalizetext(getalltext(cells[0]))
        self.locdesc = normalizetext(getalltext(cells[1]))
        self.periods = self.parse_periods(normalizetext(getalltext(cells[2])))
        self.references = self.parse_references(getalltext(cells[3]))

    def add_name(self, cells):
        """Populate from a named-feature row."""
        # grid | placename(s) | periods | locdesc | references
        self.type = 'name'
        self.grid = normalizetext(getalltext(cells[0]))
        self.placenames = self.parse_placenames(cells[1])
        self.periods = self.parse_periods(normalizetext(getalltext(cells[2])))
        self.locdesc = normalizetext(getalltext(cells[3]))
        self.references = self.parse_references(getalltext(cells[4]))

    def add_numbered(self, cells):
        """Populate from a numbered-feature row."""
        # number | grid | location | period | reference
        self.type = 'numbered'
        self.number = normalizetext(getalltext(cells[0]))
        self.grid = normalizetext(getalltext(cells[1]))
        self.locdesc = normalizetext(getalltext(cells[2]))
        if len(cells) > 4:
            # Five-column layout as documented above: the period column used
            # to be parsed as references and the real references were dropped.
            self.periods = self.parse_periods(normalizetext(getalltext(cells[3])))
            self.references = self.parse_references(getalltext(cells[4]))
        else:
            # NOTE(review): previous behaviour, kept for rows with only four
            # cells -- confirm whether such rows actually occur.
            self.references = self.parse_references(getalltext(cells[3]))

    def add_road(self, cells):
        """Populate from a road row."""
        # itinerary | period | reference
        self.type = 'road'
        self.itinerary = self.parse_itinerary(normalizetext(getalltext(cells[0])))
        self.periods = self.parse_periods(normalizetext(getalltext(cells[1])))
        self.references = self.parse_references(getalltext(cells[2]))

    def add_unlocated(self, cells):
        """Populate from an unlocated-toponym row."""
        # name | period | probable location | reference
        self.type = 'unlocated'
        self.placenames = self.parse_placenames(cells[0])
        self.periods = self.parse_periods(normalizetext(getalltext(cells[1])))
        self.locdesc = normalizetext(getalltext(cells[2]))
        # references are the fourth column; cells[2] is the probable location
        self.references = self.parse_references(getalltext(cells[3]))

    def parse_periods(self, pstring):
        """Flatten a period string into (code, confidence) tuples.

        A period code (with or without a trailing question mark) can be
        repeated for individual names.  Each code from the module-level
        ``periodcodes`` is reported at most once: 'confident' if it occurs
        anywhere without a question mark, otherwise 'less-confident' if it
        occurs only with one.  Codes that do not occur are omitted.
        """
        periods = []
        remaining = pstring
        for pcode in periodcodes:
            qpcode = '%s?' % pcode
            doubtful = qpcode in remaining
            if doubtful:
                # remove the questioned occurrences first so that only
                # unquestioned ones can match the bare code below
                remaining = remaining.replace(qpcode, '')
            if pcode in remaining:
                remaining = remaining.replace(pcode, '')
                periods.append((pcode, 'confident'))
            elif doubtful:
                periods.append((pcode, 'less-confident'))
        return periods

    def parse_itinerary(self, istring):
        """Parse itinerary into individual place references."""
        # U+00AE separates the places; presumably this is how the source's
        # arrow glyph arrives after text extraction -- verify against input.
        return [normalizetext(place) for place in istring.split(u'\xae')]

    def parse_placenames(self, namecell):
        """Return Placename objects for every name and variant in a cell.

        A cell can hold several names, separated by line breaks and by '/'
        or the section sign (U+00A7).  Names containing parentheses are
        expanded with parse_name_variants(); all variants of one name share
        that name's position in the cell.
        """
        celltext = getalltext(namecell)

        # individual non-empty lines, each normalized once
        nlines = []
        for line in celltext.splitlines():
            nline = normalizetext(line)
            if nline != u'':
                nlines.append(nline)

        # individual names, in sequence, for later linking with periods and
        # bibliography
        pattern = r'\s*[/\xa7]\s*'
        nameishes = []
        for nline in nlines:
            nameishes.extend(re.split(pattern, nline))

        placenames = []
        for i, nameish in enumerate(nameishes):
            if u'(' in nameish:
                names = parse_name_variants(nameish)
            else:
                names = [nameish]
            for name in names:
                placenames.append(Placename(name, celltext, i))

        # TODO: test for modern/ancient
        return placenames

    def parse_references(self, rstring):
        """Parse all references without regard to their order relative to names.

        Delimiters are line breaks and semicolons; empty entries are dropped.
        """
        references = []
        for line in rstring.splitlines():
            nline = normalizetext(line)
            if nline == u'':
                continue
            if u';' in nline:
                for ref in nline.split(u';'):
                    nref = normalizetext(ref)
                    if nref != u'':
                        references.append(nref)
            else:
                references.append(nline)
        return references
|---|
| | 261 | |
|---|
class Placename(object):
    """A single name (or name variant) taken from a placename cell.

    Attributes:
        name: the name string itself.
        modern: initialized to False; nothing in this class changes it.
        originalNameString: the text the name was extracted from (callers in
            this file pass the full text of the cell).
        originalPosition: position of the name among the names found in that
            text; variants expanded from one bundled name share a position.
        periods: list, initially empty.
        references: list, initially empty.
    """

    def __init__(self, nstring, onstring, pos):
        self.name = nstring
        self.modern = False
        self.originalNameString = onstring
        self.originalPosition = pos
        self.periods = []
        self.references = []

    def __repr__(self):
        return '%s(name=%r, originalPosition=%r)' % (
            self.__class__.__name__, self.name, self.originalPosition)

    def parse_periodcell(self, periodcell):
        """Placeholder: ignores *periodcell* and returns a new empty list."""
        return []
|---|
| | 275 | |
|---|
| | 276 | |
|---|
| | 277 | |
|---|
| | 278 | |
|---|