-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathritual_direction_extractor.py
112 lines (100 loc) · 5.42 KB
/
ritual_direction_extractor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
from lxml import etree
from transcriptions.yasna_word_parser import YasnaWordParser
class RitualDirectionExtractor(object):
    """Collect ritual-direction text from TEI transcriptions.

    For every transcription the extractor finds the
    ``ab[@type="ritualdirection"]`` elements, turns each one into a plain
    string and stores it under the transcription's siglum and the element's
    ``n`` attribute. It also gathers the distinct ``n`` values per book so
    that an ordered list of units can be returned.

    State accumulates on the instance across calls to
    ``extract_ritual_directions``.
    """

    _TEI_NAMESPACES = {'tei': 'http://www.tei-c.org/ns/1.0'}

    # Recognised books paired with the attribute that collects their unit
    # references. The order of this table is the order in which the books
    # appear in the unit list returned by extract_ritual_directions.
    _BOOK_ATTRIBUTES = (
        ('Y', 'y_n_values'),
        ('VrS', 'vrs_n_values'),
        ('VS', 'vs_n_values'),
        ('VytS', 'vyts_n_values'),
        ('YVr', 'yvr_n_values'),
    )

    def __init__(self):
        self.word_parser = YasnaWordParser()
        # Entity resolution is switched off for the TEI input.
        self.parser = etree.XMLParser(resolve_entities=False)
        # Sigla of the transcriptions seen so far ('basetext' is kept apart
        # and only appended when results are returned).
        self.sigla = []
        # siglum -> {n -> [text, ...]}
        self.ritual_dirs = {}
        # Distinct n values per recognised book, in order of first appearance.
        self.y_n_values = []
        self.vrs_n_values = []
        self.vs_n_values = []
        self.vyts_n_values = []
        self.yvr_n_values = []
        # Human-readable problems found while extracting.
        self.errors = []

    @staticmethod
    def _unit_sort_key(n):
        """Return the sort key (book, int, int, int) for a unit reference.

        Raises IndexError if ``n`` has fewer than four dot-separated parts and
        ValueError if parts two to four are not integers.
        """
        parts = n.split('.')
        return (parts[0], int(parts[1]), int(parts[2]), int(parts[3]))

    @staticmethod
    def _siglum_sort_key(siglum):
        """Return the numeric sort key of a siglum ('S' is read as '.5').

        Raises ValueError if the result is not a valid float literal.
        """
        return float(siglum.replace('S', '.5'))

    def _register_unit(self, n, buckets, seen):
        """Record unit reference ``n`` in the list of its book.

        ``buckets`` maps each recognised book to its list of n values and
        ``seen`` holds every n value already stored in one of those lists.
        References with an unknown book, or that cannot be sorted later, are
        reported in ``self.errors`` instead of being stored.
        """
        book = n.split('.')[0]
        if book not in buckets:
            self.errors.append('book {} in unit {} not recognised'.format(book, n))
            return
        if n in seen:
            return
        try:
            # Validate now so one malformed reference cannot make the final
            # sort fail after all transcriptions have been processed.
            self._unit_sort_key(n)
        except (IndexError, ValueError):
            self.errors.append(
                'unit {} is not of the form book.number.number.number'.format(n))
            return
        seen.add(n)
        buckets[book].append(n)

    def get_ritual_direction_text(self, rd, verse):
        """Return the text of one ritual direction as a single string.

        ``rd`` is placed inside a fresh ``wrapper`` element and handed to
        ``self.word_parser.walk_reading`` together with ``verse``. The first
        item of the result is treated as a list of word dicts. Each word
        contributes its ``original`` text with optional ``pc_before`` /
        ``pc_after`` attached, and optional ``gap_before_details`` /
        ``gap_after_details`` as separate ``<...>`` tokens. Tokens are joined
        with single spaces; an empty word list gives ``''``.

        Note: with lxml, appending ``rd`` to the wrapper moves it out of the
        tree it came from.
        """
        rd_wrapper = etree.Element('wrapper')
        rd_wrapper.append(rd)
        data = self.word_parser.walk_reading(rd_wrapper, verse, '')
        words = data[0]
        if not words:
            return ''
        tokens = []
        for word in words:
            if 'gap_before_details' in word:
                tokens.append('<{}>'.format(word['gap_before_details']))
            before = word['pc_before'] if 'pc_before' in word else ''
            after = word['pc_after'] if 'pc_after' in word else ''
            tokens.append('{}{}{}'.format(before, word['original'], after))
            if 'gap_after_details' in word:
                tokens.append('<{}>'.format(word['gap_after_details']))
        return ' '.join(tokens)

    def extract_ritual_directions(self, transcriptions):
        """Extract the ritual directions of every transcription.

        ``transcriptions`` is an iterable of mappings with a ``'tei'`` entry
        (the XML to parse) and a ``'siglum'`` entry.

        Returns a tuple ``(ritual_dirs, sigla, n_list, errors)``:

        * ``ritual_dirs`` -- ``{siglum: {n: [text, ...]}}``; any earlier entry
          for the same siglum is replaced.
        * ``sigla`` -- all sigla except ``'basetext'`` in numeric order,
          followed by ``'basetext'``.
        * ``n_list`` -- the distinct unit references of the recognised books,
          book by book (Y, VrS, VS, VytS, YVr), each book sorted numerically
          on its three numeric components.
        * ``errors`` -- messages about elements with no ``n`` attribute,
          unrecognised books and malformed unit references.

        Raises ValueError if a siglum cannot be turned into a number for
        sorting.
        """
        buckets = dict(
            (book, getattr(self, attr)) for book, attr in self._BOOK_ATTRIBUTES)
        # Set of everything already collected, for constant-time duplicate
        # checks while keeping the public attributes as ordered lists.
        seen = set(n for values in buckets.values() for n in values)

        for transcription in transcriptions:
            tree = etree.XML(transcription['tei'], self.parser)
            siglum = transcription['siglum']
            if siglum != 'basetext':
                self.sigla.append(siglum)
            texts_by_unit = {}
            self.ritual_dirs[siglum] = texts_by_unit
            for rd in tree.xpath('//tei:ab[@type="ritualdirection"]',
                                 namespaces=self._TEI_NAMESPACES):
                n = rd.get('n')
                if n is None:
                    # Without a reference the text cannot be filed anywhere.
                    self.errors.append(
                        'ritual direction without an n attribute in {}'.format(siglum))
                    continue
                self._register_unit(n, buckets, seen)
                # The text is kept even when the unit itself was reported as
                # an error, so nothing found in the transcription is dropped.
                text = self.get_ritual_direction_text(rd, {'siglum': siglum})
                texts_by_unit.setdefault(n, []).append(text)

        n_list = []
        for _book, attr in self._BOOK_ATTRIBUTES:
            n_list.extend(sorted(getattr(self, attr), key=self._unit_sort_key))

        # 'basetext' is present in self.sigla after an earlier call; leave it
        # out of the numeric sort and put it back at the end exactly once.
        witnesses = [s for s in self.sigla if s != 'basetext']
        self.sigla = sorted(witnesses, key=self._siglum_sort_key)
        self.sigla.append('basetext')
        return (self.ritual_dirs, self.sigla, n_list, self.errors)