-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathpython-xml-subject-count.py
60 lines (52 loc) · 1.63 KB
/
python-xml-subject-count.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import urllib2
import os.path
import re
from xml.dom.minidom import parseString
# Matches any XML tag; used to reduce a serialised <wf> element to its text.
# NOTE: the text comes from toxml(), so XML entities (e.g. "&amp;") stay
# escaped in the extracted word.
TAG_PATTERN = re.compile('<.*?>')


def extract_subjects(xml_text):
    """Return the opinion-target words found in one KAF review document.

    For every <opinion> element the id of its first <target> is read and
    matched against the ``tid`` of the <term> elements; the id of the first
    <target> inside each matching term is then matched against the ``wid``
    of the <wf> elements.  The tag-stripped text of every matching <wf> is
    collected.

    Args:
        xml_text: The XML document, as text or as raw bytes.

    Returns:
        A list of words in discovery order (opinion order, then term order,
        then word order).  Duplicates are kept, so the length of the list is
        the total number of subject mentions in the document.

    Raises:
        xml.parsers.expat.ExpatError: if ``xml_text`` is not well-formed XML.
    """
    dom = parseString(xml_text)

    # Index the word forms and terms once per document instead of
    # re-scanning the whole DOM for every opinion and every term.
    words_by_id = {}
    for wf in dom.getElementsByTagName('wf'):
        if wf.hasAttribute('wid'):
            text = TAG_PATTERN.sub('', wf.toxml())
            words_by_id.setdefault(wf.getAttribute('wid'), []).append(text)

    word_ids_by_term = {}
    for term in dom.getElementsByTagName('term'):
        targets = term.getElementsByTagName('target')
        # Terms without a usable target cannot point at a word; skip them
        # rather than fail on the missing element or attribute.
        if not (term.hasAttribute('tid') and targets
                and targets[0].hasAttribute('id')):
            continue
        word_ids_by_term.setdefault(term.getAttribute('tid'), []).append(
            targets[0].getAttribute('id'))

    found = []
    for opinion in dom.getElementsByTagName('opinion'):
        targets = opinion.getElementsByTagName('target')
        if not targets or not targets[0].hasAttribute('id'):
            # An opinion without a target has no subject to record.
            continue
        term_id = targets[0].getAttribute('id')
        for word_id in word_ids_by_term.get(term_id, ()):
            found.extend(words_by_id.get(word_id, ()))
    return found


def main(directory="KAF", hotel_limit=500):
    """Count opinion subjects over all ``review-<hotel>-<review>.xml`` files.

    Hotels are numbered from 1 up to (but not including) ``hotel_limit``.
    For each hotel the reviews are read in sequence starting at 1; the first
    missing review number ends that hotel and moves on to the next one.
    Progress is printed to standard output and the terminal is cleared
    before each processed review.

    Args:
        directory: Directory that holds the review files.
        hotel_limit: Exclusive upper bound for the hotel number.

    Returns:
        A ``(subjects, subject_count)`` tuple: the distinct subject words in
        first-seen order, and the total number of subject mentions.
    """
    # A missing directory yields nothing from os.walk(); fall back to an
    # empty listing instead of failing with StopIteration.
    _, _, files = next(os.walk(directory), (directory, [], []))
    file_count = len(files)
    print(file_count)

    subjects = []
    seen = set()  # mirrors `subjects` for constant-time duplicate checks
    subject_count = 0
    review_count = 0
    hotel = 1
    review = 1

    while hotel < hotel_limit:
        filename = os.path.join(directory,
                                'review-%d-%d.xml' % (hotel, review))
        if not os.path.exists(filename):
            # No further reviews for this hotel: advance to the next one.
            print("Hotel: %d" % hotel)
            hotel += 1
            review = 1
            continue

        os.system('cls' if os.name == 'nt' else 'clear')
        print("Processed: %d of %d reviews." % (review_count, file_count))
        print("")
        print("Number of subjects: %d" % len(subjects))
        print("Total number of subjects: %d" % subject_count)
        print("")
        print("Processing hotel: %d, review: %d" % (hotel, review))

        # Read raw bytes so the parser can honour the XML encoding
        # declaration; `with` closes the handle even if reading fails.
        with open(filename, 'rb') as handle:
            data = handle.read()

        for word in extract_subjects(data):
            if word in seen:
                print("Dubbel: " + word)
            else:
                seen.add(word)
                subjects.append(word)
                print("Toegevoegd: " + word)
            subject_count += 1

        review += 1
        review_count += 1

    return subjects, subject_count


if __name__ == "__main__":
    main()