-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathrdfFileReconciliation.py
141 lines (127 loc) · 4.98 KB
/
rdfFileReconciliation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
import csv
from fuzzywuzzy import fuzz
import time
import datetime
from rdflib import Graph
from rdflib.namespace import SKOS, DC
from rdflib import URIRef
from rdflib.plugins.sparql import prepareQuery
import argparse
import os
# --- Command-line options (all optional; each falls back to an interactive
# prompt or a default when omitted) ---
parser = argparse.ArgumentParser()
parser.add_argument('-r', '--rdfFileName', help='the RDF file to be reconciled \
against (include the extension). optional - if not provided, the script will \
ask for input')
parser.add_argument('-f', '--fileName', help='the CSV file of headings to \
reconcile (including \'.csv\'). optional - if not provided, the script will \
ask for input')
parser.add_argument('-d', '--directory', help='the directory for the input and \
output files. optional - if not provided, the script will assume null')
# NOTE: argparse %-formats help text, so literal percent signs must be
# doubled ('%%') — a bare '%' makes `-h` raise
# "ValueError: unsupported format character".
parser.add_argument('-t', '--threshold', help='the threshold (e.g. \'90\' \
means the strings are 90%% similar and 10%% different ). optional - if not \
provided, the script will default to 70')
args = parser.parse_args()
# Resolve each setting: CLI flag wins; otherwise prompt (or use a default).
if args.rdfFileName:
    rdfFileName = args.rdfFileName
else:
    rdfFileName = input('Enter the RDF file to be reconciled against (include \
the extension): ')
if args.fileName:
    fileName = args.fileName
else:
    fileName = input('Enter the CSV file of headings to reconcile \
(including \'.csv\'): ')
if args.threshold:
    threshold = int(args.threshold)
else:
    threshold = 70
if args.directory:
    directory = args.directory
else:
    directory = ''
# define function for finding the prefLabel of a subject
def retrievePrefLabel(uri, _query=prepareQuery(
        'SELECT ?o ?d WHERE {?s skos:prefLabel ?o. ?s dc:date ?d. }',
        initNs={'skos': SKOS, 'dc': DC})):
    """Look up the skos:prefLabel and dc:date for *uri* in the global graph ``g``.

    Side effect: rebinds the module-level ``match`` to
    ``[label, str(prefLabel), uri, date]``, where ``label`` is the heading
    currently being reconciled (read from module scope). If the query
    returns no rows, ``match`` is left untouched, so callers may see a
    stale value from a previous call — TODO(review): confirm that is
    acceptable to the calling loop.

    ``_query`` is private: the SPARQL query is prepared once at function
    definition time rather than re-compiled on every call (it is
    loop-invariant and this function runs once per candidate match).
    """
    global match
    results = g.query(_query, initBindings={'s': URIRef(uri)})
    for row in results:
        prefLabel, resultDate = row[0], row[1]
        match = [label, str(prefLabel), uri, resultDate]
# --- Setup: working directory, timestamp, RDF graph, output files ---
# os.chdir('') raises FileNotFoundError, so only change directory when one
# was actually supplied (the default when -d is omitted is '').
if directory:
    os.chdir(directory)
startTime = time.time()
# Timestamp used to make the output file names unique per run.
date = datetime.datetime.now().strftime('%Y-%m-%d %H.%M.%S')
# import rdf file into graph
g = Graph()
g.parse(rdfFileName, format='n3')
g.bind('skos', SKOS)
# create dict of pref and alt labels from rdf file: {label text: subject URI}
existingLabels = {}
q = prepareQuery('SELECT ?s ?o WHERE { ?s skos:prefLabel|skos:altLabel ?o }',
                 initNs={'skos': SKOS})
results = g.query(q)
for row in results:
    existingLabels[str(row[1])] = str(row[0])
# create lists and csv files
completeNearMatches = []
completeExactMatches = []
# Make sure the output directory exists (os.path.join alone does not create
# it), and open with newline='' as the csv module requires — otherwise the
# output contains blank rows on Windows.
os.makedirs('reconciliationResults', exist_ok=True)
f = csv.writer(open(os.path.join('reconciliationResults', 'rdfExactMatches'
               + date + '.csv'), 'w', newline=''))
f.writerow(['originalLabel'] + ['standardizedLabel'] + ['uri'] + ['date'])
f2 = csv.writer(open(os.path.join('reconciliationResults',
                'rdfNearAndNonMatches' + date + '.csv'), 'w', newline=''))
f2.writerow(['originalLabel'] + ['standardizedLabel'] + ['uri'] + ['date'])
# create counters
newHeadingsCount = 0
exactMatchNewHeadings = 0
nearMatchNewHeadings = 0
nonmatchedNewHeadings = 0
# parse CSV data and compare against existingLabels dict for exact and near
# matches. The file is read ONCE into a list (the original read it twice —
# once just to count rows for the progress display).
with open(fileName) as csvfile:
    rows = list(csv.DictReader(csvfile))
rowCount = len(rows)
for row in rows:
    label = row['name']
    rowCount -= 1
    print('Rows remaining: ', rowCount)
    newHeadingsCount += 1
    preCount = len(completeNearMatches)
    # Track whether this row found an exact match. Without this flag, an
    # exact-matched heading fell through to the near/non-match bookkeeping
    # (the near loop is skipped because the label was just added to
    # completeExactMatches, so postCount == preCount) and was wrongly
    # counted as unmatched AND written as 'no match' to f2.
    exactFound = False
    for label2, uri in existingLabels.items():
        if label == label2:
            exactFound = True
            exactMatchNewHeadings += 1
            completeExactMatches.append(label)
            retrievePrefLabel(uri)
            f.writerow([match[0]] + [match[1]] + [match[2]] + [match[3]])
    if label not in completeExactMatches:
        # Fuzzy comparison: average of four fuzzywuzzy similarity scores
        # must exceed the threshold to count as a near match.
        for label2, uri in existingLabels.items():
            ratio = fuzz.ratio(label, label2)
            partialRatio = fuzz.partial_ratio(label, label2)
            tokenSort = fuzz.token_sort_ratio(label, label2)
            tokenSet = fuzz.token_set_ratio(label, label2)
            avg = (ratio + partialRatio + tokenSort + tokenSet) / 4
            if avg > threshold:
                retrievePrefLabel(uri)
                if match not in completeNearMatches:
                    completeNearMatches.append(match)
    if not exactFound:
        postCount = len(completeNearMatches)
        if postCount > preCount:
            nearMatchNewHeadings += 1
        else:
            nonmatchedNewHeadings += 1
            f2.writerow([label] + [''] + ['no match'] + [''])
# write accumulated near-match results to CSV file
for match in completeNearMatches:
    f2.writerow([match[0]] + [match[1]] + [match[2]] + [match[3]])
# summary statistics
print('Total headings reconciled: ', newHeadingsCount)
print('Exact match headings: ', exactMatchNewHeadings)
print('Near match headings: ', nearMatchNewHeadings)
print('Unmatched headings: ', nonmatchedNewHeadings)
elapsedTime = time.time() - startTime
m, s = divmod(elapsedTime, 60)
h, m = divmod(m, 60)
print('Total script run time: ', '%d:%02d:%02d' % (h, m, s))