-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathMasterize_Data_Record_Linkage.R
167 lines (133 loc) · 6.46 KB
/
Masterize_Data_Record_Linkage.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
##############################################################################################################
########################################### PROJECT: SITE MASTER #####################################
##############################################################################################################
# PURPOSE:
# - Read the cleaned dataframe generated from Python and invoke the RecordLinkage library source code
# INPUTS:
# - BINARIES_NAME
# - BINARIES_EXTENSION
# - THRESHOLD_FOR_INDIVIDUAL
# - THRESHOLD_FOR_ADDRESS_COMBINED
# - SCALING_FACTOR
# - curr_country
# - RAW_SCORES_DIRECTORY
# - TOTAL_MATCHES_THRESHOLD
# OUTPUTS:
# - CSV file with each candidate-pair having match-score>=TOTAL_MATCHES_THRESHOLD
# VERSION FOR BUILD:
# - R version 3.4.4 (2018-03-15)
# SCRIPT VERSION:
# - 2.0
# CREATED ON:
# - 2021-03-17
# CREATOR:
# - Vikrant Deshpande
# LAST UPDATED ON:
# - 2021-07-01
# LAST UPDATED BY:
# - Vikrant Deshpande
# REQUIRES:
# - tools: R package
########################################################################################################################################################
## Load Required Libraries and Scripts
# library() (not require()) so a missing dependency fails loudly up front.
library(tools)

# Remove all objects in current R Workspace.
# NOTE(review): rm(list = ls()) inside a script is normally an anti-pattern; it is
# tolerable here only because this runs as a standalone batch Rscript invocation.
rm(list = ls(all.names = TRUE))

# The caller passes all parameters as ONE space-separated string; split it into tokens.
args <- commandArgs(trailingOnly = TRUE)
#args="levenshtein .dll 0.85 0.75 3 United_States Raw_Scores 4 Linkage Master_Data\\United_States_2_Master.csv Master_Data\\United_States_3_Master.csv"
#args="levenshtein .dll 0.85 0.75 3 United_States Raw_Scores 4 Dedup NA NA"
print(args)
args <- as.list(strsplit(args, " ")[[1]])
start <- Sys.time()

# Initialize all the parameters for the R computations
BINARIES_NAME <- args[[1]]                               # e.g. "levenshtein"
BINARIES_EXTENSION <- args[[2]]                          # e.g. ".dll"
THRESHOLD_FOR_INDIVIDUAL <- as.numeric(args[[3]])        # e.g. 0.85
THRESHOLD_FOR_ADDRESS_COMBINED <- as.numeric(args[[4]])  # e.g. 0.75
SCALING_FACTOR <- as.numeric(args[[5]])                  # e.g. 3
curr_country <- args[[6]]                                # e.g. "United_States"
TARGET_DIRECTORY <- args[[7]]                            # e.g. "Raw_Scores"

# Column names for the output match-score CSV.
TARGET_CSV_NAMES <- c("SR_NUM_1", "SR_NUM_2", "SITE_NAME_COMPARISON_SCORE",
                      "STATE_COMPARISON_SCORE", "CITY_COMPARISON_SCORE", "POSTAL_CODE_COMPARISON_SCORE",
                      "CONCAT_ADDRESS_COMPARISON_SCORE", "NUM_OF_MATCHES_FOUND")

# Thresholds as scaling factors for each candidate-pairs' match-scores calculated
THRESHOLDS_COLUMNS <- c("SITE_NAME", "STATE", "CITY",
                        "POSTAL_CODE", "CONCAT_ADDRESS")
THRESHOLDS_VALUES <- c(THRESHOLD_FOR_INDIVIDUAL, THRESHOLD_FOR_INDIVIDUAL, THRESHOLD_FOR_INDIVIDUAL,
                       THRESHOLD_FOR_INDIVIDUAL, THRESHOLD_FOR_ADDRESS_COMBINED)
SCALING_FACTORS <- c(1, 1, 1,
                     1, SCALING_FACTOR)

TOTAL_MATCHES_THRESHOLD <- as.numeric(args[[8]])         # e.g. 4
METHOD <- args[[9]]                                      # "Dedup" or "Linkage"

# Load the utility functions (presumably defines processDedupBatch,
# processLinkageBatch and write_df_to_csv used below — confirm in utils/).
source(file.path("utils", "SourceCode_Record_Linkage.R"))

# Experimental: If the levenshtein C function is not loaded already, load the
# pre-compiled binaries to which it belongs.
if (!is.loaded(BINARIES_NAME)) {
  print(paste0("Loading ", BINARIES_NAME, BINARIES_EXTENSION, " !"))
  dyn.load(paste0(file.path("utils", BINARIES_NAME), BINARIES_EXTENSION))
} else {
  print(paste0(BINARIES_NAME, BINARIES_EXTENSION, " is already loaded !"))
}

# Experimental: To increase allocated RAM size and invoke garbage-collector.
# memory.limit() exists only on Windows and is defunct since R 4.2, so the call
# is guarded by platform and wrapped so a failure cannot abort the whole run.
if (.Platform$OS.type == "windows") {
  tryCatch({
    if (memory.limit() != 4000) {
      memory.limit(size = 4000)
    }
  }, error = function(e) {
    warning("Could not adjust memory.limit(): ", conditionMessage(e), call. = FALSE)
  })
}
gc()
# Helper shared by both branches below (the original duplicated this verbatim):
# write candidate_pairs to /<TARGET_DIRECTORY>/<country>_Score_Features.csv.
# When no pairs were produced, append a single all-zero dummy row first so that
# downstream consumers always find a CSV with the expected 8 columns.
write_score_features <- function(candidate_pairs, empty_msg) {
  if (nrow(candidate_pairs) == 0) {
    # rbind() onto a 0-row data.frame mangles the column names; save and restore them.
    dummy.names <- names(candidate_pairs)
    candidate_pairs <- rbind(candidate_pairs, c(0, 0, 0, 0, 0, 0, 0, 0))
    names(candidate_pairs) <- dummy.names
    print(empty_msg)
  }
  write_df_to_csv(df = candidate_pairs, root_dir = TARGET_DIRECTORY,
                  curr_country = curr_country,
                  file_suffix = "_Score_Features.csv", index_flag = FALSE)
}

# Deduplicate the incoming dataset, and create an output /Raw_Scores/country_Score_Features.csv file
if (METHOD == "Dedup") {
  # Read the country-specific batch with columns for only relevant match-score
  country_df <- read.csv(paste0(curr_country, '_country_df.csv'), encoding = "UTF-8")
  country_df[is.na(country_df)] <- ""
  n_rows <- nrow(country_df)
  # Every unordered pair of rows is a candidate: n*(n-1)/2.
  n_candidates <- n_rows * (n_rows - 1) / 2
  print(paste("NRows=", n_rows, ", Candidate-pairs=", n_candidates, ", Columns are "))
  print(names(country_df))
  COLUMNS_TO_KEEP_IN_CSV <- c("id2", "id1", "SITE_NAME", "STATE", "CITY", "POSTAL_CODE", "CONCAT_ADDRESS", "NUM_OF_MATCHES_FOUND")
  candidate_pairs <- processDedupBatch(country_df)
  if (nrow(candidate_pairs) > 0) {
    # Vectorized lookup replaces the original O(n) row-by-row assignment loop:
    # SR_NUM_* hold row indices into country_df; map them to the IDs in column 1.
    candidate_pairs[, 'SR_NUM_1'] <- country_df[candidate_pairs[, 'SR_NUM_1'], 1]
    candidate_pairs[, 'SR_NUM_2'] <- country_df[candidate_pairs[, 'SR_NUM_2'], 1]
  }
  write_score_features(candidate_pairs, "No potential matches found in the incoming dataset! Creating a dummy csv...")
} else if (METHOD == "Linkage") {
  FIRST_FILE <- args[[10]]   # First_dataset csv
  SECOND_FILE <- args[[11]]  # Second_dataset csv
  # Read the country-specific batch with columns for only relevant match-score
  country_df <- read.csv(FIRST_FILE, encoding = "UTF-8")
  country_df[is.na(country_df)] <- ""
  n_rows <- nrow(country_df)
  print(paste("NRows=", n_rows, ", Columns are "))
  print(names(country_df))
  country_df2 <- read.csv(SECOND_FILE, encoding = "UTF-8")
  country_df2[is.na(country_df2)] <- ""
  n_rows <- nrow(country_df2)
  print(paste("NRows=", n_rows, ", Columns are "))
  print(names(country_df2))
  COLUMNS_TO_KEEP_IN_CSV <- c("id1", "id2", "SITE_NAME", "STATE", "CITY", "POSTAL_CODE", "CONCAT_ADDRESS", "NUM_OF_MATCHES_FOUND")
  candidate_pairs <- processLinkageBatch(country_df, country_df2)
  if (nrow(candidate_pairs) > 0) {
    # SR_NUM_1 indexes the first dataset, SR_NUM_2 the second; vectorized as above.
    candidate_pairs[, 'SR_NUM_1'] <- country_df[candidate_pairs[, 'SR_NUM_1'], 1]
    candidate_pairs[, 'SR_NUM_2'] <- country_df2[candidate_pairs[, 'SR_NUM_2'], 1]
  }
  write_score_features(candidate_pairs, "No potential matches found in the incoming 2 datasets! Creating a dummy csv...")
}
#View(candidate_pairs)
# Report the script's total wall-clock runtime (difftime prints with units).
end <- Sys.time()
cat("\n")
print(end - start)