-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpdv_functions.py
427 lines (388 loc) · 21.2 KB
/
pdv_functions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
import pandas as pd # standard python data library
import geopandas as gp # the geo-version of pandas
import numpy as np
import os
import fiona
from statistics import mean, median
from pandas import read_csv
gp.io.file.fiona.drvsupport.supported_drivers['KML'] = 'rw' #To load KML files
import string
import xml.etree.ElementTree as et
from matplotlib.lines import Line2D
def county_totals_check(partner_df, partner_name, source_df, source_name, column_list,county_col,full_print=False):
    """Compares the totals of two election result dataframes at the county level

    A county that appears in only one of the two dataframes is treated as
    having 0 votes in the other, so it is reported as a difference.

    Args:
      partner_df: DataFrame of election results we are comparing against
      partner_name: String of what to call the partner in the print statement
      source_df: DataFrame of election results we are comparing to
      source_name: String of what to call the source in the print statement
      column_list: List of races that there are votes for
      county_col: String of the column name that contains county information
      full_print: Boolean specifying whether to print out everything, including counties w/ similarities

    Returns:
      Nothing, only prints out an analysis
    """
    print("***Countywide Totals Check***")
    print("")
    diff_counties = []
    for race in column_list:
        # Select the race column before summing so unrelated (possibly
        # non-numeric) columns are never aggregated, and do it once per race
        # rather than once per printed line.
        partner_totals = partner_df.groupby(county_col)[race].sum()
        source_totals = source_df.groupby(county_col)[race].sum()

        # Put both sides on the same set of counties; a county missing on one
        # side counts as 0 there instead of producing NaN / a KeyError.
        all_counties = partner_totals.index.union(source_totals.index)
        partner_totals = partner_totals.reindex(all_counties, fill_value=0)
        source_totals = source_totals.reindex(all_counties, fill_value=0)

        diff = partner_totals - source_totals
        differing = diff[diff != 0].index.tolist()
        matching = diff[diff == 0].index.tolist()

        # Keep a de-duplicated, first-seen-ordered list across all races
        for county in differing:
            if county not in diff_counties:
                diff_counties.append(county)

        if differing:
            print(race + " contains differences in these counties:")
            for county in differing:
                print("\t"+str(county)+" has a difference of "+str(diff.loc[county])+" votes")
                print("\t\t"+ partner_name + ": "+str(partner_totals.loc[county])+" votes")
                print("\t\t"+ source_name +": "+str(source_totals.loc[county])+" votes")
        else:
            print(race + " is equal across all counties")

        # Matching counties are listed after either message when requested
        if full_print:
            for county in matching:
                print("\t"+str(county) + ": "+ str(partner_totals.loc[county])+" votes")
    if diff_counties:
        print()
        print(diff_counties)
def statewide_totals_check(partner_df, partner_name, source_df, source_name, column_list):
    """Compares the totals of two election result dataframes at the statewide total level

    Args:
      partner_df: DataFrame of election results we are comparing against
      partner_name: String of what to call the partner in the print statement
      source_df: DataFrame of election results we are comparing to
      source_name: String of what to call the source in the print statement
      column_list: List of races that there are votes for

    Returns:
      Nothing, only prints out an analysis
    """
    print("***Statewide Totals Check***")
    mismatched_races = []
    for race in column_list:
        partner_total = partner_df[race].sum()
        source_total = source_df[race].sum()
        gap = partner_total - source_total

        # Equal totals get a one-line confirmation and nothing else
        if gap == 0:
            print(race + " is equal", "\t both dataframes " + str(partner_total))
            continue

        if race not in mismatched_races:
            mismatched_races.append(race)
        print(race+" has a difference of "+str(gap)+" votes")
        print("\t"+ partner_name + ": "+str(partner_total)+" votes")
        print("\t"+ source_name +": "+str(source_total)+" votes")

    # Recap of every race that did not match, if there were any
    if mismatched_races:
        print()
        print(mismatched_races)
def precinct_votes_check(merged_df,column_list,vest_on_left,name_col,print_level=0):
    """Checks a merged dataframe with two election results at the precinct level

    For every race in column_list the "<race>_x" (left) and "<race>_y" (right)
    columns are compared row by row. If a missing value is found the function
    prints the offending precinct and stops without printing a summary.

    Args:
      merged_df: DataFrame with one set of election results joined to another
      column_list: List of races that there are votes for
      vest_on_left: Boolean specifying whether VEST data is on the left side of merged_df
      name_col: String of the column name to refer to precincts when a difference occurs
      print_level: Integer that specifies how large the vote difference in a precinct must be to be printed

    Returns:
      Nothing, only prints out an analysis
    """
    ordered = merged_df.sort_values(by=[name_col],inplace=False)

    # Only the side labels of the detail line depend on where VEST sits
    if vest_on_left:
        left_tag, right_fmt = "(V)", " (S){:.>5}"
    else:
        left_tag, right_fmt = "(S)", " (V){:.>5}"

    rows_same = 0
    rows_different = 0
    precincts_with_diffs = []
    gaps = []
    largest_gap = 0
    for _, record in ordered.iterrows():
        row_matches = True
        for race in column_list:
            left_val = record[race + "_x"]
            right_val = record[race + "_y"]
            if left_val is None or right_val is None or np.isnan(right_val) or np.isnan(left_val):
                print("FIX NaN value at: ", record[name_col])
                return
            gap = abs(left_val - right_val)
            if not gap > 0:
                continue
            row_matches = False
            gaps.append(gap)
            if gap > largest_gap:
                largest_gap = gap
            # Detail lines are only shown for gaps above the requested level
            if gap > print_level:
                print(race, "{:.>72}".format(record[name_col]), left_tag,"{:.>5}".format(int(left_val)),right_fmt.format(int(right_val)),"(D):{:>5}".format(int(left_val-right_val)))
        if row_matches:
            rows_same += 1
        else:
            rows_different += 1
            precincts_with_diffs.append(record[name_col])

    print("")
    print("There are ", len(ordered.index)," total rows")
    print(rows_different," of these rows have election result differences")
    print(rows_same," of these rows are the same")
    print("")
    print("The max difference between any one shared column in a row is: ", largest_gap)
    if gaps:
        print("The average difference is: ", str(sum(gaps)/len(gaps)))
    big_gap_count = sum(1 for gap in gaps if gap > 10)
    print("There are ", str(big_gap_count), "precinct results with a difference greater than 10")
    print("")
    print("All precincts containing differences:")
    print(sorted(precincts_with_diffs))
def precinct_votes_check_mod(merged_df,column_list,vest_on_left,name_col,print_level=0):
    """Checks a merged dataframe with two election results at the precinct level

    Variant of the precinct check in which only differences larger than
    print_level feed the average, the "greater than 10" count and the final
    precinct list (whose length is printed as well). The row counts and the
    maximum still consider every non-zero difference. If a missing value is
    found the function prints the offending precinct and stops.

    Args:
      merged_df: DataFrame with one set of election results joined to another
      column_list: List of races that there are votes for
      vest_on_left: Boolean specifying whether VEST data is on the left side of merged_df
      name_col: String of the column name to refer to precincts when a difference occurs
      print_level: Integer that specifies how large the vote difference in a precinct must be to be printed

    Returns:
      Nothing, only prints out an analysis
    """
    ordered = merged_df.sort_values(by=[name_col],inplace=False)

    # Only the side labels of the detail line depend on where VEST sits
    if vest_on_left:
        left_tag, right_fmt = "(V)", " (S){:.>5}"
    else:
        left_tag, right_fmt = "(S)", " (V){:.>5}"

    rows_same = 0
    rows_different = 0
    flagged_precincts = []
    flagged_gaps = []
    largest_gap = 0
    for _, record in ordered.iterrows():
        row_matches = True
        for race in column_list:
            left_val = record[race + "_x"]
            right_val = record[race + "_y"]
            if left_val is None or right_val is None or np.isnan(right_val) or np.isnan(left_val):
                print("FIX NaN value at: ", record[name_col])
                return
            gap = abs(left_val - right_val)
            if not gap > 0:
                continue
            row_matches = False
            if gap > largest_gap:
                largest_gap = gap
            if not gap > print_level:
                continue
            # Above the reporting level: record it and print the detail line
            flagged_gaps.append(gap)
            if record[name_col] not in flagged_precincts:
                flagged_precincts.append(record[name_col])
            print(race, "{:.>72}".format(record[name_col]), left_tag,"{:.>5}".format(int(left_val)),right_fmt.format(int(right_val)),"(D):{:>5}".format(int(left_val-right_val)))
        if row_matches:
            rows_same += 1
        else:
            rows_different += 1

    print("")
    print("There are ", len(ordered.index)," total rows")
    print(rows_different," of these rows have election result differences")
    print(rows_same," of these rows are the same")
    print("")
    print("The max difference between any one shared column in a row is: ", largest_gap)
    if flagged_gaps:
        print("The average difference is: ", str(sum(flagged_gaps)/len(flagged_gaps)))
    big_gap_count = sum(1 for gap in flagged_gaps if gap > 10)
    print("There are ", str(big_gap_count), "precinct results with a difference greater than 10")
    print("")
    print("All precincts containing differences:")
    flagged_sorted = sorted(flagged_precincts)
    print(len(flagged_sorted))
    print(flagged_sorted)
def allocate_absentee(df_receiving_votes,df_allocating,column_list,col_allocating,allocating_to_all_empty_precs=False):
    """Allocates votes proportionally to precincts, usually by share of precinct-reported vote

    For each group (value of col_allocating) and race, the votes in
    df_allocating are split across that group's precincts in proportion to the
    precinct's existing votes for the race. If the group has no votes for the
    race but there are votes to allocate, the precinct's share of all votes in
    column_list is used instead; if the group has no votes at all, the votes
    are split evenly. Fractional allocations are resolved by giving the
    leftover votes to the precincts with the largest remainders, so the
    allocated total matches the amount to allocate.

    Assumes the index labels of df_receiving_votes are unique.

    Args:
      df_receiving_votes: DataFrame with precinct-level votes
      df_allocating: DataFrame with the votes to allocate
      column_list: List of races that votes are being allocated for
      col_allocating: String referring to what level the allocation occurs at (most often county)
      allocating_to_all_empty_precs: Boolean for special case where all votes in df_receiving_votes are 0

    Returns:
      The precinct-level votes dataframe (df_receiving_votes) with the allocated votes.
      The input dataframe itself is not modified. A message is printed for any
      race whose final total does not equal precinct votes + votes to allocate
      (e.g. when df_allocating has a group with no precincts to receive votes).
    """
    races = list(column_list)

    #Fill any n/a values with 0 (fillna returns a new frame, so the caller's data is untouched)
    df_receiving_votes = df_receiving_votes.fillna(0)
    #Grab the original columns, so we can filter back down to them later
    original_cols = list(df_receiving_votes.columns)

    #Add in the "Total Votes column"
    if allocating_to_all_empty_precs:
        #In cases where every vote is 0, need to set the Total_Votes equal to 1 for proportional allocation
        df_receiving_votes["Total_Votes"] = 1
    else:
        df_receiving_votes["Total_Votes"] = 0
        for race in races:
            df_receiving_votes["Total_Votes"] = df_receiving_votes["Total_Votes"] + df_receiving_votes[race]

    #Group totals, indexed by the allocating column. Only the vote columns are
    #summed so name/geometry columns are never aggregated.
    precinct_specific_totals = df_receiving_votes.groupby(col_allocating)[races + ["Total_Votes"]].sum()
    to_dole_out_totals = df_allocating.groupby(col_allocating)[races].sum()

    #Expected final total per race, used for the sanity check at the end
    expected_totals = {
        race: int(precinct_specific_totals[race].sum() + to_dole_out_totals[race].sum())
        for race in races
    }

    #Check the allocating to empty precincts code
    if allocating_to_all_empty_precs:
        if any(precinct_specific_totals[race].sum() != 0 for race in races):
            print("Allocating to all empty precincts parameter incorrect")

    #Find the (group, race) pairs where there are votes to allocate but no
    #precinct votes for that race to base the proportions on
    special_allocation_needed = set()
    for group in precinct_specific_totals.index:
        if group not in to_dole_out_totals.index:
            continue
        for race in races:
            if precinct_specific_totals.at[group, race] != 0:
                continue
            if int(to_dole_out_totals.at[group, race]) == 0:
                continue
            special_allocation_needed.add((group, race))
            if precinct_specific_totals.at[group, "Total_Votes"] == 0:
                #No votes of any kind in this group: weight every precinct equally.
                #The group denominator must be the number of precincts so the
                #shares sum to the amount being allocated.
                in_group = df_receiving_votes[col_allocating] == group
                df_receiving_votes.loc[in_group, "Total_Votes"] = 1
                precinct_specific_totals.at[group, "Total_Votes"] = int(in_group.sum())

    #Create some new columns for each of these races to deal with the allocation
    for race in races:
        df_receiving_votes[race+"_add"] = 0.0
        df_receiving_votes[race+"_rem"] = 0.0
        df_receiving_votes[race+"_floor"] = 0.0

    #First pass: compute each precinct's (fractional) share
    #Note this function iterates over the dataframe two times so the rounded vote totals match the totals to allocate
    for index, row in df_receiving_votes.iterrows():
        county_id = row[col_allocating]
        if county_id not in to_dole_out_totals.index:
            continue
        for race in races:
            if (county_id, race) in special_allocation_needed:
                #Proportions come from the precinct's share of all votes in the group
                denom = precinct_specific_totals.at[county_id, "Total_Votes"]
                val = df_receiving_votes.at[index, "Total_Votes"]
            else:
                #Proportions come from the precinct's share of this race in the group
                denom = precinct_specific_totals.at[county_id, race]
                val = df_receiving_votes.at[index, race]
            #How many group-wide votes there are to allocate
            numer = to_dole_out_totals.at[county_id, race]
            if float(denom) == 0:
                vote_share = 0
            else:
                vote_share = (float(val)/float(denom))*float(numer)
            df_receiving_votes.at[index, race+"_add"] = vote_share
            #Take the decimal remainder of the allocation
            df_receiving_votes.at[index, race+"_rem"] = vote_share%1
            #Take the floor of the allocation
            df_receiving_votes.at[index, race+"_floor"] = np.floor(vote_share)

    #After the first pass through, get the sums of the floors by group to assist in the rounding
    first_allocation = df_receiving_votes.groupby(col_allocating)[[race+"_floor" for race in races]].sum()

    #Second pass: group by group, hand out the votes lost to flooring
    for county in to_dole_out_totals.index:
        if county not in first_allocation.index:
            #No precincts to receive these votes; the final check reports the shortfall
            continue
        in_county = df_receiving_votes[col_allocating] == county
        for race in races:
            add_var = race+"_add"
            #Count how many votes still need to be allocated (because we took the floor of all the initial allocations)
            to_go = int(np.round(int(to_dole_out_totals.at[county, race]) - first_allocation.at[county, race+"_floor"]))
            if to_go <= 0:
                continue
            #Grab the n precincts with the highest remainders and round these up, where n is the # of votes that still need to be allocated
            round_up = df_receiving_votes.loc[in_county, race+"_rem"].nlargest(to_go).index
            df_receiving_votes.loc[round_up, add_var] = np.ceil(df_receiving_votes.loc[round_up, add_var])

    #Apply the allocations
    for race in races:
        add_var = race+"_add"
        #Round every allocation down to not add fractional votes
        df_receiving_votes[add_var] = np.floor(df_receiving_votes[add_var])
        #Replace the column (rather than writing into it) so the result is integer-typed
        df_receiving_votes[race] = (df_receiving_votes[race] + df_receiving_votes[add_var]).astype(int)
        #Check to make sure all the votes have been allocated
        if expected_totals[race] != df_receiving_votes[race].sum():
            print("Some issue in allocating votes for:", race)

    #Filter down to original columns
    df_receiving_votes = df_receiving_votes[original_cols]
    return df_receiving_votes
def get_fips_dict(state, fips_path="./raw-from-source/FIPS/US_FIPS_Codes.csv"):
    '''
    Returns a dictionary mapping from county name to FIPS for a given state.
    The state should be called with its full name

    Args:
      state: Value to match against the "State" column of the FIPS file
      fips_path: Path to the nationwide FIPS CSV, which must contain the
        columns "State", "County Name" and "FIPS County"

    Returns:
      Dictionary of county name -> county FIPS code as a string left-padded
      with zeros to three characters. Empty if no row matches the state.
    '''
    #Load in the nationwide FIPS file
    fips_file = pd.read_csv(fips_path)
    state_rows = fips_file[fips_file["State"] == state]
    #Make the FIPS three digits. The codes are built as a separate Series so
    #nothing is assigned onto a filtered slice of the file (SettingWithCopy).
    county_codes = state_rows["FIPS County"].astype(str).str.zfill(3)
    #Create the dictionary
    return dict(zip(state_rows["County Name"], county_codes))
def compare_geometries(gdf_1,gdf_2,left_gdf_name,right_gdf_name,join_col_name, shp_names, area_threshold=.1):
    '''
    Function that joins two GeoDataFrames on a column and reports area differences row-by-row

    Both inputs are reprojected to EPSG:3857, merged 1:1 on join_col_name, and
    for every matched row the area of the symmetric difference of the two
    shapes is computed in km^2. Rows above area_threshold are printed and
    plotted (left shape orange, right shape blue, overlap green). A binned
    summary of all differences is printed at the end.

    NOTE: EPSG:3857 (Web Mercator) does not preserve area, so the reported
    figures are approximate.

    Args:
      gdf_1: Left GeoDataFrame; assumed to have its geometry in a column named "geometry"
      gdf_2: Right GeoDataFrame; assumed to have its geometry in a column named "geometry"
      left_gdf_name: String used for the left shapes in the plot legend
      right_gdf_name: String used for the right shapes in the plot legend
      join_col_name: String of the column name to join on (values must be unique in each input)
      shp_names: String describing the shapes (e.g. "precincts"), used in the printed report
      area_threshold: Difference in km^2 above which a row is printed and plotted

    Returns:
      Nothing, only prints out an analysis and creates plots
    '''
    #EPSG:3857 coordinates are in metres, so areas come out in m^2
    sq_m_per_sq_km = 1e6

    gdf_1 = gdf_1.to_crs(3857)
    gdf_2 = gdf_2.to_crs(3857)
    both = pd.merge(gdf_1,gdf_2,how="outer",on=join_col_name,validate="1:1",indicator=True)

    #Rows found in only one input ("left_only" / "right_only") are reported and dropped
    unmatched = both[both["_merge"]!="both"]
    if not unmatched.empty:
        print("Non-unique merge values")
        print(unmatched)
    both = both[both["_merge"]=="both"].reset_index(drop=True)

    left_geoms = gp.GeoDataFrame(both,geometry="geometry_x")
    right_geoms = gp.GeoDataFrame(both,geometry="geometry_y")
    #buffer(0) is the usual way to clean up invalid geometries before overlay operations
    left_geoms["geometry_x"]=left_geoms.buffer(0)
    right_geoms["geometry_y"]=right_geoms.buffer(0)

    count = 0
    area_list = []
    print("Checking " + str(both.shape[0])+" " + shp_names + " for differences of greater than "+str(area_threshold)+" km^2")
    print()
    #The index was reset above, so positions and labels coincide
    for index in range(len(both)):
        left_shape = left_geoms.iloc[[index]]
        right_shape = right_geoms.iloc[[index]]
        diff = left_shape.symmetric_difference(right_shape)
        area = float(diff.area.iloc[0])/sq_m_per_sq_km
        area_list.append(area)
        if not area > area_threshold:
            continue

        count += 1
        name = left_geoms.at[index,join_col_name]
        print(str(count)+") For " + str(name) + " difference in area is " + str(area))

        #Plot both shapes, plus their overlap when there is one
        intersection = left_shape.intersection(right_shape)
        base = left_shape.plot(color="orange",figsize=(10,10))
        right_shape.plot(color="blue",ax=base)
        if not intersection.iloc[0].is_empty:
            intersection.plot(color="green",ax=base)
        base.set_title(name)
        custom_lines = [Line2D([0], [0], color='green', lw=4),
                        Line2D([0], [0], color='orange', lw=4),
                        Line2D([0], [0], color='blue', lw=4)]
        base.legend(custom_lines, ['Overlap', left_gdf_name,right_gdf_name])

    #A float Series (rather than a DataFrame) so the summary also works when nothing matched
    areas = pd.Series(area_list, dtype=float)
    print()
    print("Scroll down to see plots of any differences")
    print()
    print("Of the "+ str(both.shape[0])+" "+ shp_names +":")
    print()
    print(str(int((areas==0).sum()))+" " + shp_names + " w/ a difference of 0 km^2")
    print(str(int(((areas<.1) & (areas>0)).sum()))+" " + shp_names + " w/ a difference between 0 and .1 km^2")
    print(str(int(((areas<.5) & (areas>=.1)).sum()))+" " + shp_names + " w/ a difference between .1 and .5 km^2")
    print(str(int(((areas<1) & (areas>=.5)).sum()))+" " + shp_names + " w/ a difference between .5 and 1 km^2")
    print(str(int(((areas<2) & (areas>=1)).sum()))+" " + shp_names + " w/ a difference between 1 and 2 km^2")
    print(str(int(((areas<5) & (areas>=2)).sum()))+" " + shp_names + " w/ a difference between 2 and 5 km^2")
    print(str(int((areas>=5).sum()))+" " + shp_names + " w/ a difference greater than 5 km^2")