-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathstats133_election_project.Rmd
1466 lines (1151 loc) · 80.2 KB
/
stats133_election_project.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
---
title: "Election Project"
author: "Adnan Hemani, Scott Numamoto, Nami Saghaei, Kian Taylor, Marisa Wong"
date: "December 12, 2016"
output: html_document
---
```{r setup}
# Chunk-evaluation flags: each eval= option in the chunks below is gated by
# one of these, so individual pipeline stages can be toggled off while
# developing without re-running everything.
loadGeo <- TRUE
load2016 <- TRUE
load2012 <- TRUE
load2008 <- TRUE
load2004 <- TRUE
loadCensus <- TRUE
# ---- merge / analysis stages ----
merge_2004_2008 <- TRUE
merge_2016 <- TRUE
merge_2012 <- TRUE
merge_latlon <- TRUE
merge_census <- TRUE
analysis <- TRUE
# Controls echo= for every chunk: set TRUE to show the R code in the render.
show_code <- FALSE
# NOTE(review): setwd() cannot take a Google Drive URL; kept only as a record
# of where the shared project folder lives.
#setwd("https://drive.google.com/drive/u/1/folders/0B9f_E-erNqh8cDFiZ3dxZkhCd1E")
```
### Require the following packages
```{r, eval=TRUE, echo=show_code}
# Attach every package the document needs. library() is used instead of
# require(): require() merely warns (and returns FALSE) when a package is
# missing, letting the document continue and fail later with a confusing
# error, whereas library() stops immediately with a clear message.
library(XLConnect)    # read .xlsx workbooks (2008 election data)
library(RCurl)        # fetch web pages (2004 Virginia Wikipedia table)
library(XML)          # parse HTML/XML (2012 and 2004 data)
library(xml2)         # namespace-aware XML parsing (county lat/lon GML)
library(ggplot2)      # plotting
library(maps)         # US map outlines
library(RColorBrewer) # color palettes
library(rpart)        # classification trees
library(rpart.plot)   # tree plotting
library(class)        # k-nearest neighbors
```
### Function created to take duplicate county names and create distinct entries
(done by Kian Taylor)
When doing merges, we found that different data frames did or didn't use the word "city" in their county names. Initially, we eliminated the word "city" from all county names; however, upon further analysis, we found that this created duplicate entries for county name and state. Example: in Virginia, Franklin County is different from Franklin City County, but the two were viewed identically when we eliminated the word "city." To restore uniqueness, we added the number "2" before duplicate cities, turning Franklin County into "franklin" and Franklin City County into "2franklin."
```{r, eval=TRUE, echo=show_code}
#' Disambiguate duplicate (county, state) combinations.
#'
#' Some states contain both an "X County" and an "X city" county (e.g.
#' Franklin County vs. Franklin City County in Virginia). After the word
#' "city" is stripped from county names these become indistinguishable, so
#' every duplicate occurrence gets the prefix "2" (Franklin City County
#' becomes "2franklin"), restoring a unique (county, state) key.
#'
#' @param Data data frame with one row per county.
#' @param state_index column index (or name) of the state column.
#' @param county_index column index (or name) of the county-name column.
#' @return `Data` with duplicated county names prefixed by "2". Rows are
#'   reordered (all first occurrences, then the renamed duplicates) and the
#'   temporary key column is removed.
modify_duplicates = function(Data, state_index, county_index) {
  x = Data
  # Temporary key: "<county> <state>" identifies a county uniquely only when
  # the names are already distinct.
  x$id = paste(x[[county_index]], x[[state_index]])
  # Rows whose key was already seen (second and later occurrences).
  y = x[duplicated(x[, "id"]), ]
  # Prefix "2" to each duplicate's name. Only all-lowercase names match,
  # which reproduces the behavior of the original regex
  # '^([a-z]{0})([a-z]+)$' with its ambiguous '\\12\\2' replacement.
  y[[county_index]] = sub("^([a-z]+)$", "2\\1", y[[county_index]])
  # First occurrences keep their original names.
  x = x[!duplicated(x[, "id"]), ]
  # Recombine and drop the temporary key. Name-based selection is safer than
  # -which(names(Data) %in% "id"), which selects zero columns when "id" is
  # absent.
  Data = rbind(x, y)
  Data = Data[names(Data) != "id"]
  return(Data)
}
#Help taken from:
#http://stackoverflow.com/questions/13863599/insert-a-character-at-a-specific-location-in-a-string
```
##Part 1: Data Wrangling
### Load in Latitude and Longitude Data
(done by Scott Numamoto)
First, we create the initial setup for parsing the XML document, including some intermediate functions for formatting. Then we extract all the name tags for each of the counties, remove the word 'County' from each name, and extract the state information. Finally, we extract all of the X and Y tags, convert them to integers, and merge our resulting data into a dataframe. After we have a dataframe, we check our data to make sure it accurately reflects the US, refine the dataframe a bit, including adding some data manually, and then clean up our global environment and save our dataframe.
The xml2 package was used rather than the XML package. The XML package presented errors with namespaces for the document. After working with a GSI for a while during lab, he recommended using the xml2 package instead. One of the effects of this switch is the lack of a xmlapply or xmlsapply function. The apply and sapply functions were used instead.
```{r, eval=loadGeo, echo=show_code}
#Initial setup for parsing the GML document of county locations. The xml2 package was used as opposed to the XML package due to namespace errors with this document.
xml_doc = read_xml("http://www.stat.berkeley.edu/~nolan/data/voteProject/counties.gml")
#Extract every gml:name tag under a county node (one per county).
county_names_xml = xml_find_all(xml_doc, "/doc/state/county/gml:name")
#The second argument (trim = TRUE) strips surrounding whitespace from each name.
county_names = xml_text(county_names_xml, TRUE)
#Remove the word County from the end of each name.
county_names = gsub(" County", "", county_names)
#Extract the state name and abbreviation from each state node.
states_xml = xml_find_all(xml_doc, "/doc/state/gml:name")
state_abbr = xml_attr(states_xml, "abbreviation")
state_names = xml_text(states_xml, TRUE)
#The absolute XML path of each county node, e.g. "/doc/state[12]/county[3]/...".
county_paths = sapply(county_names_xml, xml_path)
#Pull the first run of digits out of a county's XML path (the state's index
#within the document, e.g. "/doc/state[12]/county[3]" -> 12), as a number.
extract_state_number = function(path) {
  first_digits = regmatches(path, regexpr("[0-9]+", path))[[1]]
  as.numeric(first_digits)
}
#The number of each county's state, as numbered in the XML doc.
county_state_number = sapply(county_paths, extract_state_number)
#Get state name for each county, based on the county state number.
county_state_name = sapply(county_state_number, function(number) {
state_names[number]
})
#Get state abbreviations for each county, based on the county state number.
county_state_abbreviation = sapply(county_state_number, function(number) {
state_abbr[number]
})
#Extract all the X and Y tags; dividing by 10^6 rescales the stored integer
#values to ordinary longitude/latitude degrees.
x_coord_xml = xml_find_all(xml_doc, "/doc/state/county/gml:location/gml:coord/gml:X")
x_coord = as.numeric(xml_text(x_coord_xml, TRUE)) / 10^6
y_coord_xml = xml_find_all(xml_doc, "/doc/state/county/gml:location/gml:coord/gml:Y")
y_coord = as.numeric(xml_text(y_coord_xml, TRUE)) / 10^6
#Merge the data together into a dataframe. Two name columns are kept on
#purpose: `name` preserves the original capitalization (used later to match
#entries such as "Baltimore city"), while `county_names` is lowercased for
#merging with the election data.
county_locations = data.frame(county_names)
county_locations$name = county_names
county_locations$x_coord = x_coord
county_locations$y_coord = y_coord
county_locations$state = county_state_name
county_locations$state_abbr = county_state_abbreviation
county_locations$state = tolower(county_locations$state)
county_locations$county_names = tolower(county_locations$county_names)
head(county_locations)
```
Above we can see that the extraction was successful for the first several counties. Next we can check the data on a larger scale.
Here we plot the data to see if it accurately resembles the U.S. and ensure no large chunks are missing.
```{r, eval=loadGeo, echo=show_code}
#Scatter one point per county at its (longitude, latitude) as a sanity check that the extracted coordinates trace the outline of the US.
ggplot(county_locations, aes(x_coord, y_coord)) + geom_point() + ggtitle("Counties in the US") + xlab("Longitude") + ylab("Latitude")
```
The graph closely resembles the U.S. so the extraction has been successful.
```{r, eval=loadGeo, echo=show_code}
#Modifying names for merge: strip suffixes, whitespace, and punctuation so
#county/state names line up with the election data frames. Order matters:
#e.g. "city" must be removed before the "2..." renames below.
county_locations$county_names = gsub("parish", "", county_locations$county_names)
county_locations$county_names = gsub("city", "", county_locations$county_names)
county_locations$county_names = gsub(" ", "", county_locations$county_names)
county_locations$county_names = gsub("\\.", "", county_locations$county_names)
county_locations$county_names = gsub("districtofcolumbia", "district-of-columbia", county_locations$county_names)
county_locations$state = gsub("district of columbia", "district-of-columbia", county_locations$state)
county_locations$county_names = gsub("jeffdavis", "jeffersondavis", county_locations$county_names)
county_locations$county_names = gsub("'", "", county_locations$county_names)
county_locations$county_names = gsub("censusarea", "", county_locations$county_names)
county_locations$county_names = gsub("miami-dade", "dade", county_locations$county_names)
county_locations$county_names = gsub("saint", "st", county_locations$county_names)
#Add Broomfield, CO in manually (missing from the GML document).
#Data provided by https://en.wikipedia.org/wiki/Broomfield,_Colorado.
#NOTE(review): rbind-ing a character vector coerces the numeric coordinate
#columns to character; they are converted back to numeric at the bottom of
#this chunk.
broomsRow = c("broomfield", "Broomfield", -105.052038, 39.953302, "colorado", "CO")
county_locations = rbind(county_locations, broomsRow)
#Rename certain "city" counties using the original-capitalization `name`
#column, matching the "2"-prefix convention of modify_duplicates.
county_locations[county_locations$name == "Baltimore city",]$county_names = "2baltimore"
county_locations[county_locations$name == "St. Louis city",]$county_names = "2stlouis"
county_locations[county_locations$name == "Richmond city",]$county_names = "2richmond"
county_locations[county_locations$name == "Roanoke city",]$county_names = "2roanoke"
county_locations[county_locations$name == "Fairfax city",]$county_names = "2fairfax"
county_locations[county_locations$name == "Franklin city",]$county_names = "2franklin"
county_locations[county_locations$name == "Bedford city",]$county_names = "2bedford"
#Removing Alaska county/area data - Alaska is treated as only one county; the boroughs and census areas in this document are not needed.
county_locations = county_locations[county_locations$state != "alaska", ]
#Drop a few special-case entities (Kalawao, HI and Clifton Forge / South Boston, VA) - presumably absent from the election data; confirm against the merges.
county_locations = county_locations[!county_locations$county_names %in% c("kalawao", "cliftonforge", "southboston"),]
#Convert the x and y coordinates back to numerics (undoing the rbind coercion above).
county_locations$x_coord = as.numeric(county_locations$x_coord)
county_locations$y_coord = as.numeric(county_locations$y_coord)
```
After fixing the naming of counties and adding some special cases, we check that the classes within the dataframe are correct and that the map still closely resembles the U.S.
```{r, eval=loadGeo, echo=show_code}
#Check the data type of each of the columns. Coordinates should be numeric after the conversion above; the rest are character.
sapply(county_locations, class)
```
All classes are as expected.
```{r, eval=loadGeo, echo=show_code}
#Replot the map to check that Alaska has properly been removed and Broomfield added without distorting the outline.
ggplot(county_locations, aes(x_coord, y_coord)) + geom_point() + ggtitle("Counties in the US") + xlab("Longitude") + ylab("Latitude")
```
The map does indeed resemble the U.S.
```{r, eval=loadGeo, echo=show_code}
#Take a look at summary to ensure the Broomfield and Alaska modifications
#didn't cause any major problems with missing values or misalignments, and
#that the data was properly modified for merge.
summary(county_locations)
```
Reviewing the summary of the data reveals no extreme surprises. The extraction pulled out 3113 counties from the data.
```{r, eval=loadGeo, echo=show_code}
#Clean up global environment by removing intermediate values.
#We do this rather frequently so that our global environment only has the necessary data. It keeps memory use down and makes the code easier to follow step by step, showing only the variables pertinent for each step.
rm(county_names, county_names_xml, county_paths, county_state_abbreviation, county_state_name, county_state_number, state_abbr, state_names, states_xml, x_coord, x_coord_xml, xml_doc, y_coord, y_coord_xml, extract_state_number, broomsRow)
#Save the dataframe so later chunks (and re-runs) can load() it instead of re-scraping the GML document.
save(county_locations, file = "county_locations.rda")
```
### Load in 2016 Data from Github
(done by Kian Taylor)
Because this is a single csv file, it is a simple matter of loading it into R as a data frame. We then changed the column names to represent the year the data was taken from. Some smaller adjustments were made, like changing the character string for Washington DC or eliminating periods, spaces, and apostrophes, all in order to make the different data frames match up. What was tedious for this data set was changing state abbreviations to state names. We took a character string with all the state names and their respective abbreviations and converted it into a data frame. We then merged that with our csv and removed extra columns, ultimately converting our state abbreviation column into a state name column. We finish with cleaning up the dataframe and our global environment and saving our data.
```{r, eval=load2016, echo=show_code}
url2016 = "http://www.stat.berkeley.edu/users/nolan/data/voteProject/2016_US_County_Level_Presidential_Results.csv"
Data2016 = read.csv(url2016)
#Remove Alaska redundancies (the first 28 rows all describe Alaska).
#NOTE(review): hard-coded row positions assume the csv's row order never changes.
Data2016 = Data2016[-c(1:28),]
#Re-index.
rownames(Data2016) = 1:nrow(Data2016)
#Add election year suffix (.16) to the data columns.
Data2016$votes_dem.16 = Data2016$votes_dem
Data2016$votes_gop.16 = Data2016$votes_gop
Data2016$total_votes.16 = Data2016$total_votes
Data2016$per_dem.16 = Data2016$per_dem
Data2016$per_gop.16 = Data2016$per_gop
#NOTE(review): if `diff`/`per_point_diff` were read in as factors (the
#read.csv default before R 4.0), as.numeric() returns the factor level
#codes, not the printed numbers; verify, or use as.numeric(as.character(.)).
Data2016$diff.16 = as.numeric(Data2016$diff)
Data2016$per_point_diff.16 = as.numeric(Data2016$per_point_diff)
#Remove the original (un-suffixed) columns.
Data2016 = Data2016[-c(1:8,11)]
#Clean up county names to match the other data frames.
Data2016$county_name = tolower(Data2016$county_name)
Data2016$county_name = gsub("[\\'[:space:]*\\.]", "", Data2016$county_name)
Data2016$county_name = gsub("county", "", Data2016$county_name)
Data2016$county_name = gsub("parish", "", Data2016$county_name)
#NOTE(review): hard-coded row 289 assumed to be Washington DC - verify.
Data2016$county_name[289] = "district-of-columbia"
#Convert state abbreviations to state names.
##state_info taken from http://www.whypad.com/posts/excel-spreadsheet-of-us-states/583/
state_info = "ALABAMA Alabama AL
ALASKA Alaska AK
ARIZONA Arizona AZ
ARKANSAS Arkansas AR
CALIFORNIA California CA
COLORADO Colorado CO
CONNECTICUT Connecticut CT
DISTRICT-OF-COLUMBIA district-of-columbia DC
DELAWARE Delaware DE
FLORIDA Florida FL
GEORGIA Georgia GA
HAWAII Hawaii HI
IDAHO Idaho ID
ILLINOIS Illinois IL
INDIANA Indiana IN
IOWA Iowa IA
KANSAS Kansas KS
KENTUCKY Kentucky KY
LOUISIANA Louisiana LA
MAINE Maine ME
MARYLAND Maryland MD
MASSACHUSETTS Massachusetts MA
MICHIGAN Michigan MI
MINNESOTA Minnesota MN
MISSISSIPPI Mississippi MS
MISSOURI Missouri MO
MONTANA Montana MT
NEBRASKA Nebraska NE
NEVADA Nevada NV
NEW HAMPSHIRE New Hampshire NH
NEW JERSEY New Jersey NJ
NEW MEXICO New Mexico NM
NEW YORK New York NY
NORTH CAROLINA North Carolina NC
NORTH DAKOTA North Dakota ND
OHIO Ohio OH
OKLAHOMA Oklahoma OK
OREGON Oregon OR
PENNSYLVANIA Pennsylvania PA
RHODE ISLAND Rhode Island RI
SOUTH CAROLINA South Carolina SC
SOUTH DAKOTA South Dakota SD
TENNESSEE Tennessee TN
TEXAS Texas TX
UTAH Utah UT
VERMONT Vermont VT
VIRGINIA Virginia VA
WASHINGTON Washington WA
WEST VIRGINIA West Virginia WV
WISCONSIN Wisconsin WI
WYOMING Wyoming WY
"
#Split the table into lines, then each line into its 3 tab-separated fields.
temp = strsplit(state_info, '\n')
temp = unlist(temp)
temp = strsplit(temp, "\t")
temp = unlist(temp)
#Field 3i is state i's abbreviation; field 3i-2 is its all-caps name (lowercased here). 1:50 covers the first 50 entries only.
state_df = data.frame(abb = c(temp[3 * 1:50]), name = c(tolower(temp[(3 * 1:50) - 2])))
state_df = sapply(state_df, as.character)
#Wyoming is the 51st entry and is missed by the 1:50 index above, so append it manually.
state_df = rbind(state_df, c("WY", "wyoming"))
Data2016 = merge(state_df, Data2016, by.x = "abb", by.y = "state_abbr")
#Drop the abbreviation column, leaving the full state name.
Data2016 = Data2016[-1]
#Change specific county names that differ across election data.
Data2016$county_name = gsub("miami-dade", "dade", Data2016$county_name)
Data2016$county_name = gsub("jeffdavis", "jeffersondavis", Data2016$county_name)
Data2016$county_name = gsub("oglala", "shannon", Data2016$county_name)
Data2016$county_name = gsub("county", "", Data2016$county_name)
Data2016$county_name = gsub("city", "", Data2016$county_name)
#Disambiguate duplicate (county, state) names with the shared helper.
Data2016 = modify_duplicates(Data2016, 1, 2)
#Look at summary to make sure that specific county names were changed properly, that there aren't any other inconsistencies with the rest of the data, and that formatting is correct.
summary(Data2016) #great
#Clean up global environment by removing unnecessary values.
rm(state_df, state_info, temp, url2016)
#Save our dataframe for later use.
save(Data2016, file = "Data2016.rda")
```
### Load in 2012 Data from Politico
(done by Marisa Wong)
First we obtain a list of state names and create a dataframe of URLs for each state. We then remove data pertaining to Alaska due to issues with availability of data, extract our county information, and create a vector associating counties with their respective states. We then obtain the popular vote information for Obama and Romney, before constructing a dataframe associating each county with percent vote info for Obama and Romney. Finally, we clean up the dataframe to match the standard configuration for dataframes, clean up our global environment, and save our dataframe.
```{r, eval=load2012, echo=show_code}
#Obtain a list of state names in alphabetical order.
stateNames = read.table("http://www.stat.berkeley.edu/~nolan/data/voteProject/countyVotes2012/stateNames.txt")
#Drop rows 1 and 3 - NOTE(review): presumably the header-like first entry and Alaska (which has no county-level 2012 data); verify against stateNames.txt.
stateNames = stateNames[-c(1, 3),]
stateNames = lapply(stateNames, as.character)
stateNames = unlist(stateNames)
#Creates a data frame of URLs for each state's Politico results page.
url2012 = "http://www.stat.berkeley.edu/~nolan/data/voteProject/countyVotes2012/"
stateURL = lapply(stateNames,
function(state) paste(url2012, state, ".xml", sep = ""))
stateURL = cbind(unlist(stateURL))
#Parse every remaining state's XML page (Alaska was already excluded above).
stateDocs = lapply(stateURL, function(url) xmlParse(url))
stateRoots = lapply(stateDocs, function(doc) xmlRoot(doc))
stateNodes = lapply(stateDocs, function(state) getNodeSet(state, "//tbody[@id]"))
#Obtaining county ids for each state.
counties = lapply(stateNodes,
function(state) lapply(state, xmlGetAttr, "id"))
numStates = sapply(counties, length)
counties = unlist(counties)
counties = sapply(counties, function(county) strsplit(county, split = "county")[[1]][2])
#Obtaining county names (the part of the header cell before "100.0%").
countyNames = unlist(lapply(stateDocs,
function(state) {
stateNodeSet = getNodeSet(state, '//th[@class = "results-county"]')
xmlSApply(stateNodeSet, xmlValue)
}))
countyNames = countyNames[countyNames != "County"]
countyNames = sapply(countyNames, function(name) strsplit(name, split = "100.0%")[[1]][1])
countyNames = tolower(countyNames)
#Creating a vector of states associated with each county.
#NOTE(review): the argument is spelled `time` but rep()'s formal is `times`; it works via partial matching - consider spelling it out.
states = rep(stateNames, time = numStates)
#Obtaining the popular vote count for Obama (winner rows carry an extra class).
obama = unlist(lapply(stateRoots,
function(state) {
stateNodeSet = getNodeSet(state, '//tr[@class = "party-democrat" or @class = "party-democrat race-winner"]/td[@class="results-popular"]')
xmlSApply(stateNodeSet, xmlValue)
}))
#Obtaining the popular vote count for Romney.
romney = unlist(lapply(stateRoots,
function(state) {
stateNodeSet = getNodeSet(state, '//tr[@class = "party-republican" or @class = "party-republican race-winner"]/td[@class="results-popular"]')
xmlSApply(stateNodeSet, xmlValue)
}))
#Gets rid of white spaces and commas in the obama and romney vectors. Converts these from character to numeric vectors.
obama = gsub("\\s|,", "", obama)
obama = as.numeric(obama)
romney = gsub("\\s|,", "", romney)
romney = as.numeric(romney)
#Creates a data frame of state, county name, and raw vote counts for Obama and Romney.
Data2012 = data.frame(states, countyNames, obama, romney)
colnames(Data2012) = c("State", "County Name", "ObamaVotes.12", "RomneyVotes.12")
rownames(Data2012) = 1:nrow(Data2012)
#Clean up county and state names to match other data frames.
Data2012$`County Name` = gsub("[\\'[:space:]*\\.]", "", Data2012$`County Name`)
Data2012$`County Name` = gsub("[0-9]+.*", "", Data2012$`County Name`)
Data2012$`County Name` = gsub("saint", "st", Data2012$`County Name`)
Data2012$`County Name` = gsub("districtofcolumbia", "district-of-columbia", Data2012$`County Name`)
Data2012$`County Name` = gsub("miami-dade", "dade", Data2012$`County Name`)
Data2012$`County Name` = gsub("jeffdavis", "jeffersondavis", Data2012$`County Name`)
Data2012$`County Name` = gsub("county", "", Data2012$`County Name`)
Data2012$`County Name` = gsub("city", "", Data2012$`County Name`)
Data2012$`County Name` = gsub("brooklyn", "kings", Data2012$`County Name`)
Data2012$`County Name` = gsub("manhattan", "newyork", Data2012$`County Name`)
Data2012$`County Name` = gsub("statenisland", "richmond", Data2012$`County Name`)
Data2012$`County Name` = gsub("city", "", Data2012$`County Name`)
Data2012$State = gsub("-", " ", Data2012$State)
Data2012$State = gsub("district of columbia", "district-of-columbia", Data2012$State)
#Disambiguate duplicate (county, state) names with the shared helper.
Data2012 = modify_duplicates(Data2012, 1, 2)
#Clean up global environment by removing unnecessary values.
rm(stateURL, counties, countyNames, numStates, obama, romney, stateDocs, stateNames, stateNodes, stateRoots, states, url2012)
#Save dataframe to be loaded later.
#NOTE(review): saved as lowercase "data2012.rda", unlike the other years ("Data2016.rda" etc.) - confirm later load() calls use the same name.
save(Data2012, file = "data2012.rda")
```
### Load in 2008 Data from The Guardian
(done by Adnan Hemani)
We first downloaded the xlsx file from Prof. Nolan's website and then read it in using the XLConnect package. Then we made a vector of all of the states, each repeated as many times as the number of counties it had. Given that there was a problem with Mississippi in that there were a couple of extra rows with no needed data, we removed those rows from the data frame. We then take Washington DC's data from the first sheet with all of the states' results and then add that to our data frame as well. We then format the data properly by making the respective columns numerics and lowercasing the columns with characters. Then we modify the data we have so that it's easier to merge with everyone else's data. We then renamed the columns so that they're more descriptive and then clear any unused variables from the global environment. We included Washington DC's data, because we felt it was still important to represent them as they still have three electoral votes in the Electoral College and also because the census and our other sources also have data on them. We renamed some of the counties, as our group had decided on a few conventions to make sure that all of the counties would be able to merge later on.
```{r, eval=load2008, echo=show_code}
#Downloading the workbook and reading every per-state sheet into one data frame.
url2008 = "http://www.stat.berkeley.edu/users/nolan/data/voteProject/countyVotes2008.xlsx"
#NOTE(review): the URL is .xlsx but the tempfile extension is .xls; XLConnect may key off the extension - confirm the workbook reads correctly. On Windows, download.file of a binary file also needs mode = "wb".
tmp = tempfile(fileext = ".xls")
download.file(url = url2008, destfile = tmp)
wb = loadWorkbook(tmp)
statesWorksheets = readWorksheetFromFile(file = tmp, sheet = getSheets(wb), header = TRUE, startRow = 1, endRow = 260)
#Sheet 1 is the national totals; the rest are per-state county tables.
Data2008 = do.call("rbind", unname(statesWorksheets[-1]))
#Build the states vector (sheet name repeated once per county row) and attach it.
states_labels = sapply(statesWorksheets[-1], function(x){nrow(x)})
states = rep(getSheets(wb)[-1], states_labels)
Data2008$state = states
#Fix Mississippi's data: drop two extra rows with no needed data.
#NOTE(review): hard-coded row positions assume the workbook layout never changes.
Data2008 = Data2008[-c(1454, 1455), ]
#DC has no county sheet; pull its totals from the national sheet instead.
total_results = data.frame(statesWorksheets[1])
dc = total_results[total_results$Total.results.STATE == "D.C.",]
#-1 is used as a sentinel for the precinct/other fields not available for DC.
dc_vector = c('district-of-columbia', -1, -1, dc$Total.results.OBAMA, dc$Total.results.MCCAIN, -1, 'district-of-columbia')
Data2008 = rbind(Data2008, dc_vector)
#Classifying numerics correctly, lowercasing strings, removing spaces from strings. (The rbind of a character vector above coerced every column to character.)
Data2008$state = tolower(Data2008$state)
Data2008$Total.Precincts. = as.numeric(Data2008$Total.Precincts.)
Data2008$Precincts.Reporting. = as.numeric(Data2008$Precincts.Reporting.)
Data2008$Obama. = as.numeric(Data2008$Obama.)
Data2008$McCain. = as.numeric(Data2008$McCain.)
Data2008$Other = as.numeric(Data2008$Other)
Data2008$County. = tolower(Data2008$County.)
Data2008$County. = gsub("[\\'[:space:]*\\.]", "", Data2008$County.)
#Modifying county names to match the other data frames for merging.
Data2008$County. = gsub("miami-dade", "dade", Data2008$County.)
Data2008$County. = gsub("county", "", Data2008$County.)
Data2008$County. = gsub("lewis&clark", "lewisandclark", Data2008$County.)
Data2008$County. = gsub("jeffdavis", "jeffersondavis", Data2008$County.)
Data2008$County. = gsub("saint", "st", Data2008$County.)
Data2008$County. = gsub("statenisland", "richmond", Data2008$County.)
Data2008$County. = gsub("manhattan", "newyork", Data2008$County.)
Data2008$County. = gsub("brooklyn", "kings", Data2008$County.)
Data2008$County. = gsub("county", "", Data2008$County.)
Data2008$County. = gsub("city", "", Data2008$County.)
#Copy the data columns under names suffixed with the election year (.08).
Data2008$Total.Precincts.08 = as.numeric(Data2008$Total.Precincts.)
Data2008$Precincts.Reporting.08 = as.numeric(Data2008$Precincts.Reporting.)
Data2008$Obama.08 = as.numeric(Data2008$Obama.)
Data2008$McCain.08 = as.numeric(Data2008$McCain.)
Data2008$Other.08 = as.numeric(Data2008$Other)
#Drop the original (un-suffixed) data columns.
Data2008 = Data2008[-c(2:6)]
#Disambiguate duplicate (county, state) names with the shared helper.
Data2008 = modify_duplicates(Data2008, 2, 1)
#Clean up global environment by removing unnecessary values.
rm(dc, total_results, dc_vector, states, states_labels, statesWorksheets, tmp, url2008, wb)
#Save dataframe.
save(Data2008, file = "Data2008.rda")
```
### Load in 2004 Data from Professor Nolan
(done by Kian Taylor and Adnan Hemani)
Opening the file in a plain text editor, one can see that it is a space-delimited file containing the state, county name, number of votes for Bush, and number of votes for Kerry. We then separated county name and state into two separate columns. Column names were changed to reflect the year from which the data was taken. Minor adjustments were made so that county names matched up across data frames. We finished up by cleaning up the global environment and saving our dataframe.
```{r,eval=load2004, echo=show_code}
#Load the whitespace-delimited raw data into the global environment.
url2004 = "http://www.stat.berkeley.edu/users/nolan/data/voteProject/countyVotes2004.txt"
Data2004 = read.delim(url2004, header = TRUE, sep = "")
#Separate county names from states: column 1 holds "county,state".
Data2004[1] = sapply(Data2004[[1]], as.character)
temp = strsplit(Data2004[[1]], ',')
Data2004$state = sapply(temp, function(x) x[1])
Data2004$countyName = sapply(temp, function(x) x[2])
rm(temp)
#Virginia is missing from the file, so scrape its 2004 results from Wikipedia.
wikiURL = "https://en.wikipedia.org/wiki/United_States_presidential_election_in_Virginia,_2004"
va2004pageContents = getURLContent(wikiURL)
va2004Doc = htmlParse(va2004pageContents)
va2004Root = xmlRoot(va2004Doc)
#Locate the results table by anchoring on the Accomack County link and walking up three levels to the enclosing table.
va2004Table = getNodeSet(va2004Root,
"//table//td/a[@title='Accomack County, Virginia']/../../..")
nrows = xmlSize(va2004Table[[1]])
#Split each table row on newlines into a character matrix (row 1 = header).
tableChar = do.call(rbind,
sapply(1:nrows,
function(i) strsplit(xmlValue(va2004Table[[1]][[i]]), "\n")))
#Strip percent signs and thousands separators, then convert to numbers.
valsVA2004 = apply(tableChar[-1, -1], 2, function(vec) {
as.numeric(gsub("[%,]", "", vec))
})
valsVA2004 = valsVA2004[,-c(1,3)]
VAcounties = tableChar[,c(1)]
VAcounties = VAcounties[-c(1)]
virginia = rep("virginia", each = length(VAcounties))
dataVA2004 = cbind(virginia, VAcounties, valsVA2004)
dataVA2004 = dataVA2004[, -c(5,6)]
#NOTE(review): as.data.frame of a character matrix makes every column a factor (pre-R 4.0); see the as.numeric note below.
dataVA2004 = as.data.frame(dataVA2004)
dataVA2004$VAcounties = sapply(dataVA2004$VAcounties, function(x){tolower(as.character(x))})
#NOTE(review): confirm the kerryVote/bushVote column order against the Wikipedia table layout.
colnames(dataVA2004) = c("state", "countyName", "kerryVote", "bushVote")
#NOTE(review): rbind matches data-frame columns by name; this relies on Data2004 having exactly the columns state/countyName/kerryVote/bushVote here (the header of the txt file likely names its first column countyName, which the assignment above overwrites) - verify.
Data2004 = rbind(Data2004, dataVA2004)
#Redefine specific county names to match other data frames and remove unnecessary characters.
Data2004 = Data2004[order(Data2004$state),]
Data2004$countyName = gsub("[\\'[:space:]*\\.]", "", Data2004$countyName)
row.names(Data2004) = 1:nrow(Data2004)
#NOTE(review): hard-coded row 291 assumed to be Washington DC - verify.
Data2004$countyName[291] = "district-of-columbia"
Data2004$state[291] = "district-of-columbia"
Data2004$countyName = gsub("jeffdavis", "jeffersondavis", Data2004$countyName)
Data2004$countyName = gsub("county", "", Data2004$countyName)
Data2004$countyName = gsub(",virginia", "", Data2004$countyName)
Data2004$countyName = gsub("city", "", Data2004$countyName)
#Copy the data columns under names suffixed with the election year (.04).
Data2004$bushVote.04 = Data2004$bushVote
Data2004$kerryVote.04 = Data2004$kerryVote
Data2004 = Data2004[-c(2,3)]
#Look at summary to make sure that Virginia was inserted properly and didn't cause any misalignments, that there aren't any other inconsistencies with the rest of the group's dataframes, and that formatting is correct.
summary(Data2004) #great
#Make votes into numerical data.
#NOTE(review): if these columns became factors via the Virginia rbind above, as.numeric() yields factor level codes rather than vote counts; verify, or use as.numeric(as.character(.)).
Data2004$bushVote.04 = as.numeric(Data2004$bushVote.04)
Data2004$kerryVote.04 = as.numeric(Data2004$kerryVote.04)
#Disambiguate duplicate (county, state) names with the shared helper.
Data2004 = modify_duplicates(Data2004, 2, 1)
#Clean up global environment by removing unnecessary values.
rm(url2004, dataVA2004, valsVA2004, va2004Doc, va2004pageContents, va2004Root, va2004Table, VAcounties, virginia, wikiURL, tableChar, nrows)
#Save the dataframe.
save(Data2004, file = "Data2004.rda")
```
### Load Census Data (Interesting Variables)
(done by Nami Saghaei)
This code arranges a dataframe of census data to be used in combination with election data in our analysis. First, we read in all of the data from the three files. We quickly observe that two of the files include more counties than one. Observing closer, we realize that the census data includes Puerto Rico. We start by removing the Puerto Rico data from the two files. We then grab the population data from B103 and construct a dataframe, making sure that the state and county information lines up precisely with state and county information in the other two files, making it easy to just grab columns of particular data from those files and append them to our dataframe instead of having 35 merge statements. We then fetch our desired data and append it to the dataframe, modify the dataframe to adhere to our group's standardized dataframe format, clean up our global environment, and save the dataframe for later use.
```{r, eval=loadCensus, echo=show_code}
#Read the three census extracts straight from the course server.
B103 = read.csv("http://www.stat.berkeley.edu/users/nolan/data/voteProject/census2010/B01003.csv")
DP02 = read.csv("http://www.stat.berkeley.edu/users/nolan/data/voteProject/census2010/DP02.csv")
DP03 = read.csv("http://www.stat.berkeley.edu/users/nolan/data/voteProject/census2010/DP03.csv")
#Count the distinct county labels present in the B103 file.
combined = unique(as.character(B103$GEO.display.label))
#The three files disagree on how many counties they cover.
dim(DP02)        #3139 counties
dim(DP03)        #3217 counties
length(combined) #B103, once duplicates are dropped, also has 3217 counties.
#B103 and DP03 contain Puerto Rico data but DP02 does not, so drop Puerto Rico everywhere.
puerto_rico = setdiff(DP03$GEO.display.label, DP02$GEO.display.label)
#Drop every B103 row belonging to Puerto Rico.
B103 = B103[!B103$GEO.display.label %in% puerto_rico, ]
#Drop every DP03 row belonging to Puerto Rico.
DP03 = DP03[!DP03$GEO.display.label %in% puerto_rico, ]
#Re-check that the county counts now agree across the files.
combined = unique(as.character(B103$GEO.display.label))
length(combined) #3139 counties. Looks good.
#Pull population estimates (HD01_VD01) and their margins of error (HD02_VD01)
#out of B103, one subset per population group (1 = total, 2 = white, 4 = black).
total_population = B103[B103$POPGROUP.id == 1, c("GEO.display.label", "HD01_VD01")]
white_population = B103[B103$POPGROUP.id == 2, c("GEO.display.label", "HD01_VD01")]
white_population_error = B103[B103$POPGROUP.id == 2, c("GEO.display.label", "HD02_VD01")]
black_population = B103[B103$POPGROUP.id == 4, c("GEO.display.label", "HD01_VD01")]
black_population_error = B103[B103$POPGROUP.id == 4, c("GEO.display.label", "HD02_VD01")]
ids = DP02[, c("GEO.id2", "GEO.display.label")]
#Merge the subsets into one dataframe, keyed on the combined "county, state" label.
population = merge(total_population, white_population, by = "GEO.display.label", all = TRUE, sort = FALSE)
population = merge(population, white_population_error, by = "GEO.display.label", all = TRUE, sort = FALSE)
population = merge(population, black_population, by = "GEO.display.label", all = TRUE, sort = FALSE)
population = merge(population, black_population_error, by = "GEO.display.label", all = TRUE, sort = FALSE)
population = merge(population, ids, by = "GEO.display.label", sort = TRUE)
names(population) = c('combined', 'total_population', 'white_population', 'white_population_error', 'black_population', 'black_population_error','id')
#Rows should now match 1-to-1 between population, DP02, and DP03, so columns can
#simply be copied across by position later on.
setdiff(population$combined, DP02$GEO.display.label) # no difference
setdiff(population$combined, DP03$GEO.display.label) # no difference
#Sort by GEO.id2 so the rows line up positionally with DP02 and DP03.
census = population[order(population$id), ]
#Checking the 1-1 property. All good. 0 difference.
sum(as.character(DP02$GEO.display.label) != as.character(census$combined))
census$total_households = DP02$HC01_VC03
sum(census$total_households != DP02$HC01_VC03) # 0 -> it really did line up!!
#Fetch various data from DP02 and DP03. For units and other information, we will
#reference the text files. Column-name prefixes in the census files:
#HC01 --> Estimate
#HC02 --> Error
#HC03 --> Percent
#HC04 --> Percent Error
census$average_household_size = DP02$HC01_VC20
census$average_family_size = DP02$HC01_VC21
census$fertility = DP02$HC01_VC51
census$enrolled_hs = DP02$HC01_VC79
census$enrolled_higher_ed = DP02$HC01_VC80
census$edu_less_than_9th = DP02$HC01_VC85
census$edu_hs_no_diploma = DP02$HC01_VC86
census$edu_hs_diploma = DP02$HC01_VC87
census$edu_college_no_degree = DP02$HC01_VC88
census$edu_college_bachelors = DP02$HC01_VC90
census$edu_college_graduate_professional = DP02$HC01_VC91
#BUG FIX: these six column names previously omitted the "VC" infix (e.g.
#DP02$HC01_129), which returns NULL, so the columns were silently never created.
#TODO(review): confirm the VC numbers against the DP02 metadata text file.
census$native = DP02$HC01_VC129
census$born_us = DP02$HC01_VC130
census$born_abroad_american_parents = DP02$HC01_VC133
census$born_abroad_foreign = DP02$HC01_VC134
census$citizen_yes = DP02$HC01_VC139
census$citizen_no = DP02$HC01_VC140
#------------------------------------- DP03
census$in_labor_force_yes = DP03$HC01_VC05
census$in_labor_force_no = DP03$HC01_VC10
census$public_transportation_to_work = DP03$HC01_VC31
census$working_at_home = DP03$HC01_VC34
census$occupation_management_business_science_arts = DP03$HC01_VC41
#BUG FIX: the next four assignments previously read e.g.
#"census_occupation_service = ..." (a typo for "census$occupation_..."), which
#created stray global variables instead of columns of the census dataframe.
census$occupation_service = DP03$HC01_VC42
census$occupation_agriculture_forestry_fishing_hunting_mining = DP03$HC01_VC50
census$occupation_construction = DP03$HC01_VC51
census$occupation_education_healthcare = DP03$HC01_VC59
census$income_less_than_10000 = DP03$HC01_VC75
#Combining income brackets because there are too many.
census$income_10000_to_24999 = DP03$HC01_VC76 + DP03$HC01_VC77
census$income_25000_to_49999 = DP03$HC01_VC78 + DP03$HC01_VC79
census$income_50000_to_99999 = DP03$HC01_VC80 + DP03$HC01_VC81
census$income_100000_to_149999 = DP03$HC01_VC82
#BUG FIX: this bracket previously repeated HC01_VC82 (the 100000-149999
#estimate). HC01_VC83 is the 150000-199999 estimate, consistent with VC84
#below being the 200000+ bracket.
census$income_150000_to_199999 = DP03$HC01_VC83
census$income_more_than_200000 = DP03$HC01_VC84
#NOTE(review): blanket warning suppression kept from the original so the knit
#output does not change; targeted suppressWarnings() around the as.numeric()
#conversions below would be preferable.
options(warn=-1)
#Helpers to split the combined "county, state" label into its two pieces.
separator = function(item) {
  return(strsplit(item, ", "))
}
county_getter = function(item) { #first piece is the county
  return(item[1])
}
state_getter = function(item) { #second piece is the state
  return(item[2])
}
result = sapply(as.character(census$combined), separator)
census$states = tolower(sapply(result, state_getter))
census$counties = tolower(sapply(result, county_getter))
#Dropping old combined state and county name column.
census = census[,-1]
#Standardizing dataframe formatting for merging.
census$counties = gsub("county", "", census$counties)
census$counties = gsub(" ", "", census$counties)
census$counties = gsub("districtofcolumbia", "district-of-columbia", census$counties)
census$states = gsub("district of columbia", "district-of-columbia", census$states)
census$counties = gsub("\\.", "", census$counties)
census$counties = gsub("parish", "", census$counties)
census$counties = gsub("city", "", census$counties)
census$counties = gsub("jeffdavis", "jeffersondavis", census$counties)
census$counties = gsub("\\'", "", census$counties)
census$counties = gsub("miami-dade", "dade", census$counties)
#Removing Alaskan Counties/Boroughs and NA row.
census = census[census$states != "alaska", ]
census = census[!is.na(census$counties),]
#Observe dataframe summary to make sure standardized formatting is there, Alaska and NA were removed properly without causing any misalignments or NAs, combined name was converted to state and county columns properly without any misalignments, and that there aren't any other inconsistencies with the formatting of other dataframes.
summary(census) #great
#Make the margin-of-error columns numeric. Convert through as.character() first:
#if read.csv delivered these columns as factors, as.numeric() alone would
#silently return the factor level codes rather than the printed values.
census$white_population_error = as.numeric(as.character(census$white_population_error))
census$black_population_error = as.numeric(as.character(census$black_population_error))
#Look the state/county columns up by name instead of hard-coding positions 31
#and 32: the bug fixes above add columns, which shifts all positional indices.
census = modify_duplicates(census, which(names(census) == "states"), which(names(census) == "counties"))
#Cleaning up global environment by removing unnecessary values. (The
#census_occupation_* names are gone from this list because they are now proper
#columns of census, and total_households was never a global variable.)
rm(B103, black_population_error, black_population, DP02, DP03, ids, population, white_population, white_population_error, combined, puerto_rico, result, county_getter, separator, state_getter)
#Saving the dataframe for quick use later.
save(census, file="census.rda")
```
### Merge 2004 and 2008 election data
(done by Kian Taylor and Adnan Hemani)
Choosing to see where there was a lack of overlap of information, we set all equal to TRUE and explored from there where information might be missing. When we merge, we must do so by county name and state name. Multiple states may have the same county name, so it isn't a unique variable across a data set; however, the combination of the state name and county name is now unique (because of our created function).
```{r,eval=merge_2004_2008, echo=show_code}
#Outer-join the 2004 and 2008 dataframes on the (county, state) pair: county
#names repeat across states, but the pair is unique within the data set.
merged_04_08 = merge(Data2004, Data2008,
                     by.x = c('countyName', 'state'),
                     by.y = c('County.', 'state'),
                     all = TRUE)
#Arrange the merged rows alphabetically by state.
merged_04_08 = merged_04_08[order(merged_04_08$state), ]
#Drop observations without a county name; they are of no use to us.
merged_04_08 = merged_04_08[!is.na(merged_04_08$countyName), ]
#Reindex the rows consecutively from 1.
row.names(merged_04_08) = seq_len(nrow(merged_04_08))
```
### Merge 2004/2008 data with 2016
(done by Kian Taylor)
It appears that there are still some counties that have not reported their 2016 election data. However, we can still work with the other yearly data, so we set all equal to TRUE.
```{r,eval=merge_2016, echo=show_code}
#Outer-join the 2016 data onto the 2004/2008 merge; in Data2016 the state
#column is called 'name' and the county column 'county_name'.
merged_04_08_16 = merge(Data2016, merged_04_08,
                        by.x = c('name', 'county_name'),
                        by.y = c('state', 'countyName'),
                        all = TRUE)
#Rename the 'name' column to 'state' to match the other dataframes.
colnames(merged_04_08_16)[colnames(merged_04_08_16) == "name"] = "state"
```
### Final merge of 2004/2008/2016 data with 2012 data
(done by Kian Taylor)
Similar to our first merge, we set all equal to TRUE looking to find any errors which we would then correct manually.
```{r, eval=merge_2012, echo=show_code}
merged_04_08_12_16 = merge(Data2012, merged_04_08_16, by.x = c("State", "County Name"), by.y = c("state", "county_name"), all = TRUE)
```
### Merge 04/08/16 Data with Latitude and Longitude
(done by Adnan Hemani)
```{r, eval=merge_latlon, echo=show_code}
merged_04_08_12_16_lat_lon = merge(merged_04_08_12_16, county_locations, by.x = c("County Name", "State"), by.y = c("county_names", "state"), all = TRUE)
```
### Merge Total Elections Data with Census Data
(done by Adnan Hemani)
```{r, eval=merge_census, echo=show_code}
#Outer-join the census data onto the combined election/location dataframe,
#keyed on the standardized (county, state) names.
merged_total = merge(
  merged_04_08_12_16_lat_lon, census,
  by.x = c("County Name", "State"),
  by.y = c("counties", "states"),
  all = TRUE
)
#Summary to make sure the final merge worked properly with the census data and
#that there aren't any large inconsistencies/missing data points/misalignments/
#missing counties (as had occurred earlier).
summary(merged_total)
#Clean up the global environment: intermediate dataframes plus the chunk
#evaluation flags are no longer needed.
rm(list = c("census", "county_locations", "Data2004", "Data2008", "Data2012",
            "Data2016", "merged_04_08", "merged_04_08_16", "merged_04_08_12_16",
            "merged_04_08_12_16_lat_lon", "total_population", "loadCensus",
            "load2004", "load2008", "load2012", "load2016", "loadGeo",
            "merge_2004_2008", "merge_2012", "merge_2016", "merge_census",
            "merge_latlon"))
#Save the dataframe for quick use later.
save(merged_total, file = "merged_total.rda")
```
##Part 2: Exploration
The following task was divided among each of our members equally. We were all assigned to create several EDA plots, revealing the nature of our data and seeing if it matches our expectations.
### Plots done by Adnan Hemani
```{r, echo=show_code}
#Education Levels.
education_levels = merged_total[, c("total_population", "per_dem.16", "per_gop.16", "edu_less_than_9th", "edu_hs_no_diploma" ,"edu_hs_diploma", "edu_college_no_degree", "edu_college_bachelors", "edu_college_graduate_professional")]
#Remove NAs.
education_levels = education_levels[!is.na(education_levels$total_population), ]
education_levels = education_levels[!is.na(education_levels$per_dem.16), ]
#Finding proportion of each education type in each county.
prop_less_than_9th = education_levels$edu_less_than_9th / education_levels$total_population
prop_no_hs = education_levels$edu_hs_no_diploma / education_levels$total_population
prop_hs = education_levels$edu_hs_diploma / education_levels$total_population
prop_college_no_degree = education_levels$edu_college_no_degree / education_levels$total_population
prop_college_degree = education_levels$edu_college_bachelors / education_levels$total_population
prop_grad = education_levels$edu_college_graduate_professional / education_levels$total_population
#Concatenate education type proportions and put into a single data frame.
plotting_df = data.frame(prop_gop = rep(education_levels$per_gop.16, times = 6),
prop_edu = c(prop_less_than_9th, prop_no_hs, prop_hs, prop_college_no_degree,
prop_college_degree, prop_grad),
Education = factor(rep(c("Less than 9th Grade", "Some High School",
"High School Diploma", "Some College", "College Degree",
"Professional Degree"), each = 3108),
levels = c("Less than 9th Grade", "Some High School",
"High School Diploma", "Some College", "College Degree",
"Professional Degree"),
ordered = TRUE))
#Plotting education levels against GOP Support.
ggplot(data = plotting_df, mapping = aes(x = prop_edu, y = prop_gop, color = Education)) +
geom_point(alpha=0.01) +
geom_smooth(se = FALSE, method='lm') +
labs(x = "Proportion of people in a county with this educational background",
y = "Proportion of support for GOP in 2016",
title = "Educational Background vs GOP support per county")
rm(education_levels, plotting_df, prop_college_degree, prop_college_no_degree, prop_grad, prop_hs, prop_less_than_9th, prop_no_hs)
```
The above plot shows how educational background relates to the proportion of votes for Donald Trump in the 2016 Presidential Election, by county: a weak positive correlation for the proportion of people who have not attended high school at all; a somewhat stronger positive correlation for those who attended high school but did not graduate; a very strong positive correlation for those who graduated high school but did not attend college; a strong positive correlation for those who attended college but did not graduate; a strong negative correlation for those who graduated college; and a very strong negative correlation for those who hold a graduate or professional degree.
### Plots done by Scott Numamoto
```{r, echo=show_code}
#Short alias for the merged dataframe; reused by the next chunk and removed
#later on.
df = merged_total
#Log of county population against the 2016 GOP vote share, with a linear fit.
ggplot(df, aes(x = log(total_population), y = votes_gop.16 / total_votes.16)) +
  geom_point() +
  geom_smooth(method='lm') +
  labs(title = "Total Population vs 2016 Republican Support",
       x = "Log of the total county population",
       y = "Proportion of voters who support Donald Trump")
```
The graph shows a strong negative correlation between the total county population and Republican support. Larger cities tend to show less support for Donald Trump.
### Public Transportation vs. Candiate Support
```{r, echo=show_code}
#2016 Democratic vote share against the share of residents commuting by public
#transit.
ggplot(df, aes(x = public_transportation_to_work / total_population,
               y = votes_dem.16 / total_votes.16)) +
  geom_point() +
  labs(x = "Proportion of Population That Takes Public Transportation to Work",
       y = "Proportion of Voters for Democratic Candidate",
       title = "Use of Public Transportation to Work vs. 2016 Democratic Support")
```
The graph shows that most counties in which more than 0.05 (5 percent) of the population take public transportation to work heavily favored Hillary Clinton. The public transportation proportion does not have a high correlation with Democratic support when the proportion is between 0.0 and 0.25.
Public transportation is more common in areas of high population density, namely cities. Thus, this graph suggests that cities strongly supported Hillary Clinton.
### Plots done by Kian Taylor
```{r, echo=show_code, fig.height=4, fig.width=8}
#Reduce county-wide data to total national votes.
national_totals = sapply(merged_total[3:18], sum, na.rm = TRUE)
#Re-order values by chronology, political party (GOP, Dem) and desired data.
national_totals = national_totals[c(10, 11, 15, 14, 2, 1, 4, 3)]
x = national_totals
#Turn number of votes into percentage of votes for the two political parties.
vote_totals = c(x[1] + x[2], x[3] + x[4], x[5] + x[6], x[7] + x[8])
names(vote_totals) = c(2004, 2008, 2012, 2016)
election_percentages = national_totals / rep(vote_totals, each = 2)
#Set up three data frames: one for percentage of votes, one for showing the popular vote winner, and one for showing the electoral college (de facto) winner.
year = rep(c(2004, 2008, 2012, 2016), each = 2)
party = rep(c("GOP", "Dem"), times = 4)
df_votes = data.frame(Percentages = election_percentages, year = year, party = party)
df_pop_winners = cbind(df_votes[c(1, 4, 6, 8),], Winner = 'Popular Winner')
df_college_winners = cbind(df_votes[c(1, 4, 6, 7),], Winner = 'Electoral College Winner')
winners = rbind(df_pop_winners, df_college_winners)
#When creating the graph, put a reference line at 50% to distinguish the majority winner.
#Overlay graphs from all three data frames.
ggplot(data = df_votes) +
geom_line(mapping = aes(x = year, y = Percentages, color = party)) +
geom_hline(yintercept = 0.50) +
geom_point(data = winners, mapping = aes(x = year, y = Percentages, color = party, shape = Winner), size = 4) +
scale_colour_manual(values = c("blue", "red")) +
scale_shape_manual(values = c(10, 8)) +
scale_x_continuous(breaks = c(2004, 2008, 2012, 2016)) +
theme_bw() +
xlab('Year') + ylab('Percentage of Votes') + ggtitle('Voting Percentages by Year')
rm(df, df_college_winners, df_pop_winners, df_votes, winners, cols, election_percentages, national_totals, party, vote_totals, x, year)
```
The purpose of this graph is to confirm the obvious. We can google the popular vote winner from the past four presidential elections, and compare them to the result taken from our data frame. Initially, running the plot gave us wrong results, saying that certain candidates won the popular vote when they didn't. That prompted us to return to part 1 and debug our code until this graph looked as expected.
For the below plot, we chose three states that tend to predict the results of an election; however, the following code could be applied to any state.
```{r, echo=show_code}
#Isolate columns that we want from our large data frame.
mini_merged = merged_total[c(1,2, 12, 13, 17, 16, 4, 3, 6, 5)]
```
```{r, echo=show_code}
#Florida data
#Florida data
florida_counties = mini_merged[grepl("florida", mini_merged$State),]
#Logical columns: TRUE when the GOP candidate carried the county, one column
#per election year (2004, 2008, 2012, 2016).
gop_winner = data.frame(florida_counties[3] > florida_counties[4],
                        florida_counties[5] > florida_counties[6],
                        florida_counties[7] > florida_counties[8],
                        florida_counties[9] > florida_counties[10])
#Convert each logical column to a factor with BOTH levels pinned explicitly.
#BUG FIX: the previous as.factor() + levels(...) = c("Dem", "GOP") approach
#mislabels the data whenever a column contains only one outcome, because
#as.factor() would then produce a single level that gets renamed "Dem".
winner = data.frame(lapply(gop_winner, function(won_gop) factor(won_gop, levels = c(FALSE, TRUE), labels = c("Dem", "GOP"))))
#Data frame with the year, each political party, and the number of counties
#where that party had the majority vote.
florida_county_winners = data.frame(Year = rep(c(2004, 2008, 2012, 2016), times = 2),
                                    Party = rep(c("GOP", "Dem"), each = 4),
                                    Number_of_Counties = c(sapply(winner, function(x) sum(x == 'GOP')),
                                                           sapply(winner, function(x) sum(x == 'Dem'))),
                                    State = rep("Florida", times = 8))
#Similar data frame, but with the overall number of votes for each candidate
#instead of the number of counties.
florida_total_winners = data.frame(State = rep("Florida", times = 8),
                                   Party = rep(c("GOP", "Dem"), times = 4),
                                   Year = rep(c(2004, 2008, 2012, 2016), each = 2),
                                   Votes = sapply(florida_counties[3:10], sum))
```
```{r, echo=show_code}
#Ohio data (similar procedure as for Florida).
#Ohio data (similar procedure as for Florida).
ohio_counties = mini_merged[grepl("ohio", mini_merged$State),]
#Logical columns: TRUE when the GOP candidate carried the county, per year.
gop_winner = data.frame(ohio_counties[3] > ohio_counties[4],
                        ohio_counties[5] > ohio_counties[6],
                        ohio_counties[7] > ohio_counties[8],
                        ohio_counties[9] > ohio_counties[10])
#BUG FIX: pin both factor levels explicitly; the previous as.factor() +
#levels(...) = c("Dem", "GOP") approach mislabels any column that happens to
#contain only a single outcome.
winner = data.frame(lapply(gop_winner, function(won_gop) factor(won_gop, levels = c(FALSE, TRUE), labels = c("Dem", "GOP"))))
#Counties won per party per year.
ohio_county_winners = data.frame(Year = rep(c(2004, 2008, 2012, 2016), times = 2),
                                 Party = rep(c("GOP", "Dem"), each = 4),
                                 Number_of_Counties = c(sapply(winner, function(x) sum(x == 'GOP')),
                                                        sapply(winner, function(x) sum(x == 'Dem'))),
                                 State = rep("Ohio", times = 8))
#Total votes per candidate per year.
ohio_total_winners = data.frame(State = rep("Ohio", times = 8),
                                Party = rep(c("GOP", "Dem"), times = 4),
                                Year = rep(c(2004, 2008, 2012, 2016), each = 2),
                                Votes = sapply(ohio_counties[3:10], sum))
```
```{r, echo=show_code}
#North Carolina data (similar procedure as for Florida).
#North Carolina data (similar procedure as for Florida).
northcarolina_counties = mini_merged[grepl("north carolina", mini_merged$State),]
#Logical columns: TRUE when the GOP candidate carried the county, per year.
gop_winner = data.frame(northcarolina_counties[3] > northcarolina_counties[4],
                        northcarolina_counties[5] > northcarolina_counties[6],
                        northcarolina_counties[7] > northcarolina_counties[8],
                        northcarolina_counties[9] > northcarolina_counties[10])
#BUG FIX: pin both factor levels explicitly; the previous as.factor() +
#levels(...) = c("Dem", "GOP") approach mislabels any column that happens to
#contain only a single outcome.
winner = data.frame(lapply(gop_winner, function(won_gop) factor(won_gop, levels = c(FALSE, TRUE), labels = c("Dem", "GOP"))))
#Counties won per party per year.
northcarolina_county_winners = data.frame(Year = rep(c(2004, 2008, 2012, 2016), times = 2),
                                          Party = rep(c("GOP", "Dem"), each = 4),
                                          Number_of_Counties = c(sapply(winner, function(x) sum(x == 'GOP')),
                                                                 sapply(winner, function(x) sum(x == 'Dem'))),
                                          State = rep("North Carolina", times = 8))
#Total votes per candidate per year.
northcarolina_total_winners = data.frame(State = rep("North Carolina", times = 8),
                                         Party = rep(c("GOP", "Dem"), times = 4),
                                         Year = rep(c(2004, 2008, 2012, 2016), each = 2),
                                         Votes = sapply(northcarolina_counties[3:10], sum))
```
```{r, echo=show_code}
#Combine data across all three states and plot.
#Stack the per-state county-winner and vote-total tables into single
#dataframes for plotting.
county_winners = rbind(florida_county_winners, ohio_county_winners, northcarolina_county_winners)
total_winners = rbind(florida_total_winners, ohio_total_winners, northcarolina_total_winners)