-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathapd-crime-data.Rmd
101 lines (83 loc) · 3.87 KB
/
apd-crime-data.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
---
title: "Automatically Get Data From ATL PD Data Downloads Page"
output: html_notebook
---
### Description
The Atlanta Police Department (APD) publishes crime data weekly at http://www.atlantapd.org/i-want-to/crime-data-downloads. Files are split by year: the more recent years are published as .zip archives, while older years are published as .xlsx files. This notebook downloads every file automatically and combines them into a single dataset, which is saved as both a CSV file and an RDS file.
```{r, message=FALSE}
library(dplyr); library(rvest); library(stringr); library(lubridate);
library(readr); library(httr); library(readxl)
```
### Get metadata of the download links
```{r Get Crime Link Metadata}
# Scrape the download page and collect the nodes that hold the file links.
crime_links_html <-
  read_html("http://www.atlantapd.org/i-want-to/crime-data-downloads") %>%
  html_nodes("#widget_362_0_197 .content_link")

# One row per link: the link text and its absolute URL (each href is prefixed
# with the site root).
crime_links <-
  tibble(filename = crime_links_html %>% html_text(),
         url = crime_links_html %>%
           html_attr("href") %>%
           paste0("http://www.atlantapd.org", .))

# Request every link and keep only the response headers. The list is sized by
# the links actually found rather than by a fixed count.
temp_list <- lapply(crime_links$url, function(u) headers(GET(u)))

# Pull the headers of interest into one row per link. A header that is absent
# from a response becomes NA, so every link yields exactly one row and the
# bind_cols() call below lines up with crime_links.
crime_links_fileinfo <-
  lapply(temp_list, function(x) {
    header_or_na <- function(value) {
      if (is.null(value)) NA_character_ else value
    }
    tibble(type = header_or_na(x$`content-type`),
           disposition = header_or_na(x$`content-disposition`),
           documenttitle = header_or_na(x$`content-documenttitle`),
           lastmodified = header_or_na(x$`last-modified`))
  }) %>%
  bind_rows()

# filename2 is the disposition header without its first 10 characters and its
# last character.
# NOTE(review): presumably this strips a `filename="` prefix and the closing
# quote -- confirm against the header the server actually sends.
crime_links <- bind_cols(crime_links, crime_links_fileinfo) %>%
  mutate(filename2 = str_sub(disposition, start = 11, end = -2))

# Print the assembled link metadata in the notebook output.
crime_links
```
### Functions for downloading data files
```{r Data download functions}
# Download a zipped crime-data archive and read the first file inside it.
#
# zip.url: URL of the .zip archive to download.
#
# Prints the URL being read and the sheet names of the extracted workbook,
# then returns the result of read_excel() for the sheet named "Query".
# Only the first entry listed in the archive is extracted. The downloaded
# archive and the extracted file are deleted when the function exits.
# Stops with an error if the archive lists no files.
get_crime_zipfile <- function(zip.url) {
  td <- tempdir()
  print(paste0("Reading ", zip.url))
  tf <- tempfile(tmpdir = td, fileext = ".zip")
  # Remove the downloaded archive on exit, including when a later step fails.
  on.exit(unlink(tf), add = TRUE)
  download.file(zip.url, tf, mode = "wb")

  fname <- unzip(tf, list = TRUE)$Name[1]
  if (is.na(fname)) {
    stop("Archive contains no files: ", zip.url, call. = FALSE)
  }

  unzip(tf, files = fname, exdir = td, overwrite = TRUE)
  fpath <- file.path(td, fname)
  # The extracted workbook is only needed until read_excel() has returned.
  on.exit(unlink(fpath), add = TRUE)

  print(excel_sheets(fpath))
  read_excel(fpath, sheet = "Query")
}
# Download an uncompressed workbook and read its "Query" sheet.
#
# file.url:  URL of the workbook to download.
# file.name: local path the download is saved to; the file is left in place
#            after it has been read.
#
# Prints a status line and the workbook's sheet names, then returns the
# result of read_excel() for the sheet named "Query".
get_crime_nonzipfile <- function(file.url, file.name) {
  status_line <- paste0("Downloading file: ", file.name, ", URL: ", file.url)
  print(status_line)

  download.file(url = file.url, destfile = file.name, mode = "wb")

  sheet_names <- excel_sheets(file.name)
  print(sheet_names)

  read_excel(file.name, sheet = "Query")
}
```
### Download data & combine into one dataset
```{r}
# Download every linked file into a preallocated list, one element per link.
# Links whose content type is neither a zip archive nor an .xlsx workbook
# leave their element as NULL.
crime_data <- vector(mode = "list", length = nrow(crime_links))
for (i in seq_len(nrow(crime_links))) {
  link_type <- crime_links$type[i]
  # %in% yields FALSE (not NA) for a missing content type, so such a link is
  # skipped instead of aborting the loop with an NA condition.
  if (link_type %in% "application/zip") {
    crime_data[[i]] <- get_crime_zipfile(crime_links$url[i])
  } else if (link_type %in% "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet") {
    crime_data[[i]] <-
      get_crime_nonzipfile(crime_links$url[i],
                           file.name = crime_links$filename2[i])
  }
}

# Delete the workbooks that get_crime_nonzipfile() saved under their
# filename2 names. Rows with a missing filename2 are excluded, and unlink()
# accepts the whole vector at once.
is_xlsx <- str_detect(crime_links$filename2, "\\.xlsx") %in% TRUE
xlsx_filenames <- crime_links$filename2[is_xlsx]
unlink(xlsx_filenames)
# Stack the per-file tables into one data frame and convert column types.
# Each *_datetime column is built from the still-unparsed date and time
# columns, so it is computed before the matching *_date column is overwritten.
crime_data2 <- crime_data %>%
  bind_rows() %>%
  # Report date: month/day/year text to Date.
  mutate(rpt_date = mdy(rpt_date)) %>%
  # Occurrence: combined timestamp first, then the date on its own.
  mutate(
    occur_datetime = mdy_hms(paste(occur_date, occur_time)),
    occur_date = mdy(occur_date)
  ) %>%
  # Possible date: same treatment as the occurrence columns.
  mutate(
    poss_datetime = mdy_hms(paste(poss_date, poss_time)),
    poss_date = mdy(poss_date)
  ) %>%
  # Victim count as integer; coordinates as numbers rounded to 5 decimals.
  mutate(
    MaxOfnum_victims = as.integer(MaxOfnum_victims),
    x = round(as.numeric(x), 5),
    y = round(as.numeric(y), 5)
  )
```
### Write crime data & link metadata to file
- Write the crime data to `Data/crime_data.csv` & `Data/crime_data.rds` (RDS is an R file format that preserves the data types of a data frame's columns, which makes reading & writing easier)
- Write link metadata to `Data/crime_links.csv`
```{r Write data to files}
# Make sure the output directory exists before writing; the writers below
# fail when the target directory is missing.
if (!dir.exists("Data")) {
  dir.create("Data")
}

# Combined crime data: RDS keeps the column types, CSV is the portable copy.
write_rds(crime_data2, "Data/crime_data.rds")
write_csv(crime_data2, "Data/crime_data.csv")

# Link metadata gathered from the download page and the response headers.
write_csv(crime_links, "Data/crime_links.csv")
```