---
title: "Cleaning GPS data"
csl: the-american-naturalist.csl
output:
html_document:
theme: cerulean
toc: yes
pdf_document:
toc: yes
# bibliography: references.bib
editor_options:
chunk_output_type: console
---
<!--
IMAGES:
Insert them with: ![alt text](image.png)
You can also resize them if needed: convert image.png -resize 50% image.png
If you want to center the image, go through HTML code:
<div style="text-align:center"><img src ="image.png"/></div>
REFERENCES:
For references: Put all the bibTeX references in the file "references.bib"
in the current folder and cite the references as @key or [@key] in the text.
Uncomment the bibliography field in the above header and put a "References"
title wherever you want to display the reference list.
-->
<style type="text/css">
.main-container {
max-width: 1370px;
margin-left: auto;
margin-right: auto;
}
</style>
```{r general_options, include = FALSE}
knitr::knit_hooks$set(
margin = function(before, options, envir) {
if (before) par(mgp = c(1.5, .5, 0), bty = "n", plt = c(.105, .97, .13, .97))
else NULL
},
prompt = function(before, options, envir) {
options(prompt = if (options$engine %in% c("sh", "bash")) "$ " else "> ")
})
knitr::opts_chunk$set(margin = TRUE, prompt = TRUE, comment = "",
collapse = TRUE, cache = FALSE, autodep = TRUE,
dev.args = list(pointsize = 11), fig.height = 3.5,
fig.width = 4.24725, fig.retina = 2, fig.align = "center")
options(width = 137)
```
## Packages
Packages that we need from [CRAN](https://cran.r-project.org):
```{r}
cran <- c("dplyr", # data frames manipulation
"geosphere", # spherical trigonometry
"magrittr", # pipe operators
"measurements", # tools for units measurement
"purrr", # functional programming tools
"sp", # classes and methods for spatial data
"readxl", # to read excel files
"spatstat" # spatial point pattern analysis
)
```
Installing the ones that are not already installed:
```{r}
to_install <- setdiff(cran, rownames(installed.packages()))
if (length(to_install)) install.packages(to_install)
```
Loading the packages for interactive use at the command line:
```{r message = FALSE}
invisible(lapply(cran, library, character.only = TRUE))
```
## Utility functions
The following function converts DMS coordinates into decimal degree coordinates:
```{r}
dms2dd <- function(x) {
require(magrittr) # %>%
require(measurements) # conv_unit
x %>%
sub("°", " ", .) %>%
sub("'", " ", .) %>%
sub("\\.*''[EN]", "", .) %>%
conv_unit("deg_min_sec", "dec_deg")
}
```
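A quick sanity check on a made-up DMS string (the value is hypothetical, for
illustration only):
```{r}
dms2dd("17°58'32.4''N") # a made-up coordinate; expected: about 17.9757
```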
The following function identifies, in a vector of coordinates, the ones that are
in DMS format, converts them to decimal degrees with the above function, and
returns the whole vector as numeric:
```{r}
dms2dd2 <- function(x) {
sel <- which(grepl("[EN]", x))
x[sel] <- sapply(x[sel], dms2dd)
as.numeric(x)
}
```
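A quick illustration on a made-up vector mixing the two formats:
```{r}
# one DMS string and one decimal-degree string, both hypothetical:
dms2dd2(c("102°36'53.2''E", "102.61478")) # expected: two values close to 102.615
```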
The following function reads and reformats the GPS coordinate data collected from
one of the 2 GPS devices (the old and the new one) or from the WhatsApp app:
```{r}
read_gps <- function(file, tab, tabs) {
require(dplyr) # %>%, transmute
require(readxl) # read_excel
  read_excel(file, grep(tab, tabs, ignore.case = TRUE, value = TRUE)) %>%
    transmute(id = as.integer(`N° patient`),
              longitude = dms2dd2(sub("/", ".", Longitude)),
              latitude = dms2dd2(Latitude))
}
```
The following function returns the IDs that have more than one GPS coordinate record:
```{r}
check_duplicates <- function(df) {
as.numeric(names(which(table(df[["id"]]) > 1)))
}
```
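A toy illustration with made-up IDs:
```{r}
check_duplicates(data.frame(id = c(1, 2, 2, 3))) # id 2 appears twice; expected: 2
```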
The following function takes a data frame with 2 columns (longitude and latitude)
and 2 rows (2 points), and tests whether these 2 points have exactly the same
coordinates:
```{r}
are_same_points <- function(x) {
require(purrr) # invoke
invoke(identical, x$longitude) & invoke(identical, x$latitude)
}
```
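A toy illustration with made-up coordinates:
```{r}
# two records with exactly the same coordinates:
are_same_points(data.frame(longitude = c(102.6, 102.6), latitude = c(17.96, 17.96))) # TRUE
```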
The following function takes a data frame of coordinates and IDs, together with
an ID that has more than one entry in the data frame, and returns the distance in
km between these two entries:
```{r}
dist_btw_2_houses <- function(df, i) {
require(dplyr) # %>%, filter, select
  require(geosphere) # distHaversine
tmp <- df %>%
filter(id == i) %>%
select(-id, -source) %>%
as.matrix()
distHaversine(tmp[1, ], tmp[2, ]) / 1000
}
```
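A toy check with made-up data (the `source` column is included only because the
function drops it):
```{r}
toy <- data.frame(id = c(1, 1),
                  source = c("a", "b"),
                  longitude = c(102.60, 102.61),
                  latitude = c(17.96, 17.97))
dist_btw_2_houses(toy, 1) # expected: roughly 1.5 km
```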
## Reading data
Reading the GPS coordinates from the 4 sources:
```{r}
file <- "../../raw_data/GPS/all main data_2018-09-27.xls"
tabs <- excel_sheets(file)
oldmachine <- read_gps(file, "old", tabs)
newmachine <- read_gps(file, "new", tabs)
smartphoneapp <- read_gps(file, "app", tabs)
iplserver <- read_excel(file, grep("server", tabs, ignore.case = TRUE, value = TRUE)) %>%
transmute(id = as.integer(Reference),
longitude = Longitude,
latitude = Latitude)
```
## Testing for duplicates in each of the 4 files, and fixing
```{r}
lapply(list(iplserver, smartphoneapp, oldmachine, newmachine), check_duplicates) %>%
setNames(c("iplserver", "smartphoneapp", "oldmachine", "newmachine"))
```
Note that patients `7681` and `6060` live in 2 different houses at the same
time (see issue [#1](https://github.com/ecomore2/gps/issues/1)), hence their
duplication. Patients `4374` and `6640` sent the GPS data twice (see
issue [#1](https://github.com/ecomore2/gps/issues/1)), so we can delete the first
coordinates for both of them. Patient `5925` first reported the house only, and
the coordinates were retrieved from Google Maps; he then sent the GPS coordinates
(see issue [#1](https://github.com/ecomore2/gps/issues/1)). So we can delete the
first coordinates for this one too:
```{r}
to_delete <- sapply(c(4374, 5925, 6640),
function(x) which(smartphoneapp$id == x)[1])
smartphoneapp <- smartphoneapp[-to_delete, ]
```
## Merging the 4 sources of data
```{r}
gps <- bind_rows(server = iplserver,
whatsapp = smartphoneapp,
old_gps = oldmachine,
new_gps = newmachine, .id = "source")
```
Looking for GPS coordinates that have been recorded by more than one method:
```{r}
duplicates <- check_duplicates(gps)
```
Here we drop from this list of duplicates the IDs whose records have exactly the
same coordinates:
```{r}
duplicates <- gps %>%
filter(id %in% duplicates) %>%
split(.$id) %>%
sapply(are_same_points) %>%
`!`() %>%
which() %>%
names() %>%
as.numeric()
```
Now let's look at which sources these duplicates come from:
```{r}
duplicates <- list(iplserver, smartphoneapp, oldmachine, newmachine) %>%
sapply(function(x) is.element(duplicates, x$id)) %>%
as.data.frame() %>%
setNames(c("server", "whatsapp", "oldgps", "newgps")) %>%
mutate(id = duplicates) %>%
select(id, everything())
duplicates
```
So, we have 20 houses that were checked both with WhatsApp and the new GPS
device:
```{r}
duplicates %>%
filter(whatsapp, newgps) %>%
nrow()
```
One house that was checked both through the server and with the new GPS device:
```{r}
duplicates %>%
filter(server, newgps) %>%
nrow()
```
And the 2 patients that live in 2 houses (see above):
```{r}
duplicates$id[rowSums(select(duplicates, -id)) < 2] # the id column must not be counted
```
Patients 4445, 4536, 4582, 4600, 4632, 4831, 4840, 4897, 5018, 5019, 5033, 5122,
5217, 5218, 5251, 5252, 5254, 6323, 7413 and 7436 had their coordinates reported
both on-site from the GPS device and through the smartphone app, in order to
assess the accuracy of the latter (see issue
[#3](https://github.com/ecomore2/gps/issues/3)). Thus we can remove these IDs
from the smartphone data. But, before that, we can look at the distances between
the two means of data collection. Let's first get the IDs of these duplicates:
```{r}
dpl <- duplicates %>%
filter(whatsapp, newgps) %>%
select(id) %>%
unlist() %>%
unname()
```
The following function makes a matrix of coordinates as required by the function
`distHaversine`:
```{r}
make_coord_mat <- function(df, idsel, sourcesel) {
df %>%
filter(id %in% idsel, source == sourcesel) %>%
    arrange(id) %>% # ensure the coordinates are in the same order in the two matrices
    select(longitude, latitude) %>%
as.matrix()
}
```
The sorted distances, in km, between the GPS-device record and the smartphone
record of each of these houses:
```{r}
dpl_nw <- make_coord_mat(gps, dpl, "new_gps")
dpl_sp <- make_coord_mat(gps, dpl, "whatsapp")
sort(distHaversine(dpl_nw, dpl_sp)) / 1000
```
Let's remove all the WhatsApp data that were also collected by GPS:
```{r}
whatsapp_dplcts <- duplicates %>%
filter(whatsapp, newgps) %>%
select(id) %>%
unlist()
gps %<>% filter(!(id %in% whatsapp_dplcts & source == "whatsapp"))
```
Let's do the same with the server datum. For ID 8101, the coordinates were
collected on-site with the GPS device by mistake, forgetting that the patient had
already sent the data through the server
(see issue [#1](https://github.com/ecomore2/gps/issues/1)). Let's first look at
the distance:
```{r}
ip <- make_coord_mat(gps, 8101, "server")
nm <- make_coord_mat(gps, 8101, "new_gps")
distHaversine(ip, nm) / 1000
```
And then we remove the server datum:
```{r}
server_dplcts <- duplicates %>%
filter(server, newgps) %>%
select(id) %>%
unlist()
gps %<>% filter(!(id %in% server_dplcts & source == "server"))
```
We can check that the only 2 duplicated IDs are the ones of the patients that
live in two different houses:
```{r}
gps %>%
select(id) %>%
table() %>%
`>`(1) %>%
which() %>%
names()
```
Out of curiosity, we can look at the distances, in km, between the two houses of
each of these patients:
```{r}
dist_btw_2_houses(gps, 6060)
dist_btw_2_houses(gps, 7681)
```
Let's check whether there are missing values in the coordinates:
```{r}
filter(gps, is.na(longitude) | is.na(latitude))
```
Let's remove the points with missing values in the coordinates:
```{r}
gps %<>% na.exclude()
```
## Checking that GPS coordinates are in Vientiane Capital
Let's get the polygons of the provinces of Lao PDR:
```{r}
provinces <- readRDS("../../cleaned_data/gadm/gadm36_LAO_1_sp.rds")
```
Let's extract the province of Vientiane Capital:
```{r}
vientiane <- subset(provinces, grepl("Vientiane", provinces$VARNAME_1))
```
Let's identify the GPS points that are not inside the polygon of the province of
Vientiane Capital:
```{r}
not_in_vc <- gps %>%
select(-id, -source) %>%
SpatialPoints(CRS(proj4string(vientiane))) %>%
over(vientiane) %>%
`[`(, 1) %>%
is.na() %>%
which()
gps[not_in_vc, ]
```
The coordinates of ID `7004` are obviously wrong (they fall outside any plausible
range), so let's exclude this point:
```{r}
out_of_vc <- filter(gps[not_in_vc, ], latitude < 100, longitude > 100)
```
And let's see where the remaining points are on the map:
```{r}
plot(provinces, col = "grey")
with(out_of_vc, points(longitude, latitude, col = "red"))
```
`7136` is from **Phôngsali province**! The other ones are actually very close to
Vientiane Capital:
```{r}
plot(vientiane, col = "grey")
with(out_of_vc, points(longitude, latitude, col = "red"))
```
## Writing to disk
```{r}
if (!dir.exists("data")) dir.create("data")
write.csv(select(gps, id, source, longitude, latitude), "data/gps.csv", row.names = FALSE)
```