-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtest.R
114 lines (83 loc) · 4.97 KB
/
test.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
## DATA
# Attach dependencies with library(): a missing package stops the script
# immediately, whereas require() only warns and returns FALSE.
library(tidyverse)
library(data.table)
library(SDjoygret)
library(tibble)
library(dplyr)

# Example dataset shipped with SDjoygret. It is indexed below as a list of
# three tables: [[1]] data matrix, [[2]] sample metadata, [[3]] variable
# metadata.
sacurine <- SDjoygret::sacurine.dlist

# Build the three tables with the supplied constructor. The first column of
# the data matrix and of the sample metadata is moved into an explicit "RowID"
# column; for the variable metadata, RowID is taken from the data-matrix
# column names and the table's own first column is dropped.
build_dlist <- function(make_table) {
  list(
    "Datamatrix" = make_table(
      "RowID" = sacurine[[1]][[1]],
      sacurine[[1]][-1]
    ),
    "SampleMetadata" = make_table(
      "RowID" = sacurine[[2]][[1]],
      sacurine[[2]][-1]
    ),
    "VariableMetadata" = make_table(
      "RowID" = names(sacurine[[1]][-1]),
      sacurine[[3]][-1]
    )
  )
}

# Same dataset in three flavours, used to compare the table classes.
data <- list(
  "data.frame" = build_dlist(data.frame),
  # tbl_df() is deprecated since dplyr 1.0.0; as_tibble() is its replacement.
  "tibble" = build_dlist(function(...) as_tibble(data.frame(...))),
  "data.table" = build_dlist(data.table)
)

# Save the data.table flavour to ./data (the directory must already exist).
sacurine.dt <- data[["data.table"]]
save(sacurine.dt, file = "./data/sacurine.dt.rda")
## Small data.frame carrying explicit row names
# Three identical integer columns (1..10); rows are labelled "A1" .. "A10".
dataframe <- data.frame(
  A = seq_len(10),
  B = seq_len(10),
  C = seq_len(10),
  row.names = sprintf("A%d", seq_len(10))
)
## check
# Apply check.list.format to each flavour of the dataset held in `data`,
# then keep the first flavour (the "data.frame" one) as `dlist`.
lapply(X = data, FUN = check.list.format)
dlist <- data[["data.frame"]]
## DEVELOPMENT
# Conversions between flavours. The results are not stored; they are only
# displayed when the lines are run interactively.
# tbl_df() is deprecated since dplyr 1.0.0; as_tibble() is its replacement.
lapply(data[["data.table"]], as_tibble)
lapply(data[["tibble"]], data.table)

# library() rather than require(): stop here if dtplyr is not installed.
library(dtplyr)

# Time three ways of selecting the same two rows of the data matrix by RowID:
# dplyr on a tibble, dplyr on a data.table, and native data.table syntax.
# dplyr::filter is spelled out so the call cannot resolve to stats::filter.
temp <- rbenchmark::benchmark(
  dplyr.tibble = dplyr::filter(
    data[["tibble"]][["Datamatrix"]],
    RowID %in% c("HU_015", "HU_011")
  ),
  dplyr.d.t = dplyr::filter(
    data[["data.table"]][["Datamatrix"]],
    RowID %in% c("HU_015", "HU_011")
  ),
  data.table.d.t =
    data[["data.table"]][["Datamatrix"]][RowID %in% c("HU_015", "HU_011")],
  replications = 1000
)
### DATA.TABLE TUTORIAL
# Notes taken while following the data.table introduction vignette:
# https://cran.r-project.org/web/packages/data.table/vignettes/datatable-intro.html
# The example table is downloaded on every run (network access required).
flights <- fread(
  input = "https://raw.githubusercontent.com/wiki/arunsrinivasan/flights/NYCflights14/flights14.csv"
)

## Select and order rows
# First ten rows.
flights[seq_len(10)]
# Rows satisfying both conditions.
flights[month == 7 & origin == "JFK"]
# Sort by month (ascending) then flight (descending); keep the first ten rows.
flights[order(month, -flight)][seq_len(10)]
## Select columns
flights[, .(arr_delay)] # one column, wrapped in .() (kept as a table)
flights[, arr_delay] # one column, as a plain vector
flights[, .(month, day, cancelled)] # several columns
flights[, .(delay_arr = arr_delay, delay_dep = dep_delay)] # select and rename

## Computation on columns
# Number of rows whose summed arrival and departure delay is positive.
flights[, sum((arr_delay + dep_delay) > 0)]
# Row count of a subset: length() of any column works, but .N is the
# dedicated symbol holding the number of observations in the current group.
flights[origin == "JFK" & month == 7, length(dest)]
flights[origin == "JFK" & month == 7, .N]

## Computation on a subset of rows
flights[
  origin == "JFK" & month == 6,
  .(Av_dep_delay = mean(dep_delay), Av_ar_delay = mean(arr_delay))
]

## Refer to columns by name (character vector) using with = FALSE
# FALSE is spelled out: `F` is an ordinary variable that can be reassigned.
flights[, c("arr_delay", "dep_delay"), with = FALSE] # keep these columns
flights[, !c("arr_delay", "dep_delay"), with = FALSE] # drop these columns
flights[, -c("arr_delay", "dep_delay"), with = FALSE] # drop, alternative form
## by group
# Mean departure / arrival delay for every origin-month pair.
flights[, .(Av_dep_delay = mean(dep_delay), Av_ar_delay = mean(arr_delay)),
  by = .(origin, month)
]
# Number of observations per origin.
flights[, .N, by = origin]
# `by` also accepts the column name as a string.
flights[, .N, by = "origin"]
# Same count, with .N wrapped in .().
flights[, .(.N), by = "origin"]
# Subset first, then group.
flights[carrier == "AA", .(.N), by = .(origin, dest)]
flights[
  carrier == "AA",
  .(Av_arr_delay = mean(arr_delay), Av_dep_delay = mean(dep_delay)),
  by = .(origin, dest, month)
]
# Subset, group, then order the result in a second step ...
flights[
  carrier == "AA",
  .(Av_arr_delay = mean(arr_delay), Av_dep_delay = mean(dep_delay)),
  by = .(origin, dest, month)
][order(origin, dest, month)]
# ... or let keyby group and sort in one go (noted as a lot faster).
flights[
  carrier == "AA",
  .(Av_arr_delay = mean(arr_delay), Av_dep_delay = mean(dep_delay)),
  keyby = .(origin, dest, month)
]

## chaining
# Written on one line ...
flights[carrier == "AA", .N, by = .(origin, dest)][order(origin, -dest)]
# ... or spread over several lines.
flights[
  carrier == "AA", .N, by = .(origin, dest)
][
  order(origin, -dest)
]

## group by expressions
# Groups are defined by the two logical conditions.
flights[, .N, by = .(dep_delay > 0, arr_delay > 0)]
## group by and return the mean of several columns (like summarize_all)
flights[
  carrier == "AA", # keep one carrier
  lapply(.SD, mean), # mean of every column held in .SD
  by = .(origin, dest, month), # one row per origin / dest / month
  # Restrict .SD to these two columns (per the original note, the inverse
  # selection with - or ! is accepted too).
  .SDcols = c("arr_delay", "dep_delay")
]
# The same without .SD: the two columns are listed explicitly. The list
# elements are named so the output columns are arr_delay / dep_delay as in
# the .SD version; an unnamed list would give auto-named columns (V1, V2).
flights[
  carrier == "AA", # keep one carrier
  lapply(.(arr_delay = arr_delay, dep_delay = dep_delay), mean),
  by = .(origin, dest, month) # one row per origin / dest / month
]
## concatenate two columns: carrier + flight number as one label, by origin
flights[, .(groups = paste0(carrier, flight), carrier, flight), by = origin]
## concatenate two columns: origins and flight numbers of each carrier
# The column is `flight`. The original `flights` named the whole table, so
# c() mixed the origin vector with every column of the table instead of the
# flight numbers.
# NOTE(review): c() stacks the two columns (origins first, then flight
# numbers); if one "origin flight" label per row was intended, use
# paste(origin, flight) instead - confirm.
flights[, .(Flights_names = paste(c(origin, flight))), by = carrier]