diff --git a/DESCRIPTION b/DESCRIPTION index 49db96c..e2501da 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: tidyext Type: Package Title: Tidy Extensions for Data Processing -Version: 0.3.5 +Version: 0.3.6 Authors@R: person("Michael", "Clark", role = c("aut", "cre"), email = "micl@umich.edu") Maintainer: Michael Clark Description: Common data processing and summary functions to extend your tidy ways. @@ -16,10 +16,8 @@ Depends: Imports: dplyr (>= 1.0.0), purrr, - magrittr, rlang, scales, - tibble, tidyr (>= 1.0.0) Suggests: ggplot2, @@ -28,5 +26,6 @@ Suggests: stringi, stringr, testthat, + tibble, covr RoxygenNote: 7.1.1 diff --git a/NAMESPACE b/NAMESPACE index 41e569b..4ebf8ef 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,6 +1,5 @@ # Generated by roxygen2: do not edit by hand -export("%>%") export(cat_by) export(combn_2_col) export(create_prediction_data) @@ -15,17 +14,16 @@ export(onehot) export(pre_process) export(rnd) export(row_apply) +export(row_max) export(row_means) +export(row_min) export(row_sums) export(spread2) export(sum_NA) export(sum_NaN) export(sum_blank) -export(vars) import(dplyr) importFrom(dplyr,quo_name) -importFrom(dplyr,vars) -importFrom(magrittr,"%>%") importFrom(purrr,map) importFrom(purrr,map_df) importFrom(purrr,map_dfr) @@ -39,7 +37,6 @@ importFrom(stats,model.matrix) importFrom(stats,na.omit) importFrom(stats,quantile) importFrom(stats,sd) -importFrom(tibble,rowid_to_column) importFrom(tidyr,gather) importFrom(tidyr,nest) importFrom(tidyr,unnest) diff --git a/NEWS.md b/NEWS.md index 4820c4a..5528188 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,6 +1,10 @@ +# tidyext 0.3.6 + +Added `row_min` and `row_max`. Removed some dependencies. + # tidyext 0.3.5 -Cleanup and update for R 4.0 and dplyr 1.0. Deprecate gather_multi and spread2. +Cleanup and update for R 4.0 and dplyr 1.0. Deprecate `gather_multi` and `spread2`. 
# tidyext 0.3.1 diff --git a/R/combn_2_col.R b/R/combn_2_col.R index b6d8fee..2aa6f42 100644 --- a/R/combn_2_col.R +++ b/R/combn_2_col.R @@ -47,75 +47,38 @@ #' only the indicator columns. #' @examples #' library(tidyext) -#' d = data.frame(id = 1:4, -#' labs = c('A/B', 'B/C/D/E', 'A/E', 'D/E')) -#' test = combn_2_col(data=d, var='labs', max_m=3) +#' +#' d = data.frame(id = 1:4, labs = c('A/B', 'B/C/D/E', 'A/E', 'D/E')) +#' test = combn_2_col(data = d, var = 'labs', max_m = 3) #' test #' str(test) +#' #' d$labs = c('A B', 'B C D E', 'A E', 'D E') -#' combn_2_col(data=d, var='labs', max_m=1) +#' combn_2_col(data = d, var = 'labs', max_m = 1) +#' #' d$labs = c('Tom, Dick & Harriet', "J'Sean", "OBG, Andreas", NA) -#' combn_2_col(data=d, var='labs', sep=',', max_m=2, collapse='-') -#' -#' \dontrun{ -#' # requires at least tidytext -#' tidy_dtm <- function(data, var, sep='-', max_m=3) { -#' init = stringr::str_split(data[[var]], pattern = sep) # creates a list of separated letters -#' -#' # the following gets the combos with a dot separating drugs in a given combo -#' # this first lapply could be parallelized if need be and is probably slowest -#' # probably want to change to m = min(c(4, m)) so as to only limit to 4 -#' # see also, combinat::combn which is slightly faster than base R below -#' observation_combos = init %>% -#' lapply(function(x) -#' sapply(seq_along(x), function(m) -#' utils::combn(x, min(max_m, m), FUN=paste, collapse = '_'))) -#' -#' # now we have a standard text analysis problem in need of a document term -#' matrix -#' documents = observation_combos %>% lapply(unlist) -#' -#' # create a 'tidy' form of documents and terms; each term (i.e. 
combo) only -#' occurs once in a document -#' doc_df = data.frame(id=rep(data$id, sapply(documents, length)), -#' combos=unlist(documents), -#' count=1) # each term only occurs once in the document -#' doc_df %>% -#' tidytext::cast_dfm(document=id, term=combos, value=count) -#' } -#' -#' # requires at least text2vec -#' ttv <- function(data, var, sep='-', max_m=3) { -#' docs = sapply(stringr::str_split(data[[var]], pattern=sep), -#' function(str_vec) -#' sapply(seq_along(str_vec), -#' function(m) -#' combn(str_vec, -#' m = min(max_m, m), -#' FUN = paste, -#' collapse = '_') -#' ) %>% unlist() -#' ) -#' -#' toks = itoken(docs, progressbar = FALSE) -#' vocab = create_vocabulary(toks) -#' create_dtm(toks, vectorizer = vocab_vectorizer(vocab), progressbar = FALSE) %>% -#' as.matrix() %>% -#' cbind(data,.) -#' } -#' -#' } +#' +#' combn_2_col( +#' data = d, +#' var = 'labs', +#' sep = ',', +#' max_m = 2, +#' collapse = '-' +#' ) +#' #' #' #' #' @export -combn_2_col <- function(data, - var, - sep='[^[:alnum:]]+', - max_m=1, - collapse = '_', - toInteger=FALSE, - sparse=FALSE) { +combn_2_col <- function( + data, + var, + sep = '[^[:alnum:]]+', + max_m = 1, + collapse = '_', + toInteger = FALSE, + sparse = FALSE +) { if (is.null(data) | is.null(var)) stop('Need data and variable name to continue.') @@ -123,16 +86,18 @@ combn_2_col <- function(data, if (max_m < 1) stop('Need positive value for max_m.') data$combo <- - map(stringr::str_split(data[[var]], pattern=sep), - function(str_vec) - map(seq_along(str_vec), - function(m) - combn(str_vec, - m = min(max_m, m), - FUN = paste, - collapse = collapse) - ) %>% unlist() + map(stringr::str_split(data[[var]], pattern = sep), + function(str_vec) + map(seq_along(str_vec), + function(m) + combn(str_vec, + m = min(max_m, m), + FUN = paste, + collapse = collapse) + ) %>% + unlist() ) + combo_cols <- unique(unlist(data$combo)) if (sparse) { @@ -142,7 +107,7 @@ combn_2_col <- function(data, do.call(rbind,.) 
%>% Matrix::Matrix(sparse = TRUE, dimnames = list(rownames(data), combo_cols)) - ) + ) } if (toInteger) { @@ -156,6 +121,7 @@ combn_2_col <- function(data, map(function(x) combo_cols %in% x) %>% do.call(rbind,.) } + data } diff --git a/R/gather_multi.R b/R/gather_multi.R index c9de413..b5d925b 100644 --- a/R/gather_multi.R +++ b/R/gather_multi.R @@ -132,7 +132,7 @@ gather_multi <- function(data, na.rm = na.rm, convert = convert, factor_key = factor_key) %>% - rowid_to_column() + mutate(rowid = 1:nrow(.)) %>% # changed to get rid of tibble requirement while deprecated for (i in 2:length(varlist)) { data_long <- data %>% @@ -144,7 +144,7 @@ gather_multi <- function(data, na.rm = na.rm, convert = convert, factor_key = factor_key) %>% - rowid_to_column()%>% + mutate(rowid = 1:nrow(.)) %>% # changed to get rid of tibble requirement while deprecated select(rowid, !!values[[i]]) %>% left_join(data_long, ., by='rowid') } diff --git a/R/globals.R b/R/globals.R index c685973..78877f1 100644 --- a/R/globals.R +++ b/R/globals.R @@ -3,4 +3,4 @@ utils::globalVariables(c(".", 'Max', 'Mean', 'Median', 'Min', 'Missing', 'N', 'Q1', 'Q3', 'SD', 'Variable', 'X1st.Qu.', 'X3rd.Qu.', 'perc', 'result', 'results', 'target', 'value', 'x.Freq', 'x.x', 'y.Freq', 'y.x', '%', 'rowid', - 'Group', 'Frequency')) + 'Group', 'Frequency', 'rn')) diff --git a/R/head_tail.R b/R/head_tail.R index 62646ac..0b5100a 100644 --- a/R/head_tail.R +++ b/R/head_tail.R @@ -11,8 +11,8 @@ #' @examples #' #' library(tidyext) -#' as.matrix(mtcars) %>% -#' head_tail(6) +#' +#' head_tail(mtcars) #' head_tail = function(data, n_slice = 6) { # initial checks diff --git a/R/num_summary.R b/R/num_summary.R index bdfa26c..901903c 100644 --- a/R/num_summary.R +++ b/R/num_summary.R @@ -35,7 +35,7 @@ num_summary <- function(x, digits = 1, extra = FALSE) { x <- as.numeric(x) - d <- tibble( + d <- dplyr::tibble( N = length(na.omit(x)), data.frame(t(c(summary(x)))), SD = sd(x, na.rm = TRUE), diff --git a/R/pre_process.R 
b/R/pre_process.R index 03f5d55..df22c9a 100644 --- a/R/pre_process.R +++ b/R/pre_process.R @@ -33,14 +33,17 @@ #' them. #' @importFrom scales rescale #' @return A data frame that has been processed -#' @export #' #' @examples #' library(tidyext) +#' library(dplyr) +#' #' pre_process(mtcars) -#' pre_process(mtcars, log_vars=vars(mpg, wt)) -#' pre_process(mtcars, zero_start=vars(cyl, gear)) -#' pre_process(mtcars, zero_one=vars(mpg)) +#' pre_process(mtcars, log_vars = vars(mpg, wt)) +#' pre_process(mtcars, zero_start = vars(cyl, gear)) +#' pre_process(mtcars, zero_one = vars(mpg)) +#' +#' @export pre_process <- function( data, std = TRUE, diff --git a/R/row_sums.R b/R/row_sums.R index 33e7789..ae0a17f 100644 --- a/R/row_sums.R +++ b/R/row_sums.R @@ -7,7 +7,7 @@ #' @param ... The columns to sum, take the mean of, etc. \emph{Required}. #' @param .fun The function to apply. #' @param na_rm Whether to remove \code{NA} values or not. Default is \code{FALSE}. -#' @param varname The column name of the sums means etc. +#' @param varname The column name of the sums means etc. as a character string. #' #' @details Simple wrappers for applying rowwise operations only for selected #' columns within the tidyverse approach to data processing. 
The @@ -28,6 +28,9 @@ #' d %>% #' row_means(matches('x|z')) #' +#' d %>% +#' row_max(matches('x|y')) +#' #' row_apply( #' d , #' everything(), @@ -37,6 +40,8 @@ #' #' @export row_sums <- function(data, ..., na_rm = FALSE, varname = 'sum') { + # note: dplyr 1.0 included rowwise operations, but it wasn't obvious what + # advantage there would be for these functions except for min and max dplyr::mutate(data, !!varname := rowSums(select(data, ...), na.rm = na_rm)) } @@ -46,6 +51,28 @@ row_means <- function(data, ..., na_rm = FALSE, varname = 'mean') { dplyr::mutate(data, !!varname := rowMeans(select(data, ...), na.rm = na_rm)) } +#' @export +#' @rdname row_sums +row_min <- function(data, ..., na_rm = FALSE, varname = 'min') { + dplyr::select(data, ...) %>% + dplyr::mutate(rn = 1:nrow(.)) %>% + dplyr::rowwise(rn) %>% + dplyr::mutate(!!varname := min(dplyr::c_across(...), na.rm = na_rm)) %>% + dplyr::ungroup() %>% + dplyr::select(-rn) +} + +#' @export +#' @rdname row_sums +row_max <- function(data, ..., na_rm = FALSE, varname = 'max') { + dplyr::select(data, ...) %>% + dplyr::mutate(rn = 1:nrow(.)) %>% + dplyr::rowwise(rn) %>% + dplyr::mutate(!!varname := max(dplyr::c_across(...), na.rm = na_rm)) %>% + dplyr::ungroup() %>% + dplyr::select(-rn) +} + #' @export #' @rdname row_sums row_apply <- function(data, ..., .fun, varname = 'var') { diff --git a/R/spread2.R b/R/spread2.R index c89c4b3..789f11b 100644 --- a/R/spread2.R +++ b/R/spread2.R @@ -29,7 +29,6 @@ #' #' @return A data frame with 'wide' format. 
#' @seealso \code{\link[tidyr]{spread}} -#' @importFrom tibble rowid_to_column #' @examples #' \dontrun{ #' library(tidyext); library(tidyr) @@ -90,7 +89,7 @@ spread2 <- function(data, } else { data <- data %>% bind_cols(data %>% - tibble::rowid_to_column() %>% + mutate(rowid = 1:nrow(.)) %>% # changed to get rid of tibble requirement while deprecated select(rowid) ) } diff --git a/R/utils.R b/R/utils.R index 4a897c6..259c729 100644 --- a/R/utils.R +++ b/R/utils.R @@ -1,9 +1 @@ -#' @importFrom magrittr %>% -#' @export -magrittr::`%>%` - - - -#' @importFrom dplyr vars -#' @export -dplyr::vars +#' @importFrom dplyr vars `%>%` tibble diff --git a/_pkgdown.yml b/_pkgdown.yml index 243efc7..e757c8c 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -40,7 +40,7 @@ reference: - create_prediction_data - gather_multi - onehot - # - pre_process + - pre_process # - spread2 - row_sums - title: "Miscellaneous" @@ -48,6 +48,8 @@ reference: Miscellaneous functions. contents: - select_not + - head_tail + - rnd - tidyext figures: dev: svglite::svglite diff --git a/docs/404.html b/docs/404.html index 28d862e..c957034 100644 --- a/docs/404.html +++ b/docs/404.html @@ -17,23 +17,27 @@ - + - + - + + + + + - - + + - + - - + + @@ -46,7 +50,6 @@ - @@ -64,7 +67,7 @@ - +
@@ -136,6 +139,12 @@

Page not found (404)

+ + @@ -146,7 +155,7 @@

Page not found (404)

-

Site built with pkgdown 1.4.1.

+

Site built with pkgdown 1.5.1.

diff --git a/docs/LICENSE-text.html b/docs/LICENSE-text.html index 2027b5d..8c85440 100644 --- a/docs/LICENSE-text.html +++ b/docs/LICENSE-text.html @@ -17,23 +17,27 @@ - + - + - + + + + + - - + + - + - - + + @@ -46,7 +50,6 @@ - @@ -64,7 +67,7 @@ - +
@@ -138,6 +141,12 @@

License

+ + @@ -148,7 +157,7 @@

License

-

Site built with pkgdown 1.4.1.

+

Site built with pkgdown 1.5.1.

diff --git a/docs/articles/index.html b/docs/articles/index.html index 9841c59..e15d098 100644 --- a/docs/articles/index.html +++ b/docs/articles/index.html @@ -17,23 +17,27 @@ - + - + - + + + + + - - + + - + - - + + @@ -46,7 +50,6 @@ - @@ -64,7 +67,7 @@ - +
@@ -136,9 +139,10 @@

Articles

All vignettes

- +
+
Demonstration of tidyext
+
+
@@ -150,7 +154,7 @@

All vignettes

-

Site built with pkgdown 1.4.1.

+

Site built with pkgdown 1.5.1.

diff --git a/docs/articles/introduction.html b/docs/articles/introduction.html index a54a909..6d3c596 100644 --- a/docs/articles/introduction.html +++ b/docs/articles/introduction.html @@ -12,21 +12,21 @@ - - - - + + + + + - + - - +
+

Data Processing

In addition there are some functions for data processing. We can start with the simple one-hot encoding function.

-
onehot(iris) %>% 
-  slice(c(1:2, 51:52, 101:102))
-  Sepal.Length Sepal.Width Petal.Length Petal.Width Species_setosa Species_versicolor Species_virginica
-1          5.1         3.5          1.4         0.2              1                  0                 0
-2          4.9         3.0          1.4         0.2              1                  0                 0
-3          7.0         3.2          4.7         1.4              0                  1                 0
-4          6.4         3.2          4.5         1.5              0                  1                 0
-5          6.3         3.3          6.0         2.5              0                  0                 1
-6          5.8         2.7          5.1         1.9              0                  0                 1
+
onehot(iris) %>% 
+  slice(c(1:2, 51:52, 101:102))
+    Sepal.Length Sepal.Width Petal.Length Petal.Width Species_setosa Species_versicolor Species_virginica
+1            5.1         3.5          1.4         0.2              1                  0                 0
+2            4.9         3.0          1.4         0.2              1                  0                 0
+51           7.0         3.2          4.7         1.4              0                  1                 0
+52           6.4         3.2          4.5         1.5              0                  1                 0
+101          6.3         3.3          6.0         2.5              0                  0                 1
+102          5.8         2.7          5.1         1.9              0                  0                 1

It can do it sparsely.

-
iris %>% 
-  slice(c(1:2, 51:52, 101:102)) %>% 
-  onehot(sparse = TRUE)
-6 x 3 sparse Matrix of class "dgCMatrix"
-  Species_xsetosa Species_xversicolor Species_xvirginica
-1               1                   .                  .
-2               1                   .                  .
-3               .                   1                  .
-4               .                   1                  .
-5               .                   .                  1
-6               .                   .                  1
+
iris %>% 
+  slice(c(1:2, 51:52, 101:102)) %>% 
+  onehot(sparse = TRUE)
+6 x 3 sparse Matrix of class "dgCMatrix"
+  Species_xsetosa Species_xversicolor Species_xvirginica
+1               1                   .                  .
+2               1                   .                  .
+3               .                   1                  .
+4               .                   1                  .
+5               .                   .                  1
+6               .                   .                  1

Choose a specific variable, whether you want to keep the others, and how to deal with NA.

-
df_miss %>%
-  onehot(var = c('g1', 'g2'), nas = 'na.omit', keep.original = FALSE) %>%
-  head()
-           a  b c     d g1_a g1_b g2_1 g2_2 g2_3 g2_4
-1  0.1348980 12 o FALSE    0    1    0    1    0    0
-2  0.1175821  5 r  TRUE    1    0    0    0    1    0
-3 -0.8255891 14 p FALSE    0    1    0    1    0    0
-4 -2.1352363  7 l FALSE    1    0    0    1    0    0
-5  0.2142085 10 n  TRUE    0    1    0    1    0    0
-6 -0.9470681 11 s FALSE    1    0    0    0    1    0
+
df_miss %>%
+  onehot(var = c('g1', 'g2'), nas = 'na.omit', keep.original = FALSE) %>%
+  head()
+           a  b c     d g1_a g1_b g2_1 g2_2 g2_3 g2_4
+1  0.1348980 12 o FALSE    0    1    0    1    0    0
+2  0.1175821  5 r  TRUE    1    0    0    0    1    0
+3 -0.8255891 14 p FALSE    0    1    0    1    0    0
+4 -2.1352363  7 l FALSE    1    0    0    1    0    0
+5  0.2142085 10 n  TRUE    0    1    0    1    0    0
+6 -0.9470681 11 s FALSE    1    0    0    0    1    0

With create_prediction_data, we can quickly create data for use with predict after fitting a model. By default it will put numeric variables at their mean, and categorical variables at their most common category.

-
create_prediction_data(iris)
-  Sepal.Length Sepal.Width Petal.Length Petal.Width Species
-1     5.843333    3.057333        3.758    1.199333  setosa
-
-create_prediction_data(iris, num = function(x) quantile(x, p=.25))
-  Sepal.Length Sepal.Width Petal.Length Petal.Width Species
-1          5.1         2.8          1.6         0.3  setosa
+
create_prediction_data(iris)
+  Sepal.Length Sepal.Width Petal.Length Petal.Width Species
+1     5.843333    3.057333        3.758    1.199333  setosa
+
+create_prediction_data(iris, num = function(x) quantile(x, p=.25))
+  Sepal.Length Sepal.Width Petal.Length Petal.Width Species
+1          5.1         2.8          1.6         0.3  setosa

We can also supply specific values.

-
cd = data.frame(cyl=4, hp=100)
-create_prediction_data(mtcars, conditional_data = cd)
-  cyl  hp      mpg     disp     drat      wt     qsec     vs      am   gear   carb
-1   4 100 20.09062 230.7219 3.596563 3.21725 17.84875 0.4375 0.40625 3.6875 2.8125
+
cd = data.frame(cyl=4, hp=100)
+create_prediction_data(mtcars, conditional_data = cd)
+  cyl  hp      mpg     disp     drat      wt     qsec     vs      am   gear   carb
+1   4 100 20.09062 230.7219 3.596563 3.21725 17.84875 0.4375 0.40625 3.6875 2.8125

For modeling purposes, we often want to center or scale the data, take logs etc. The pre_process function will standardize numeric data by default.

-
pre_process(df1)
-# A tibble: 50 x 6
-   g1        g2      a        b c     d    
-   <fct>  <dbl>  <dbl>    <dbl> <chr> <lgl>
- 1 b     -0.478  0.133  0.659   o     FALSE
- 2 a      0.408  0.116 -1.62    r     TRUE 
- 3 b     -0.478 -0.828  1.31    p     FALSE
- 4 a     -0.478 -2.14  -0.972   l     FALSE
- 5 b     -0.478  0.212  0.00652 n     TRUE 
- 6 a      0.408 -0.949  0.333   s     FALSE
- 7 a      0.408 -0.654 -0.972   i     FALSE
- 8 a     -0.478  0.727 -0.972   n     FALSE
- 9 b     -1.36   1.05  -1.95    s     TRUE 
-10 a      0.408  0.100 -1.95    j     TRUE 
-# … with 40 more rows
+
pre_process(df1)
+# A tibble: 50 x 6
+   g1        g2      a        b c     d    
+   <fct>  <dbl>  <dbl>    <dbl> <chr> <lgl>
+ 1 b     -0.478  0.133  0.659   o     FALSE
+ 2 a      0.408  0.116 -1.62    r     TRUE 
+ 3 b     -0.478 -0.828  1.31    p     FALSE
+ 4 a     -0.478 -2.14  -0.972   l     FALSE
+ 5 b     -0.478  0.212  0.00652 n     TRUE 
+ 6 a      0.408 -0.949  0.333   s     FALSE
+ 7 a      0.408 -0.654 -0.972   i     FALSE
+ 8 a     -0.478  0.727 -0.972   n     FALSE
+ 9 b     -1.36   1.05  -1.95    s     TRUE 
+10 a      0.408  0.100 -1.95    j     TRUE 
+# … with 40 more rows

Other options are to simply center the data (scale_by = 0), start some variables at zero (e.g. time indicators), log some variables (with chosen base), and scale some to range from zero to one.

-
pre_process(mtcars, 
-            scale_by = 0, 
-            log_vars = vars(mpg, wt), 
-            zero_start = vars(cyl), 
-            zero_one = vars(hp, starts_with('d'))) %>% 
-  describe_all_num()
-   Variable  N Mean   SD   Min    Q1 Median   Q3  Max Missing
-1       mpg 32 2.96 0.30  2.34  2.74   2.95 3.13 3.52       0
-2       cyl 32 2.19 1.79  0.00  0.00   2.00 4.00 4.00       0
-3      disp 32 0.40 0.31  0.00  0.12   0.31 0.64 1.00       0
-4        hp 32 0.33 0.24  0.00  0.16   0.25 0.45 1.00       0
-5      drat 32 0.39 0.25  0.00  0.15   0.43 0.53 1.00       0
-6        wt 32 1.12 0.32  0.41  0.95   1.20 1.28 1.69       0
-7      qsec 32 0.00 1.79 -3.35 -0.96  -0.14 1.05 5.05       0
-8        vs 32 0.00 0.50 -0.44 -0.44  -0.44 0.56 0.56       0
-9        am 32 0.00 0.50 -0.41 -0.41  -0.41 0.59 0.59       0
-10     gear 32 0.00 0.74 -0.69 -0.69   0.31 0.31 1.31       0
-11     carb 32 0.00 1.62 -1.81 -0.81  -0.81 1.19 5.19       0
+
pre_process(mtcars, 
+            scale_by = 0, 
+            log_vars = vars(mpg, wt), 
+            zero_start = vars(cyl), 
+            zero_one = vars(hp, starts_with('d'))) %>% 
+  describe_all_num()
+# A tibble: 11 x 10
+   Variable     N  Mean    SD   Min    Q1 Median    Q3   Max `% Missing`
+   <chr>    <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl>       <dbl>
+ 1 mpg         32  2.96  0.3   2.34  2.74   2.95  3.13  3.52           0
+ 2 cyl         32  2.19  1.79  0     0      2     4     4              0
+ 3 disp        32  0.4   0.31  0     0.12   0.31  0.64  1              0
+ 4 hp          32  0.33  0.24  0     0.16   0.25  0.45  1              0
+ 5 drat        32  0.39  0.25  0     0.15   0.43  0.53  1              0
+ 6 wt          32  1.12  0.32  0.41  0.95   1.2   1.28  1.69           0
+ 7 qsec        32  0     1.79 -3.35 -0.96  -0.14  1.05  5.05           0
+ 8 vs          32  0     0.5  -0.44 -0.44  -0.44  0.56  0.56           0
+ 9 am          32  0     0.5  -0.41 -0.41  -0.41  0.59  0.59           0
+10 gear        32  0     0.74 -0.69 -0.69   0.31  0.31  1.31           0
+11 carb        32  0     1.62 -1.81 -0.81  -0.81  1.19  5.19           0

Note that centering/standardizing is done to any numeric variables not chosen for log, zero_start, or zero_one.

Here’s a specific function you will probably never need, but will be glad to have if you do. Some data columns have multiple entries for each observation/cell. While it’s understandable why someone would do this, it’s not very good practice. This will split out the entries, or any particular combination of them, into their own indicator column.

-
d = data.frame(id = 1:4,
-               labs = c('A-B', 'B-C-D-E', 'A-E', 'D-E'))
-combn_2_col(data=d, var='labs', max_m=2, sep = '-', collapse = ':', toInteger = T)
-  id    labs                                                                                                combo A B A:B C D E B:C B:D B:E C:D C:E D:E A:E
-1  1     A-B                                                                                            A, B, A:B 1 1   1 0 0 0   0   0   0   0   0   0   0
-2  2 B-C-D-E B, C, D, E, B:C, B:D, B:E, C:D, C:E, D:E, B:C, B:D, B:E, C:D, C:E, D:E, B:C, B:D, B:E, C:D, C:E, D:E 0 1   0 1 1 1   1   1   1   1   1   1   0
-3  3     A-E                                                                                            A, E, A:E 1 0   0 0 0 1   0   0   0   0   0   0   1
-4  4     D-E                                                                                            D, E, D:E 0 0   0 0 1 1   0   0   0   0   0   1   0
-
-combn_2_col(data=d, var='labs', max_m=2, sparse = T)
-4 x 13 sparse Matrix of class "dgCMatrix"
-   [[ suppressing 13 column names 'A', 'B', 'A_B' ... ]]
-                           
-1 1 1 1 . . . . . . . . . .
-2 . 1 . 1 1 1 1 1 1 1 1 1 .
-3 1 . . . . 1 . . . . . . 1
-4 . . . . 1 1 . . . . . 1 .
-

In addition, there is a function that makes tidyr’s spread work like it should when you don’t have unique identifiers. I spent a lot of time coming up with the name for this one.

-
# initial example from spread
-stocks_init <- data.frame(
-  time = as.Date('2009-01-01') + 0:9,
-  X = rnorm(10, 0, 1),
-  Y = rnorm(10, 0, 2),
-  Z = rnorm(10, 0, 4)
-)
-
-# a very common situation
-stocks <- data.frame(
-  X = rnorm(10, 0, 1),
-  Y = rnorm(10, 0, 2),
-  Z = rnorm(10, 0, 4)
-)
-
-
-stocksm_init <- stocks_init %>% 
-  gather(stock, price, -time)
-head(stocksm_init)
-        time stock       price
-1 2009-01-01     X  0.09786637
-2 2009-01-02     X -0.07268211
-3 2009-01-03     X -0.01375654
-4 2009-01-04     X -2.72281902
-5 2009-01-05     X  0.41191338
-6 2009-01-06     X  1.23162468
-
-stocksm_init %>% 
-  spread(stock, price)  %>% 
-  head()                                 # works fine
-        time           X          Y         Z
-1 2009-01-01  0.09786637 -3.1067803  4.115598
-2 2009-01-02 -0.07268211  2.7425990 -6.643963
-3 2009-01-03 -0.01375654 -1.2177186 -3.671878
-4 2009-01-04 -2.72281902  0.1097497 -5.486397
-5 2009-01-05  0.41191338 -2.2411176  3.706905
-6 2009-01-06  1.23162468  2.8402477 -2.990143
-
-# no time
-stocksm <- stocks %>% 
-  gather(stock, price)
-head(stocksm)
-  stock      price
-1     X  1.1563222
-2     X -0.9381941
-3     X  0.5240523
-4     X -0.1280867
-5     X -1.8263122
-6     X  0.5326658
-
-stocksm %>% 
-  spread(stock, price)         # annoying
-Error: Each row of output must be identified by a unique combination of keys.
-Keys are shared for 30 rows:
-* 1, 2, 3, 4, 5, 6, 7, 8, 9, 10
-* 11, 12, 13, 14, 15, 16, 17, 18, 19, 20
-* 21, 22, 23, 24, 25, 26, 27, 28, 29, 30
-
-stocksm %>% 
-  spread2(stock, price)        # works fine
-             X           Y          Z
-1   1.15632219  3.20580570  3.4522024
-2  -0.93819413 -0.05906042 -3.9751471
-3   0.52405233 -0.79476986 -1.9407887
-4  -0.12808670 -0.19162662  2.0707290
-5  -1.82631218 -1.66025840 -1.4190455
-6   0.53266578  1.22780477 -0.7697497
-7   0.04044005  0.11893358  7.6421795
-8   1.01614077 -3.35639245  7.2477876
-9   0.27227199  0.93510359  1.3064328
-10  0.99972077 -3.72109524 -0.1234475
-

However, with unbalanced data the result can be interpreted in different ways, so you’ll have a choice to make.

-
stocksm$price[sample(1:nrow(stocksm), 5)] = NA
-stocksm %>% 
-  spread2(stock, price)
-             X           Y          Z
-1   1.15632219  3.20580570         NA
-2  -0.93819413 -0.05906042 -3.9751471
-3   0.52405233 -0.79476986 -1.9407887
-4           NA -0.19162662  2.0707290
-5           NA -1.66025840 -1.4190455
-6   0.53266578  1.22780477 -0.7697497
-7   0.04044005  0.11893358  7.6421795
-8           NA -3.35639245  7.2477876
-9   0.27227199  0.93510359  1.3064328
-10          NA -3.72109524 -0.1234475
-
-stocksm %>% 
-  spread2(stock, price, compact = FALSE)
-             X           Y          Z
-1   1.15632219          NA         NA
-2  -0.93819413          NA         NA
-3   0.52405233          NA         NA
-4           NA          NA         NA
-5           NA          NA         NA
-6   0.53266578          NA         NA
-7   0.04044005          NA         NA
-8           NA          NA         NA
-9   0.27227199          NA         NA
-10          NA          NA         NA
-11          NA  3.20580570         NA
-12          NA -0.05906042         NA
-13          NA -0.79476986         NA
-14          NA -0.19162662         NA
-15          NA -1.66025840         NA
-16          NA  1.22780477         NA
-17          NA  0.11893358         NA
-18          NA -3.35639245         NA
-19          NA  0.93510359         NA
-20          NA -3.72109524         NA
-21          NA          NA         NA
-22          NA          NA -3.9751471
-23          NA          NA -1.9407887
-24          NA          NA  2.0707290
-25          NA          NA -1.4190455
-26          NA          NA -0.7697497
-27          NA          NA  7.6421795
-28          NA          NA  7.2477876
-29          NA          NA  1.3064328
-30          NA          NA -0.1234475
-

I can speak from experience that having longitudinal data in the compact = FALSE format is about the worst way you could keep such data.

-

Use gather_multi to extend tidyr::gather to deal with multiple sets of variables.

-
demo_data_wide = data.frame(id = 1:10,
-                            X = matrix(rnorm(40), ncol = 4),
-                            Y = matrix(sample(0:1, 40, replace = T), ncol = 4),
-                            Z = matrix(rpois(40, 5), ncol = 4))
-head(demo_data_wide)
-  id         X.1        X.2        X.3        X.4 Y.1 Y.2 Y.3 Y.4 Z.1 Z.2 Z.3 Z.4
-1  1 -0.53445423  0.5065096 -1.3589762  0.7598957   0   1   1   0   2   7   6  11
-2  2 -0.09368249 -1.2568838  0.5249439  0.6650419   0   1   0   1   4   4   6   2
-3  3 -0.33135633 -1.0589727  2.5174225  0.9208250   1   0   1   1   2   3   9   6
-4  4 -1.05702864  0.6320716  0.8481803 -0.3679236   0   0   1   0   4   8   6   0
-5  5  0.18527863 -0.7138931 -2.1592811  0.1919997   0   0   0   1   3   7   4   6
-6  6  0.32825631  0.8307916 -0.4392953 -1.2894000   1   1   0   1   5   6   2  10
-
-gather_multi(demo_data_wide,
-             key = wave,
-             values  = vars(X, Y, Z),
-             varlist = vars(starts_with('X'),
-                            starts_with('Y'),
-                            starts_with('Z')),
-             -id)
-   id wave           X Y  Z
-1   1  X.1 -0.53445423 0  2
-2   2  X.1 -0.09368249 0  4
-3   3  X.1 -0.33135633 1  2
-4   4  X.1 -1.05702864 0  4
-5   5  X.1  0.18527863 0  3
-6   6  X.1  0.32825631 1  5
-7   7  X.1  1.55306164 1  4
-8   8  X.1  1.18040282 0  5
-9   9  X.1 -1.17539903 0  6
-10 10  X.1 -0.42584154 1  7
-11  1  X.2  0.50650964 1  7
-12  2  X.2 -1.25688378 1  4
-13  3  X.2 -1.05897267 0  3
-14  4  X.2  0.63207158 0  8
-15  5  X.2 -0.71389314 0  7
-16  6  X.2  0.83079161 1  6
-17  7  X.2 -0.14366319 0  4
-18  8  X.2  0.48591854 0  6
-19  9  X.2  0.66086652 1  5
-20 10  X.2  2.86171590 1  4
-21  1  X.3 -1.35897615 1  6
-22  2  X.3  0.52494388 0  6
-23  3  X.3  2.51742254 1  9
-24  4  X.3  0.84818031 1  6
-25  5  X.3 -2.15928107 0  4
-26  6  X.3 -0.43929526 0  2
-27  7  X.3  0.65278320 1  8
-28  8  X.3  0.07591408 1  2
-29  9  X.3 -0.55796017 1  8
-30 10  X.3 -0.25082981 0  6
-31  1  X.4  0.75989565 0 11
-32  2  X.4  0.66504187 1  2
-33  3  X.4  0.92082505 1  6
-34  4  X.4 -0.36792358 0  0
-35  5  X.4  0.19199969 1  6
-36  6  X.4 -1.28940004 1 10
-37  7  X.4 -1.69247051 1  7
-38  8  X.4  0.35244834 1  5
-39  9  X.4  0.54339917 1  3
-40 10  X.4  0.83661156 1  4
-

Note that this only makes sense with balanced data, though it will work if NAs are present. Also, you may want to change the key in some fashion.

+
d = data.frame(id = 1:4, labs = c('A-B', 'B-C-D-E', 'A-E', 'D-E'))
+
+combn_2_col(
+  data = d,
+  var = 'labs',
+  max_m = 2,
+  sep = '-',
+  collapse = ':',
+  toInteger = T
+)
+  id    labs                                                                                                combo A B A:B C D E B:C B:D B:E C:D C:E D:E A:E
+1  1     A-B                                                                                            A, B, A:B 1 1   1 0 0 0   0   0   0   0   0   0   0
+2  2 B-C-D-E B, C, D, E, B:C, B:D, B:E, C:D, C:E, D:E, B:C, B:D, B:E, C:D, C:E, D:E, B:C, B:D, B:E, C:D, C:E, D:E 0 1   0 1 1 1   1   1   1   1   1   1   0
+3  3     A-E                                                                                            A, E, A:E 1 0   0 0 0 1   0   0   0   0   0   0   1
+4  4     D-E                                                                                            D, E, D:E 0 0   0 0 1 1   0   0   0   0   0   1   0
+
+combn_2_col(
+  data = d,
+  var = 'labs',
+  max_m = 2,
+  sparse = T
+)
+4 x 13 sparse Matrix of class "dgCMatrix"
+   [[ suppressing 13 column names 'A', 'B', 'A_B' ... ]]
+                           
+1 1 1 1 . . . . . . . . . .
+2 . 1 . 1 1 1 1 1 1 1 1 1 .
+3 1 . . . . 1 . . . . . . 1
+4 . . . . 1 1 . . . . . 1 .

Oftentimes I need to create a column that represents the total scores or means of just a few columns. This is a slight annoyance in the tidyverse, and there isn’t much support behind the dplyr::rowwise function. As such, tidyext has a couple of simple wrappers for row_sums, row_means, and row_apply.

-
d = data.frame(x = 1:3,
-               y = 4:6,
-               z = 7:9,
-               q = NA)
-
-d  %>%
- row_sums(x:y)
-  x y z  q sum
-1 1 4 7 NA   5
-2 2 5 8 NA   7
-3 3 6 9 NA   9
-
-d  %>%
- row_means(matches('x|z'))
-  x y z  q mean
-1 1 4 7 NA    4
-2 2 5 8 NA    5
-3 3 6 9 NA    6
-
-row_apply(
- d ,
- x:z,
- .fun = function(x)
-   apply(x, 1, paste, collapse = '')
-)
-  x y z  q var
-1 1 4 7 NA 147
-2 2 5 8 NA 258
-3 3 6 9 NA 369
+
d = data.frame(x = 1:3,
+               y = 4:6,
+               z = 7:9,
+               q = NA)
+
+d  %>%
+ row_sums(x:y)
+  x y z  q sum
+1 1 4 7 NA   5
+2 2 5 8 NA   7
+3 3 6 9 NA   9
+
+d  %>%
+ row_means(matches('x|z'))
+  x y z  q mean
+1 1 4 7 NA    4
+2 2 5 8 NA    5
+3 3 6 9 NA    6
+
+row_apply(
+ d ,
+ x:z,
+ .fun = function(x)
+   apply(x, 1, paste, collapse = '')
+)
+  x y z  q var
+1 1 4 7 NA 147
+2 2 5 8 NA 258
+3 3 6 9 NA 369







- @@ -623,7 +455,7 @@

-

Site built with pkgdown 1.4.1.

+

Site built with pkgdown 1.5.1.

diff --git a/docs/articles/introduction_files/accessible-code-block-0.0.1/empty-anchor.js b/docs/articles/introduction_files/accessible-code-block-0.0.1/empty-anchor.js new file mode 100644 index 0000000..ca349fd --- /dev/null +++ b/docs/articles/introduction_files/accessible-code-block-0.0.1/empty-anchor.js @@ -0,0 +1,15 @@ +// Hide empty tag within highlighted CodeBlock for screen reader accessibility (see https://github.com/jgm/pandoc/issues/6352#issuecomment-626106786) --> +// v0.0.1 +// Written by JooYoung Seo (jooyoung@psu.edu) and Atsushi Yasumoto on June 1st, 2020. + +document.addEventListener('DOMContentLoaded', function() { + const codeList = document.getElementsByClassName("sourceCode"); + for (var i = 0; i < codeList.length; i++) { + var linkList = codeList[i].getElementsByTagName('a'); + for (var j = 0; j < linkList.length; j++) { + if (linkList[j].innerHTML === "") { + linkList[j].setAttribute('aria-hidden', 'true'); + } + } + } +}); diff --git a/docs/articles/introduction_files/figure-html/describe_cat-1.svg b/docs/articles/introduction_files/figure-html/describe_cat-1.svg index 2619925..b8f80ff 100644 --- a/docs/articles/introduction_files/figure-html/describe_cat-1.svg +++ b/docs/articles/introduction_files/figure-html/describe_cat-1.svg @@ -12,49 +12,49 @@ ]]> - + - - + + - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + -0 -10 -20 -30 -40 -50 - - - - - - - - -a -b -Group -% +0 +10 +20 +30 +40 +50 + + + + + + + + +a +b +Group +% diff --git a/docs/authors.html b/docs/authors.html index a00586f..c93716a 100644 --- a/docs/authors.html +++ b/docs/authors.html @@ -17,23 +17,27 @@ - + - + - + + + + + - - + + - + - - + + @@ -46,7 +50,6 @@ - @@ -64,7 +67,7 @@ - +
@@ -151,7 +154,7 @@

Authors

-

Site built with pkgdown 1.4.1.

+

Site built with pkgdown 1.5.1.

diff --git a/docs/bootstrap-toc.css b/docs/bootstrap-toc.css new file mode 100644 index 0000000..5a85941 --- /dev/null +++ b/docs/bootstrap-toc.css @@ -0,0 +1,60 @@ +/*! + * Bootstrap Table of Contents v0.4.1 (http://afeld.github.io/bootstrap-toc/) + * Copyright 2015 Aidan Feldman + * Licensed under MIT (https://github.com/afeld/bootstrap-toc/blob/gh-pages/LICENSE.md) */ + +/* modified from https://github.com/twbs/bootstrap/blob/94b4076dd2efba9af71f0b18d4ee4b163aa9e0dd/docs/assets/css/src/docs.css#L548-L601 */ + +/* All levels of nav */ +nav[data-toggle='toc'] .nav > li > a { + display: block; + padding: 4px 20px; + font-size: 13px; + font-weight: 500; + color: #767676; +} +nav[data-toggle='toc'] .nav > li > a:hover, +nav[data-toggle='toc'] .nav > li > a:focus { + padding-left: 19px; + color: #563d7c; + text-decoration: none; + background-color: transparent; + border-left: 1px solid #563d7c; +} +nav[data-toggle='toc'] .nav > .active > a, +nav[data-toggle='toc'] .nav > .active:hover > a, +nav[data-toggle='toc'] .nav > .active:focus > a { + padding-left: 18px; + font-weight: bold; + color: #563d7c; + background-color: transparent; + border-left: 2px solid #563d7c; +} + +/* Nav: second level (shown on .active) */ +nav[data-toggle='toc'] .nav .nav { + display: none; /* Hide by default, but at >768px, show it */ + padding-bottom: 10px; +} +nav[data-toggle='toc'] .nav .nav > li > a { + padding-top: 1px; + padding-bottom: 1px; + padding-left: 30px; + font-size: 12px; + font-weight: normal; +} +nav[data-toggle='toc'] .nav .nav > li > a:hover, +nav[data-toggle='toc'] .nav .nav > li > a:focus { + padding-left: 29px; +} +nav[data-toggle='toc'] .nav .nav > .active > a, +nav[data-toggle='toc'] .nav .nav > .active:hover > a, +nav[data-toggle='toc'] .nav .nav > .active:focus > a { + padding-left: 28px; + font-weight: 500; +} + +/* from https://github.com/twbs/bootstrap/blob/e38f066d8c203c3e032da0ff23cd2d6098ee2dd6/docs/assets/css/src/docs.css#L631-L634 */ +nav[data-toggle='toc'] 
.nav > .active > ul { + display: block; +} diff --git a/docs/bootstrap-toc.js b/docs/bootstrap-toc.js new file mode 100644 index 0000000..1cdd573 --- /dev/null +++ b/docs/bootstrap-toc.js @@ -0,0 +1,159 @@ +/*! + * Bootstrap Table of Contents v0.4.1 (http://afeld.github.io/bootstrap-toc/) + * Copyright 2015 Aidan Feldman + * Licensed under MIT (https://github.com/afeld/bootstrap-toc/blob/gh-pages/LICENSE.md) */ +(function() { + 'use strict'; + + window.Toc = { + helpers: { + // return all matching elements in the set, or their descendants + findOrFilter: function($el, selector) { + // http://danielnouri.org/notes/2011/03/14/a-jquery-find-that-also-finds-the-root-element/ + // http://stackoverflow.com/a/12731439/358804 + var $descendants = $el.find(selector); + return $el.filter(selector).add($descendants).filter(':not([data-toc-skip])'); + }, + + generateUniqueIdBase: function(el) { + var text = $(el).text(); + var anchor = text.trim().toLowerCase().replace(/[^A-Za-z0-9]+/g, '-'); + return anchor || el.tagName.toLowerCase(); + }, + + generateUniqueId: function(el) { + var anchorBase = this.generateUniqueIdBase(el); + for (var i = 0; ; i++) { + var anchor = anchorBase; + if (i > 0) { + // add suffix + anchor += '-' + i; + } + // check if ID already exists + if (!document.getElementById(anchor)) { + return anchor; + } + } + }, + + generateAnchor: function(el) { + if (el.id) { + return el.id; + } else { + var anchor = this.generateUniqueId(el); + el.id = anchor; + return anchor; + } + }, + + createNavList: function() { + return $(''); + }, + + createChildNavList: function($parent) { + var $childList = this.createNavList(); + $parent.append($childList); + return $childList; + }, + + generateNavEl: function(anchor, text) { + var $a = $(''); + $a.attr('href', '#' + anchor); + $a.text(text); + var $li = $('
  • '); + $li.append($a); + return $li; + }, + + generateNavItem: function(headingEl) { + var anchor = this.generateAnchor(headingEl); + var $heading = $(headingEl); + var text = $heading.data('toc-text') || $heading.text(); + return this.generateNavEl(anchor, text); + }, + + // Find the first heading level (`

    `, then `

    `, etc.) that has more than one element. Defaults to 1 (for `

    `). + getTopLevel: function($scope) { + for (var i = 1; i <= 6; i++) { + var $headings = this.findOrFilter($scope, 'h' + i); + if ($headings.length > 1) { + return i; + } + } + + return 1; + }, + + // returns the elements for the top level, and the next below it + getHeadings: function($scope, topLevel) { + var topSelector = 'h' + topLevel; + + var secondaryLevel = topLevel + 1; + var secondarySelector = 'h' + secondaryLevel; + + return this.findOrFilter($scope, topSelector + ',' + secondarySelector); + }, + + getNavLevel: function(el) { + return parseInt(el.tagName.charAt(1), 10); + }, + + populateNav: function($topContext, topLevel, $headings) { + var $context = $topContext; + var $prevNav; + + var helpers = this; + $headings.each(function(i, el) { + var $newNav = helpers.generateNavItem(el); + var navLevel = helpers.getNavLevel(el); + + // determine the proper $context + if (navLevel === topLevel) { + // use top level + $context = $topContext; + } else if ($prevNav && $context === $topContext) { + // create a new level of the tree and switch to it + $context = helpers.createChildNavList($prevNav); + } // else use the current $context + + $context.append($newNav); + + $prevNav = $newNav; + }); + }, + + parseOps: function(arg) { + var opts; + if (arg.jquery) { + opts = { + $nav: arg + }; + } else { + opts = arg; + } + opts.$scope = opts.$scope || $(document.body); + return opts; + } + }, + + // accepts a jQuery object, or an options object + init: function(opts) { + opts = this.helpers.parseOps(opts); + + // ensure that the data attribute is in place for styling + opts.$nav.attr('data-toggle', 'toc'); + + var $topContext = this.helpers.createChildNavList(opts.$nav); + var topLevel = this.helpers.getTopLevel(opts.$scope); + var $headings = this.helpers.getHeadings(opts.$scope, topLevel); + this.helpers.populateNav($topContext, topLevel, $headings); + } + }; + + $(function() { + $('nav[data-toggle="toc"]').each(function(i, el) { + var $nav = $(el); + 
Toc.init($nav); + }); + }); +})(); diff --git a/docs/index.html b/docs/index.html index dbaab5c..76e61cf 100644 --- a/docs/index.html +++ b/docs/index.html @@ -12,21 +12,21 @@ - - - - + + + + + - - +

    Overview

    @@ -106,7 +106,7 @@

    Installation

    To install from GitHub the devtools package is required.

    -
    devtools::install_github('m-clark/tidyext')
    +
    devtools::install_github('m-clark/tidyext')

    Note that this package more or less assumes you are working within the tidyverse, especially dplyr. As such you should have the tidyverse packages installed.

    @@ -117,12 +117,10 @@

  • combn_2_col: Takes a column with multiple entries per cell and creates indicator columns of all possible combinations of the cell values up to m combinations.

  • create_prediction_data: Straightforward way to quickly create data to make model predictions.

  • describe_all: A summary function for mixed data types that provides the information I usually want. Saves one from doing a group_by %>% summarize operation to create multiple results for multiple types of variables. Has corresponding describe_all_num and describe_all_cat for numeric-only and categorical-only data respectively.

  • -
  • gather_multi: Gather multiple sets of variables.

  • num_by: A quick summarize, possibly with dplyr::group_by, that provides things like mean, sd, etc. See num_summary.

  • num_summary: A little better than the base R summary, gives the info one typically wants as well as options for rounding and other statistics.

  • onehot: A function for one-hot encoding with a few helpful options for dealing with missing data, using sparse matrices, and more.

  • pre_process: Easily pre-process a data set with common operations like standardization, logging, etc.

  • -
  • spread2: The tidyr spread without the duplicate row id problem.

  • sum_NA, sum_NaN, sum_blank: Understand your nothingness.

  • row_sums, row_means, row_apply: Simple (intuitive) rowwise calculations.

  • @@ -149,11 +147,11 @@

    -
    -

    Site built with pkgdown 1.4.1.

    +

    Site built with pkgdown 1.5.1.

    diff --git a/docs/reference/describe_all-1.svg b/docs/reference/describe_all-1.svg index 1991084..57f10e4 100644 --- a/docs/reference/describe_all-1.svg +++ b/docs/reference/describe_all-1.svg @@ -12,50 +12,50 @@ ]]> - + - - + + - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + -25 -30 -35 -40 -45 - - - - - - - - -8 -4 -6 -Group -% +25 +30 +35 +40 +45 + + + + + + + + +8 +4 +6 +Group +% diff --git a/docs/reference/describe_all.html b/docs/reference/describe_all.html index 396b27d..9f1d305 100644 --- a/docs/reference/describe_all.html +++ b/docs/reference/describe_all.html @@ -17,23 +17,27 @@ - + - + - + + + + + - - + + - + - - + + @@ -47,7 +51,6 @@ - @@ -65,7 +68,7 @@ - +
    @@ -171,7 +174,7 @@

    Arg digits -

    See round. Default is 2, which for +

    See [base::round()]. Default is 2, which for categorical is applied to the proportion (i.e. before converting to percentage).

    @@ -252,20 +255,22 @@

    Examp #> intersect, setdiff, setequal, union

    X = data.frame(f1 =gl(2, 1, 20, labels=c('A', 'B')), f2=gl(2, 2, 20, labels=c('X', 'Q'))) X = X %>% mutate(bin1 = rbinom(20, 1, p=.5), - logic1 = sample(c(TRUE, FALSE), 20, replace = TRUE), + logic1 = sample(c(TRUE, FALSE), 20, replace = TRUE), num1 = rnorm(20), num2 = rpois(20, 5), - char1 = sample(letters, 20, replace = TRUE)) + char1 = sample(letters, 20, replace = TRUE)) describe_all(X)
    #> $`Numeric Variables` -#> Variable N Mean SD Min Q1 Median Q3 Max Missing -#> 1 bin1 20 0.30 0.47 0.00 0.00 0.00 1.00 1.00 0 -#> 2 num1 20 0.01 0.99 -1.91 -0.44 0.05 0.54 1.89 0 -#> 3 num2 20 4.95 2.42 1.00 3.00 5.00 6.00 12.00 0 +#> # A tibble: 3 x 10 +#> Variable N Mean SD Min Q1 Median Q3 Max `% Missing` +#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> +#> 1 bin1 20 0.3 0.47 0 0 0 1 1 0 +#> 2 num1 20 0.01 0.99 -1.91 -0.44 0.05 0.54 1.89 0 +#> 3 num2 20 4.95 2.42 1 3 5 6 12 0 #> #> $`Categorical Variables` -#> # A tibble: 19 x 4 +#> # A tibble: 19 x 4 #> Variable Group Frequency `%` -#> <chr> <chr> <int> <dbl> +#> <chr> <fct> <int> <dbl> #> 1 f1 A 10 50 #> 2 f1 B 10 50 #> 3 f2 Q 10 50 @@ -301,35 +306,25 @@

    Examp #> 6 x 6 1 14.3 #> 7 x 7 1 14.3 #>

    describe_all(mtcars, digits=5, include_numeric=TRUE, max_levels=3)
    #> $`Numeric Variables` -#> Variable N Mean SD Min Q1 Median Q3 Max -#> 1 mpg 32 20.09062 6.02695 10.400 15.42500 19.200 22.80 33.900 -#> 2 cyl 32 6.18750 1.78592 4.000 4.00000 6.000 8.00 8.000 -#> 3 disp 32 230.72188 123.93869 71.100 120.82500 196.300 326.00 472.000 -#> 4 hp 32 146.68750 68.56287 52.000 96.50000 123.000 180.00 335.000 -#> 5 drat 32 3.59656 0.53468 2.760 3.08000 3.695 3.92 4.930 -#> 6 wt 32 3.21725 0.97846 1.513 2.58125 3.325 3.61 5.424 -#> 7 qsec 32 17.84875 1.78694 14.500 16.89250 17.710 18.90 22.900 -#> 8 vs 32 0.43750 0.50402 0.000 0.00000 0.000 1.00 1.000 -#> 9 am 32 0.40625 0.49899 0.000 0.00000 0.000 1.00 1.000 -#> 10 gear 32 3.68750 0.73780 3.000 3.00000 4.000 4.00 5.000 -#> 11 carb 32 2.81250 1.61520 1.000 2.00000 2.000 4.00 8.000 -#> Missing -#> 1 0 -#> 2 0 -#> 3 0 -#> 4 0 -#> 5 0 -#> 6 0 -#> 7 0 -#> 8 0 -#> 9 0 -#> 10 0 -#> 11 0 +#> # A tibble: 11 x 10 +#> Variable N Mean SD Min Q1 Median Q3 Max `% Missing` +#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> +#> 1 mpg 32 20.1 6.03 10.4 15.4 19.2 22.8 33.9 0 +#> 2 cyl 32 6.19 1.79 4 4 6 8 8 0 +#> 3 disp 32 231. 124. 71.1 121. 196. 326 472 0 +#> 4 hp 32 147. 68.6 52 96.5 123 180 335 0 +#> 5 drat 32 3.60 0.535 2.76 3.08 3.70 3.92 4.93 0 +#> 6 wt 32 3.22 0.978 1.51 2.58 3.32 3.61 5.42 0 +#> 7 qsec 32 17.8 1.79 14.5 16.9 17.7 18.9 22.9 0 +#> 8 vs 32 0.438 0.504 0 0 0 1 1 0 +#> 9 am 32 0.406 0.499 0 0 0 1 1 0 +#> 10 gear 32 3.69 0.738 3 3 4 4 5 0 +#> 11 carb 32 2.81 1.62 1 2 2 4 8 0 #> #> $`Categorical Variables` #> # A tibble: 10 x 4 #> Variable Group Frequency `%` -#> <chr> <chr> <int> <dbl> +#> <chr> <fct> <int> <dbl> #> 1 cyl 8 14 43.8 #> 2 cyl 4 11 34.4 #> 3 cyl 6 7 21.9 @@ -350,20 +345,14 @@

    Examp filter(Variable == 'cyl') %>% tidyr::unnest() %>% ggplot(aes(x=Group, y=`%`)) + - geom_point(size = 10)

    #> Warning: `cols` is now required. + geom_point(size = 10)
    #> Warning: `cols` is now required when using unnest(). #> Please use `cols = c(data)`
    - @@ -374,7 +363,7 @@

    Contents

    -

    Site built with pkgdown 1.4.1.

    +

    Site built with pkgdown 1.5.1.

    diff --git a/docs/reference/gather_multi.html b/docs/reference/gather_multi.html index 9d4002c..4e4c5ea 100644 --- a/docs/reference/gather_multi.html +++ b/docs/reference/gather_multi.html @@ -17,23 +17,27 @@ - + - + - + + + + + - - + + - + - - + + @@ -46,9 +50,9 @@ + columns. It is now deprecated and no longer needed due to advancements in + `tidyr`." /> - @@ -66,7 +70,7 @@ - +
    @@ -138,7 +142,8 @@

    Extend tidyr gather to multiple sets of variables

    This function extends gather to work on multiple sets of - columns.

    + columns. It is now deprecated and no longer needed due to advancements in + `tidyr`.

    gather_multi(
    @@ -233,12 +238,13 @@ 

    Note

    You can use the key_func argument to pretty it up.

    Examples

    -
    library(tidyext); library(dplyr) +
    if (FALSE) { +library(tidyext); library(dplyr) # example of longitudinal data with 4 waves demo_data_wide = data.frame(id = 1:10, X = matrix(rnorm(40), ncol = 4), - Y = matrix(sample(0:1, 40, replace = TRUE), + Y = matrix(sample(0:1, 40, replace = TRUE), ncol = 4), Z = matrix(rpois(40, 5), ncol = 4)) @@ -248,47 +254,8 @@

    Examp varlist = vars(starts_with('X'), starts_with('Y'), starts_with('Z'))) -test

    #> id wave X Y Z -#> 1 1 X.1 0.1684728 1 1 -#> 2 2 X.1 1.3970665 0 6 -#> 3 3 X.1 -0.6790954 1 4 -#> 4 4 X.1 0.7376294 1 8 -#> 5 5 X.1 -0.8607243 0 6 -#> 6 6 X.1 0.4212304 0 6 -#> 7 7 X.1 1.4505432 0 6 -#> 8 8 X.1 0.1943924 0 5 -#> 9 9 X.1 -0.6912054 0 3 -#> 10 10 X.1 1.3398599 1 5 -#> 11 1 X.2 2.7361084 1 4 -#> 12 2 X.2 -0.9441017 1 6 -#> 13 3 X.2 -1.7810619 1 6 -#> 14 4 X.2 -0.7160587 0 4 -#> 15 5 X.2 0.9110785 0 9 -#> 16 6 X.2 -0.7721921 1 4 -#> 17 7 X.2 -0.7820777 0 2 -#> 18 8 X.2 -0.4321952 1 5 -#> 19 9 X.2 -0.6675648 0 3 -#> 20 10 X.2 1.3895059 0 3 -#> 21 1 X.3 0.9118739 0 9 -#> 22 2 X.3 0.2053894 0 7 -#> 23 3 X.3 2.5844322 0 6 -#> 24 4 X.3 -0.7893881 0 5 -#> 25 5 X.3 0.5880771 0 5 -#> 26 6 X.3 -0.7112873 0 2 -#> 27 7 X.3 1.5849968 0 4 -#> 28 8 X.3 0.6763896 0 3 -#> 29 9 X.3 -0.2327618 0 6 -#> 30 10 X.3 0.6374729 0 6 -#> 31 1 X.4 -1.3707612 1 9 -#> 32 2 X.4 -1.4256595 0 3 -#> 33 3 X.4 -1.2461920 0 10 -#> 34 4 X.4 -0.6832669 0 4 -#> 35 5 X.4 -0.9796754 0 6 -#> 36 6 X.4 -0.4625191 1 7 -#> 37 7 X.4 1.2145097 1 2 -#> 38 8 X.4 -1.2778199 1 5 -#> 39 9 X.4 0.7478688 0 7 -#> 40 10 X.4 3.3915088 0 4
    +test + test <- gather_multi(demo_data_wide, key = wave, values = c('X', 'Y', 'Z'), @@ -296,58 +263,13 @@

    Examp starts_with('Y'), starts_with('Z')), key_func = function(x) substr(x, start=3, stop=3)) -test

    #> id wave X Y Z -#> 1 1 1 0.1684728 1 1 -#> 2 2 1 1.3970665 0 6 -#> 3 3 1 -0.6790954 1 4 -#> 4 4 1 0.7376294 1 8 -#> 5 5 1 -0.8607243 0 6 -#> 6 6 1 0.4212304 0 6 -#> 7 7 1 1.4505432 0 6 -#> 8 8 1 0.1943924 0 5 -#> 9 9 1 -0.6912054 0 3 -#> 10 10 1 1.3398599 1 5 -#> 11 1 2 2.7361084 1 4 -#> 12 2 2 -0.9441017 1 6 -#> 13 3 2 -1.7810619 1 6 -#> 14 4 2 -0.7160587 0 4 -#> 15 5 2 0.9110785 0 9 -#> 16 6 2 -0.7721921 1 4 -#> 17 7 2 -0.7820777 0 2 -#> 18 8 2 -0.4321952 1 5 -#> 19 9 2 -0.6675648 0 3 -#> 20 10 2 1.3895059 0 3 -#> 21 1 3 0.9118739 0 9 -#> 22 2 3 0.2053894 0 7 -#> 23 3 3 2.5844322 0 6 -#> 24 4 3 -0.7893881 0 5 -#> 25 5 3 0.5880771 0 5 -#> 26 6 3 -0.7112873 0 2 -#> 27 7 3 1.5849968 0 4 -#> 28 8 3 0.6763896 0 3 -#> 29 9 3 -0.2327618 0 6 -#> 30 10 3 0.6374729 0 6 -#> 31 1 4 -1.3707612 1 9 -#> 32 2 4 -1.4256595 0 3 -#> 33 3 4 -1.2461920 0 10 -#> 34 4 4 -0.6832669 0 4 -#> 35 5 4 -0.9796754 0 6 -#> 36 6 4 -0.4625191 1 7 -#> 37 7 4 1.2145097 1 2 -#> 38 8 4 -1.2778199 1 5 -#> 39 9 4 0.7478688 0 7 -#> 40 10 4 3.3915088 0 4
    +test +}
    - @@ -358,7 +280,7 @@

    Contents

    -

    Site built with pkgdown 1.4.1.

    +

    Site built with pkgdown 1.5.1.

    diff --git a/docs/reference/head_tail.html b/docs/reference/head_tail.html index 2bfe6e2..fc5f73a 100644 --- a/docs/reference/head_tail.html +++ b/docs/reference/head_tail.html @@ -17,23 +17,27 @@ - + - + - + + + + + - - + + - + - - + + @@ -47,7 +51,6 @@ - @@ -65,7 +68,7 @@ - +
    @@ -164,8 +167,8 @@

    Details

    Examples

    library(tidyext) -as.matrix(mtcars) %>% -head_tail(6)
    #> index mpg cyl disp hp drat wt qsec vs am gear carb + +head_tail(mtcars)
    #> index mpg cyl disp hp drat wt qsec vs am gear carb #> 1 1 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4 #> 2 2 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4 #> 3 3 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1 @@ -180,15 +183,10 @@

    Examp #> 12 32 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2

    - @@ -199,7 +197,7 @@

    Contents

    -

    Site built with pkgdown 1.4.1.

    +

    Site built with pkgdown 1.5.1.

    diff --git a/docs/reference/index.html b/docs/reference/index.html index 35195e8..66a7775 100644 --- a/docs/reference/index.html +++ b/docs/reference/index.html @@ -17,23 +17,27 @@ - + - + - + + + + + - - + + - + - - + + @@ -46,7 +50,6 @@ - @@ -64,7 +67,7 @@ - +
    @@ -147,6 +150,11 @@

    Functions for summarizing data.

    + + + + + @@ -179,6 +187,11 @@

    Functions for creating data.

    + + + + + @@ -212,13 +225,7 @@

    spread2()

    - -

    Spread without frustration

    - - - -

    row_sums() row_means() row_apply()

    +

    row_sums() row_means() row_min() row_max() row_apply()

    Apply simple rowwise functions

    @@ -229,6 +236,11 @@

    Miscellaneous functions.

    + + + + + @@ -237,6 +249,18 @@

    head_tail()

    + +

    Show head and tail simultaneously

    + + + +

    rnd()

    + +

    Sensible rounding for printing.

    + +

    tidyext

    @@ -246,13 +270,10 @@

    -

    Contents

    -
    +

    @@ -263,7 +284,7 @@

    Contents

    -

    Site built with pkgdown 1.4.1.

    +

    Site built with pkgdown 1.5.1.

    diff --git a/docs/reference/num_by.html b/docs/reference/num_by.html index 375c0ed..d40fbfb 100644 --- a/docs/reference/num_by.html +++ b/docs/reference/num_by.html @@ -17,23 +17,27 @@ - + - + - + + + + + - - + + - + - - + + @@ -47,7 +51,6 @@ - @@ -65,7 +68,7 @@ - +
    @@ -182,7 +185,7 @@

    Arg sort_by_group

    when supplied a grouping variable for cat_by, do you -want result sorted on the grouping variable? Default is TRUE.

    +want the result sorted on the grouping variable? Default is TRUE.

    @@ -196,7 +199,7 @@

    Details median, third quartile, max, and number of missing values, possibly over a grouping variable.

    It works in the dplyr style using unquoted (bare) variable names, using the - vars() function if there is more than one variable. If using a + vars() function if there is more than one variable. If using a grouping variable, it will treat missing values as a separate group.

    For cat_by, frequencies and percentage (out of total or group_var) are returned. Warnings are given if any of the main @@ -217,70 +220,66 @@

    Examp g2 = sample(1:4, 50, replace = TRUE), a = rnorm(50), b = rpois(50, 10), - c = sample(letters, 50, replace=TRUE), - d = sample(c(TRUE, FALSE), 50, replace=TRUE) + c = sample(letters, 50, replace = TRUE), + d = sample(c(TRUE, FALSE), 50, replace = TRUE) ) -num_by(df1, main_var = a)

    #> Variable N Mean SD Min Q1 Median Q3 Max Missing -#> 1 a 50 -0.1 1.1 -3.7 -0.6 -0.1 0.7 1.9 0
    num_by(df1, main_var = a, group_var = g2, digits=2)
    #> Adding missing grouping variables: `g2`
    #> Adding missing grouping variables: `g2`
    #> # A tibble: 4 x 11 +num_by(df1, main_var = a)
    #> # A tibble: 1 x 10 +#> Variable N Mean SD Min Q1 Median Q3 Max `% Missing` +#> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> +#> 1 a 50 -0.1 1.1 -2.2 -0.9 -0.2 0.6 2 0
    num_by(df1, main_var = a, group_var = g2, digits = 2)
    #> Adding missing grouping variables: `g2`
    #> Adding missing grouping variables: `g2`
    #> # A tibble: 4 x 11 #> # Groups: g2 [4] -#> g2 Variable N Mean SD Min Q1 Median Q3 Max Missing -#> <int> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> -#> 1 1 a 13 -0.74 1.27 -3.71 -0.78 -0.61 -0.290 1.06 0 -#> 2 2 a 13 0.37 1.26 -2.29 -0.31 0.68 1.57 1.86 0 -#> 3 3 a 11 -0.14 0.87 -1.77 -0.71 -0.07 0.6 1.18 0 -#> 4 4 a 13 0.1 0.67 -1.29 -0.25 0.01 0.38 1.2 0
    +#> g2 Variable N Mean SD Min Q1 Median Q3 Max `% Missing` +#> <int> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> +#> 1 4 a 14 -0.17 0.87 -1.64 -0.82 -0.28 0.51 1.2 0 +#> 2 3 a 10 -0.24 1.35 -2.18 -1.23 -0.1 0.4 1.74 0 +#> 3 2 a 13 0.15 1.29 -2.12 -0.78 0.35 0.94 2.04 0 +#> 4 1 a 13 -0.21 1.04 -1.87 -0.78 -0.22 0.39 1.96 0
    num_by(df1, main_var = dplyr::vars(a,b), group_var = g1, digits=1)
    #> Adding missing grouping variables: `g1`
    #> Adding missing grouping variables: `g1`
    #> # A tibble: 4 x 11 #> # Groups: g1 [2] -#> g1 Variable N Mean SD Min Q1 Median Q3 Max Missing -#> <fct> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> -#> 1 a a 20 -0.3 0.9 -2.3 -0.8 -0.3 0.4 1.2 0 -#> 2 a b 20 9.8 3.1 6 7 10 11.5 16 0 -#> 3 b a 30 0 1.2 -3.7 -0.5 0 0.8 1.9 0 -#> 4 b b 30 10.2 3.9 2 8 10 13 17 0
    +#> g1 Variable N Mean SD Min Q1 Median Q3 Max `% Missing` +#> <fct> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> +#> 1 b a 19 -0.6 1.1 -2.2 -1.6 -0.8 0.1 1.6 0 +#> 2 b b 19 9.9 2.8 5 8.5 10 13 13 0 +#> 3 a a 31 0.2 1 -1.6 -0.7 0.3 0.7 2 0 +#> 4 a b 31 11.2 3.5 6 9 11 14.5 18 0
    cat_by(df1, main_var = g1, group_var = g2, digits=1)
    #> `mutate_if()` ignored the following grouping variables: #> Column `g2`
    #> # A tibble: 8 x 5 #> # Groups: g2 [4] #> g2 g1 N `% of Total` `% of g2` #> <int> <fct> <dbl> <dbl> <dbl> -#> 1 1 a 4 8 30.8 -#> 2 1 b 9 18 69.2 -#> 3 2 a 5 10 38.5 -#> 4 2 b 8 16 61.5 -#> 5 3 a 7 14 63.6 -#> 6 3 b 4 8 36.4 -#> 7 4 a 4 8 30.8 -#> 8 4 b 9 18 69.2
    cat_by(df1, main_var = dplyr::vars(g1,d), group_var = g2, perc_by_group=FALSE)
    #> # A tibble: 15 x 5 +#> 1 1 a 8 16 61.5 +#> 2 1 b 5 10 38.5 +#> 3 2 a 9 18 69.2 +#> 4 2 b 4 8 30.8 +#> 5 3 a 5 10 50 +#> 6 3 b 5 10 50 +#> 7 4 a 9 18 64.3 +#> 8 4 b 5 10 35.7
    cat_by(df1, main_var = dplyr::vars(g1,d), group_var = g2, perc_by_group = FALSE)
    #> # A tibble: 15 x 5 #> g2 g1 d N `% of Total` #> <int> <fct> <lgl> <int> <dbl> -#> 1 1 a FALSE 1 2 +#> 1 1 a FALSE 5 10 #> 2 1 a TRUE 3 6 -#> 3 1 b FALSE 2 4 -#> 4 1 b TRUE 7 14 -#> 5 2 a FALSE 3 6 -#> 6 2 a TRUE 2 4 -#> 7 2 b FALSE 2 4 -#> 8 2 b TRUE 6 12 -#> 9 3 a FALSE 3 6 -#> 10 3 a TRUE 4 8 -#> 11 3 b FALSE 3 6 -#> 12 3 b TRUE 1 2 +#> 3 1 b FALSE 1 2 +#> 4 1 b TRUE 4 8 +#> 5 2 a FALSE 5 10 +#> 6 2 a TRUE 4 8 +#> 7 2 b TRUE 4 8 +#> 8 3 a FALSE 1 2 +#> 9 3 a TRUE 4 8 +#> 10 3 b FALSE 4 8 +#> 11 3 b TRUE 1 2 +#> 12 4 a FALSE 5 10 #> 13 4 a TRUE 4 8 -#> 14 4 b FALSE 5 10 +#> 14 4 b FALSE 1 2 #> 15 4 b TRUE 4 8
    - @@ -291,7 +290,7 @@

    Contents

    -

    Site built with pkgdown 1.4.1.

    +

    Site built with pkgdown 1.5.1.

    diff --git a/docs/reference/num_summary.html b/docs/reference/num_summary.html index 0793ad7..1a091f1 100644 --- a/docs/reference/num_summary.html +++ b/docs/reference/num_summary.html @@ -17,23 +17,27 @@ - + - + - + + + + + - - + + - + - - + + @@ -47,7 +51,6 @@ - @@ -65,7 +68,7 @@ - +
    @@ -171,19 +174,19 @@

    Details

    Examples

    library(tidyext) -num_summary(c(1:10, NA))
    #> N Mean SD Min Q1 Median Q3 Max Missing -#> 1 10 5.5 3 1 3.2 5.5 7.8 10 1
    num_summary(c('1','2','3'))
    #> N Mean SD Min Q1 Median Q3 Max Missing -#> 1 3 2 1 1 1.5 2 2.5 3 0
    +num_summary(c(1:10, NA))
    #> # A tibble: 1 x 9 +#> N Mean SD Min Q1 Median Q3 Max `% Missing` +#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> +#> 1 10 5.5 3 1 3.2 5.5 7.8 10 9
    num_summary(c('1','2','3'))
    #> # A tibble: 1 x 9 +#> N Mean SD Min Q1 Median Q3 Max `% Missing` +#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> +#> 1 3 2 1 1 1.5 2 2.5 3 0
    +
    - @@ -194,7 +197,7 @@

    Contents

    -

    Site built with pkgdown 1.4.1.

    +

    Site built with pkgdown 1.5.1.

    diff --git a/docs/reference/onehot.html b/docs/reference/onehot.html index 919bc90..217f666 100644 --- a/docs/reference/onehot.html +++ b/docs/reference/onehot.html @@ -17,23 +17,27 @@ - + - + - + + + + + - - + + - + - - + + @@ -47,7 +51,6 @@ - @@ -65,7 +68,7 @@ - +
    @@ -233,31 +236,25 @@

    Examp iris2 = iris iris2[sample(1:150, 25),] = NA str(onehot(iris2))

    #> 'data.frame': 150 obs. of 7 variables: -#> $ Sepal.Length : num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ... -#> $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ... -#> $ Petal.Length : num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ... -#> $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ... -#> $ Species_setosa : num 1 1 1 1 1 1 1 1 1 1 ... -#> $ Species_versicolor: num 0 0 0 0 0 0 0 0 0 0 ... -#> $ Species_virginica : num 0 0 0 0 0 0 0 0 0 0 ...
    str(onehot(iris2, nas = 'na.omit'))
    #> 'data.frame': 125 obs. of 7 variables: -#> $ Sepal.Length : num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ... -#> $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ... -#> $ Petal.Length : num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ... -#> $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ... +#> $ Sepal.Length : num 5.1 4.9 NA NA 5 NA 4.6 NA 4.4 4.9 ... +#> $ Sepal.Width : num 3.5 3 NA NA 3.6 NA 3.4 NA 2.9 3.1 ... +#> $ Petal.Length : num 1.4 1.4 NA NA 1.4 NA 1.4 NA 1.4 1.5 ... +#> $ Petal.Width : num 0.2 0.2 NA NA 0.2 NA 0.3 NA 0.2 0.1 ... +#> $ Species_setosa : num 1 1 NA NA 1 NA 1 NA 1 1 ... +#> $ Species_versicolor: num 0 0 NA NA 0 NA 0 NA 0 0 ... +#> $ Species_virginica : num 0 0 NA NA 0 NA 0 NA 0 0 ...
    str(onehot(iris2, nas = 'na.omit'))
    #> 'data.frame': 125 obs. of 7 variables: +#> $ Sepal.Length : num 5.1 4.9 5 4.6 4.4 4.9 4.8 4.8 4.3 5.8 ... +#> $ Sepal.Width : num 3.5 3 3.6 3.4 2.9 3.1 3.4 3 3 4 ... +#> $ Petal.Length : num 1.4 1.4 1.4 1.4 1.4 1.5 1.6 1.4 1.1 1.2 ... +#> $ Petal.Width : num 0.2 0.2 0.2 0.3 0.2 0.1 0.2 0.1 0.1 0.2 ... #> $ Species_setosa : num 1 1 1 1 1 1 1 1 1 1 ... #> $ Species_versicolor: num 0 0 0 0 0 0 0 0 0 0 ... #> $ Species_virginica : num 0 0 0 0 0 0 0 0 0 0 ...
    - @@ -268,7 +265,7 @@

    Contents

    -

    Site built with pkgdown 1.4.1.

    +

    Site built with pkgdown 1.5.1.

    diff --git a/docs/reference/pre_process.html b/docs/reference/pre_process.html index 1c2efe1..6915b59 100644 --- a/docs/reference/pre_process.html +++ b/docs/reference/pre_process.html @@ -17,23 +17,27 @@ - + - + - + + + + + - - + + - + - - + + @@ -47,7 +51,6 @@ - @@ -65,7 +68,7 @@ - +
    @@ -166,19 +169,19 @@

    Arg log_vars -

    Which variables to log. Requires vars().

    +

    Which variables to log. Requires `vars()`.

    log_base -

    Log base. Default is exp(1).

    +

    Log base. Default is `exp(1)`.

    zero_start -

    Which variables to start by zero. Requires vars().

    +

    Which variables to start by zero. Requires `vars()`.

    zero_one -

    Which variables to rescale from 0 to 1. Requires vars().

    +

    Which variables to rescale from 0 to 1. Requires `vars()`.

    @@ -207,6 +210,8 @@

    Details

    Examples

    library(tidyext) +library(dplyr) + pre_process(mtcars)
    #> mpg cyl disp hp drat wt #> 1 0.15088482 -0.1049878 -0.57061982 -0.53509284 0.56751369 -0.610399567 #> 2 0.15088482 -0.1049878 -0.57061982 -0.53509284 0.56751369 -0.349785269 @@ -272,7 +277,7 @@

    Examp #> 29 -1.87401028 -0.8680278 1.1899014 1.7789276 0.7352031 #> 30 -1.31439542 -0.8680278 1.1899014 1.7789276 1.9734398 #> 31 -1.81804880 -0.8680278 1.1899014 1.7789276 3.2116766 -#> 32 0.42041067 1.1160357 1.1899014 0.4235542 -0.5030337

    pre_process(mtcars, log_vars=vars(mpg, wt))
    #> mpg cyl disp hp drat wt +#> 32 0.42041067 1.1160357 1.1899014 0.4235542 -0.5030337
    pre_process(mtcars, log_vars = vars(mpg, wt))
    #> mpg cyl disp hp drat wt #> 1 3.044522 -0.1049878 -0.57061982 -0.53509284 0.56751369 0.9631743 #> 2 3.044522 -0.1049878 -0.57061982 -0.53509284 0.56751369 1.0560527 #> 3 3.126761 -1.2248578 -0.99018209 -0.78304046 0.47399959 0.8415672 @@ -337,7 +342,7 @@

    Examp #> 29 -1.87401028 -0.8680278 1.1899014 1.7789276 0.7352031 #> 30 -1.31439542 -0.8680278 1.1899014 1.7789276 1.9734398 #> 31 -1.81804880 -0.8680278 1.1899014 1.7789276 3.2116766 -#> 32 0.42041067 1.1160357 1.1899014 0.4235542 -0.5030337

    pre_process(mtcars, zero_start=vars(cyl, gear))
    #> mpg cyl disp hp drat wt qsec +#> 32 0.42041067 1.1160357 1.1899014 0.4235542 -0.5030337
    pre_process(mtcars, zero_start = vars(cyl, gear))
    #> mpg cyl disp hp drat wt qsec #> 1 0.15088482 2 -0.57061982 -0.53509284 0.56751369 -0.610399567 -0.77716515 #> 2 0.15088482 2 -0.57061982 -0.53509284 0.56751369 -0.349785269 -0.46378082 #> 3 0.44954345 0 -0.99018209 -0.78304046 0.47399959 -0.917004624 0.42600682 @@ -402,7 +407,7 @@

    Examp #> 29 -0.8680278 1.1899014 2 0.7352031 #> 30 -0.8680278 1.1899014 2 1.9734398 #> 31 -0.8680278 1.1899014 2 3.2116766 -#> 32 1.1160357 1.1899014 1 -0.5030337

    pre_process(mtcars, zero_one=vars(mpg))
    #> mpg cyl disp hp drat wt +#> 32 1.1160357 1.1899014 1 -0.5030337
    pre_process(mtcars, zero_one = vars(mpg))
    #> mpg cyl disp hp drat wt #> 1 0.4510638 -0.1049878 -0.57061982 -0.53509284 0.56751369 -0.610399567 #> 2 0.4510638 -0.1049878 -0.57061982 -0.53509284 0.56751369 -0.349785269 #> 3 0.5276596 -1.2248578 -0.99018209 -0.78304046 0.47399959 -0.917004624 @@ -467,17 +472,13 @@

    Examp #> 29 -1.87401028 -0.8680278 1.1899014 1.7789276 0.7352031 #> 30 -1.31439542 -0.8680278 1.1899014 1.7789276 1.9734398 #> 31 -1.81804880 -0.8680278 1.1899014 1.7789276 3.2116766 -#> 32 0.42041067 1.1160357 1.1899014 0.4235542 -0.5030337

    +#> 32 0.42041067 1.1160357 1.1899014 0.4235542 -0.5030337
    +
    - @@ -488,7 +489,7 @@

    Contents

    -

    Site built with pkgdown 1.4.1.

    +

    Site built with pkgdown 1.5.1.

    diff --git a/docs/reference/rnd.html b/docs/reference/rnd.html index f588b66..0be4572 100644 --- a/docs/reference/rnd.html +++ b/docs/reference/rnd.html @@ -17,23 +17,27 @@ - + - + - + + + + + - - + + - + - - + + @@ -47,7 +51,6 @@ - @@ -65,7 +68,7 @@ - +
    @@ -169,15 +172,10 @@

    Examp rnd(rnorm(10))

    #> [1] "-0.560" "-0.230" " 1.559" " 0.071" " 0.129" " 1.715" " 0.461" "-1.265" #> [9] "-0.687" "-0.446"
    rnd(1)
    #> [1] "1.000"
    - @@ -188,7 +186,7 @@

    Contents

    -

    Site built with pkgdown 1.4.1.

    +

    Site built with pkgdown 1.5.1.

    diff --git a/docs/reference/row_sums.html b/docs/reference/row_sums.html index 5f04252..cb0f65e 100644 --- a/docs/reference/row_sums.html +++ b/docs/reference/row_sums.html @@ -17,23 +17,27 @@ - + - + - + + + + + - - + + - + - - + + @@ -48,7 +52,6 @@ - @@ -66,7 +69,7 @@ - +
    @@ -145,6 +148,10 @@

    Apply simple rowwise functions

    row_means(data, ..., na_rm = FALSE, varname = "mean") +row_min(data, ..., na_rm = FALSE, varname = "min") + +row_max(data, ..., na_rm = FALSE, varname = "max") + row_apply(data, ..., .fun, varname = "var")

    Arguments

    @@ -164,7 +171,7 @@

    Arg varname -

    The column name of the sums means etc.

    +

    The column name of the sums, means, etc., as a character string.

    .fun @@ -198,6 +205,13 @@

    Examp #> 1 1 4 7 4 #> 2 2 5 8 5 #> 3 3 6 9 6

    +d %>% + row_max(matches('x|y'))
    #> # A tibble: 3 x 3 +#> x y max +#> <int> <int> <int> +#> 1 1 4 4 +#> 2 2 5 5 +#> 3 3 6 6
    row_apply( d , everything(), @@ -207,17 +221,12 @@

    Examp #> 1 1 4 7 147 #> 2 2 5 8 258 #> 3 3 6 9 369

    -
    + - @@ -228,7 +237,7 @@

    Contents

    -

    Site built with pkgdown 1.4.1.

    +

    Site built with pkgdown 1.5.1.

    diff --git a/docs/reference/select_not.html b/docs/reference/select_not.html index f53da84..ce8c952 100644 --- a/docs/reference/select_not.html +++ b/docs/reference/select_not.html @@ -17,23 +17,27 @@ - + - + - + + + + + - - + + - + - - + + @@ -47,7 +51,6 @@ - @@ -65,7 +68,7 @@ - +
    @@ -170,27 +173,27 @@

    Details get `invalid argument` errors. I needed this functionality because an operation was to be performed on some columns, but not if some operations had been performed on others, and those others were passed as unquoted - variables via vars(). See pre_process.

    + variables via vars(). See pre_process.

    In the end, it was far easier to write a two line function than figure out the tidy eval approach that would have worked with less code. It literally just does:

    not_these = names(select(data, ...))

    -

    data %>%
    -  select(-one_of(not_these)) 
    +

    data %&gt;%
    +  select(-one_of(not_these))

    Lionel's answer on SO shows how one could do it. He is a developer for RStudio, so one can assume it's a fairly spot on suggestion. Yet even that would need a bit of modification to work with a - vars(), which I needed for multiple column entries to a single + vars(), which I needed for multiple column entries to a single argument. Here is what ultimately would work with tidyeval using that approach.

    -

    x = vars_input %>%
    -      map(function(sym) call("-", sym)) 
    +

    x = vars_input %&gt;%
    +      map(function(sym) call("-", sym))
    -

    data %>%
    -  select(!!!x) 
    +

    data %&gt;%
    +  select(!!!x)

    You then would need an 'x' for every argument's input. And no one reading the @@ -202,14 +205,10 @@

    Details is essentially the same solution.

    - @@ -220,7 +219,7 @@

    Contents

    -

    Site built with pkgdown 1.4.1.

    +

    Site built with pkgdown 1.5.1.

    diff --git a/docs/reference/spread2.html b/docs/reference/spread2.html index df56190..a928683 100644 --- a/docs/reference/spread2.html +++ b/docs/reference/spread2.html @@ -17,23 +17,27 @@ - + - + - + + + + + - - + + - + - - + + @@ -45,9 +49,9 @@ - + - @@ -65,7 +69,7 @@ - +
    @@ -136,7 +140,8 @@

    Spread without frustration

    -

    Spread without frustration

    +

    #' @description This is now deprecated and no longer needed due to +advancements in `tidyr`.

    spread2(
    @@ -211,7 +216,8 @@ 

    See a

    Examples

    -
    library(tidyext); library(tidyr) +
    if (FALSE) { +library(tidyext); library(tidyr) # initial example from spread stocks_init <- data.frame( @@ -230,114 +236,22 @@

    Examp stocksm_init <- stocks_init %>% gather(stock, price, -time) -stocksm_init %>% spread(stock, price) # works fine

    #> time X Y Z -#> 1 2009-01-01 1.2240818 -2.1356474 1.7058569 -#> 2 2009-01-02 0.3598138 -0.4359498 -1.1802859 -#> 3 2009-01-03 0.4007715 -2.0520089 3.5805026 -#> 4 2009-01-04 0.1106827 -1.4577825 3.5125340 -#> 5 2009-01-05 -0.5558411 -1.2500785 3.2863243 -#> 6 2009-01-06 1.7869131 -3.3733866 2.7545610 -#> 7 2009-01-07 0.4978505 1.6755741 2.2156706 -#> 8 2009-01-08 -1.9666172 0.3067462 -0.2476468 -#> 9 2009-01-09 0.7013559 -2.2762739 -1.2238507 -#> 10 2009-01-10 -0.4727914 2.5076298 -1.5218840
    stocksm <- stocks %>% gather(stock, price) +stocksm_init %>% spread(stock, price) # works fine +stocksm <- stocks %>% gather(stock, price) # stocksm %>% spread(stock, price) # annoying -stocksm %>% spread2(stock, price) # works fine
    #> X Y Z -#> 1 -0.69470698 0.50663703 1.5185579 -#> 2 -0.20791728 -0.05709351 -2.0092938 -#> 3 -1.26539635 -0.08574091 -1.3328295 -#> 4 2.16895597 2.73720457 -4.0743015 -#> 5 1.20796200 -0.45154197 -4.2871649 -#> 6 -1.12310858 3.03294121 1.2141146 -#> 7 -0.40288484 -3.09750561 1.7928391 -#> 8 -0.46665535 1.16922750 0.2120169 -#> 9 0.77996512 0.24770849 3.6890699 -#> 10 -0.08336907 0.43188314 8.2003387
    stocksm %>% spread2(stock, price, compact = FALSE)
    #> X Y Z -#> 1 -0.69470698 NA NA -#> 2 -0.20791728 NA NA -#> 3 -1.26539635 NA NA -#> 4 2.16895597 NA NA -#> 5 1.20796200 NA NA -#> 6 -1.12310858 NA NA -#> 7 -0.40288484 NA NA -#> 8 -0.46665535 NA NA -#> 9 0.77996512 NA NA -#> 10 -0.08336907 NA NA -#> 11 NA 0.50663703 NA -#> 12 NA -0.05709351 NA -#> 13 NA -0.08574091 NA -#> 14 NA 2.73720457 NA -#> 15 NA -0.45154197 NA -#> 16 NA 3.03294121 NA -#> 17 NA -3.09750561 NA -#> 18 NA 1.16922750 NA -#> 19 NA 0.24770849 NA -#> 20 NA 0.43188314 NA -#> 21 NA NA 1.5185579 -#> 22 NA NA -2.0092938 -#> 23 NA NA -1.3328295 -#> 24 NA NA -4.0743015 -#> 25 NA NA -4.2871649 -#> 26 NA NA 1.2141146 -#> 27 NA NA 1.7928391 -#> 28 NA NA 0.2120169 -#> 29 NA NA 3.6890699 -#> 30 NA NA 8.2003387
    +stocksm %>% spread2(stock, price) # works fine +stocksm %>% spread2(stock, price, compact = FALSE) + # works with NA stocksm$price[sample(1:nrow(stocksm), 5)] = NA -stocksm %>% spread2(stock, price)
    #> X Y Z -#> 1 -0.69470698 0.50663703 1.518558 -#> 2 -0.20791728 NA -2.009294 -#> 3 -1.26539635 -0.08574091 -1.332830 -#> 4 2.16895597 NA -4.074302 -#> 5 1.20796200 -0.45154197 -4.287165 -#> 6 -1.12310858 3.03294121 1.214115 -#> 7 -0.40288484 NA 1.792839 -#> 8 -0.46665535 1.16922750 NA -#> 9 0.77996512 0.24770849 3.689070 -#> 10 -0.08336907 NA 8.200339
    stocksm %>% spread2(stock, price, compact = FALSE)
    #> X Y Z -#> 1 -0.69470698 NA NA -#> 2 -0.20791728 NA NA -#> 3 -1.26539635 NA NA -#> 4 2.16895597 NA NA -#> 5 1.20796200 NA NA -#> 6 -1.12310858 NA NA -#> 7 -0.40288484 NA NA -#> 8 -0.46665535 NA NA -#> 9 0.77996512 NA NA -#> 10 -0.08336907 NA NA -#> 11 NA 0.50663703 NA -#> 12 NA NA NA -#> 13 NA -0.08574091 NA -#> 14 NA NA NA -#> 15 NA -0.45154197 NA -#> 16 NA 3.03294121 NA -#> 17 NA NA NA -#> 18 NA 1.16922750 NA -#> 19 NA 0.24770849 NA -#> 20 NA NA NA -#> 21 NA NA 1.518558 -#> 22 NA NA -2.009294 -#> 23 NA NA -1.332830 -#> 24 NA NA -4.074302 -#> 25 NA NA -4.287165 -#> 26 NA NA 1.214115 -#> 27 NA NA 1.792839 -#> 28 NA NA NA -#> 29 NA NA 3.689070 -#> 30 NA NA 8.200339
    -
    +stocksm %>% spread2(stock, price) +stocksm %>% spread2(stock, price, compact = FALSE) +}
    - @@ -348,7 +262,7 @@

    Contents

    -

    Site built with pkgdown 1.4.1.

    +

    Site built with pkgdown 1.5.1.

    diff --git a/docs/reference/sum_NA.html b/docs/reference/sum_NA.html index b75075f..1769534 100644 --- a/docs/reference/sum_NA.html +++ b/docs/reference/sum_NA.html @@ -17,23 +17,27 @@ - + - + - + + + + + - - + + - + - - + + @@ -47,7 +51,6 @@ - @@ -65,7 +68,7 @@ - +
    @@ -174,15 +177,10 @@

    Examp

    - @@ -193,7 +191,7 @@

    Contents

    -

    Site built with pkgdown 1.4.1.

    +

    Site built with pkgdown 1.5.1.

    diff --git a/docs/reference/tidyext.html b/docs/reference/tidyext.html index 8cc3b9f..796b5dc 100644 --- a/docs/reference/tidyext.html +++ b/docs/reference/tidyext.html @@ -17,23 +17,27 @@ - + - + - + + + + + - - + + - + - - + + @@ -49,7 +53,6 @@ to do with data. I work with data for myself and others everyday, and use these functions quite often. The goal here is to not have any dependencies beyond the tidyverse." /> - @@ -67,7 +70,7 @@ - +
    @@ -151,12 +154,10 @@

    Details get created some day: browseVignettes(package = "tidyext")

    - @@ -167,7 +168,7 @@

    Contents

    -

    Site built with pkgdown 1.4.1.

    +

    Site built with pkgdown 1.5.1.

    diff --git a/man/combn_2_col.Rd b/man/combn_2_col.Rd index 47b12a6..7f9f4a5 100644 --- a/man/combn_2_col.Rd +++ b/man/combn_2_col.Rd @@ -72,64 +72,25 @@ This comes up every once in a while. Someone has for whatever } \examples{ library(tidyext) -d = data.frame(id = 1:4, - labs = c('A/B', 'B/C/D/E', 'A/E', 'D/E')) -test = combn_2_col(data=d, var='labs', max_m=3) + +d = data.frame(id = 1:4, labs = c('A/B', 'B/C/D/E', 'A/E', 'D/E')) +test = combn_2_col(data = d, var = 'labs', max_m = 3) test str(test) + d$labs = c('A B', 'B C D E', 'A E', 'D E') -combn_2_col(data=d, var='labs', max_m=1) +combn_2_col(data = d, var = 'labs', max_m = 1) + d$labs = c('Tom, Dick & Harriet', "J'Sean", "OBG, Andreas", NA) -combn_2_col(data=d, var='labs', sep=',', max_m=2, collapse='-') - -\dontrun{ -# requires at least tidytext -tidy_dtm <- function(data, var, sep='-', max_m=3) { - init = stringr::str_split(data[[var]], pattern = sep) # creates a list of separated letters - - # the following gets the combos with a dot separating drugs in a given combo - # this first lapply could be parallelized if need be and is probably slowest - # probably want to change to m = min(c(4, m)) so as to only limit to 4 - # see also, combinat::combn which is slightly faster than base R below - observation_combos = init \%>\% - lapply(function(x) - sapply(seq_along(x), function(m) - utils::combn(x, min(max_m, m), FUN=paste, collapse = '_'))) - - # now we have a standard text analysis problem in need of a document term - matrix - documents = observation_combos \%>\% lapply(unlist) - - # create a 'tidy' form of documents and terms; each term (i.e. 
combo) only - occurs once in a document - doc_df = data.frame(id=rep(data$id, sapply(documents, length)), - combos=unlist(documents), - count=1) # each term only occurs once in the document - doc_df \%>\% - tidytext::cast_dfm(document=id, term=combos, value=count) - } - -# requires at least text2vec -ttv <- function(data, var, sep='-', max_m=3) { - docs = sapply(stringr::str_split(data[[var]], pattern=sep), - function(str_vec) - sapply(seq_along(str_vec), - function(m) - combn(str_vec, - m = min(max_m, m), - FUN = paste, - collapse = '_') - ) \%>\% unlist() - ) - - toks = itoken(docs, progressbar = FALSE) - vocab = create_vocabulary(toks) - create_dtm(toks, vectorizer = vocab_vectorizer(vocab), progressbar = FALSE) \%>\% - as.matrix() \%>\% - cbind(data,.) -} -} +combn_2_col( +data = d, +var = 'labs', +sep = ',', +max_m = 2, +collapse = '-' +) + diff --git a/man/head_tail.Rd b/man/head_tail.Rd index 16f3aac..9b17456 100644 --- a/man/head_tail.Rd +++ b/man/head_tail.Rd @@ -23,7 +23,7 @@ This will work on matrices and also not be limited to default tibble display. 
\examples{ library(tidyext) -as.matrix(mtcars) \%>\% -head_tail(6) + +head_tail(mtcars) } diff --git a/man/pre_process.Rd b/man/pre_process.Rd index e04a6cc..ebb53d3 100644 --- a/man/pre_process.Rd +++ b/man/pre_process.Rd @@ -60,8 +60,11 @@ At a minimum, by default, this function will standardize } \examples{ library(tidyext) +library(dplyr) + pre_process(mtcars) -pre_process(mtcars, log_vars=vars(mpg, wt)) -pre_process(mtcars, zero_start=vars(cyl, gear)) -pre_process(mtcars, zero_one=vars(mpg)) +pre_process(mtcars, log_vars = vars(mpg, wt)) +pre_process(mtcars, zero_start = vars(cyl, gear)) +pre_process(mtcars, zero_one = vars(mpg)) + } diff --git a/man/reexports.Rd b/man/reexports.Rd deleted file mode 100644 index 8faef7d..0000000 --- a/man/reexports.Rd +++ /dev/null @@ -1,19 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/utils.R -\docType{import} -\name{reexports} -\alias{reexports} -\alias{\%>\%} -\alias{vars} -\title{Objects exported from other packages} -\keyword{internal} -\description{ -These objects are imported from other packages. Follow the links -below to see their documentation. - -\describe{ - \item{dplyr}{\code{\link[dplyr]{vars}}} - - \item{magrittr}{\code{\link[magrittr:pipe]{\%>\%}}} -}} - diff --git a/man/row_sums.Rd b/man/row_sums.Rd index 732b053..0eab0c1 100644 --- a/man/row_sums.Rd +++ b/man/row_sums.Rd @@ -3,6 +3,8 @@ \name{row_sums} \alias{row_sums} \alias{row_means} +\alias{row_min} +\alias{row_max} \alias{row_apply} \title{Apply simple rowwise functions} \usage{ @@ -10,6 +12,10 @@ row_sums(data, ..., na_rm = FALSE, varname = "sum") row_means(data, ..., na_rm = FALSE, varname = "mean") +row_min(data, ..., na_rm = FALSE, varname = "min") + +row_max(data, ..., na_rm = FALSE, varname = "max") + row_apply(data, ..., .fun, varname = "var") } \arguments{ @@ -19,7 +25,7 @@ row_apply(data, ..., .fun, varname = "var") \item{na_rm}{Whether to remove \code{NA} values or not. 
Default is \code{FALSE}.} -\item{varname}{The column name of the sums means etc.} +\item{varname}{The column name of the sums means etc. as a character string.} \item{.fun}{The function to apply.} } @@ -48,6 +54,9 @@ d \%>\% d \%>\% row_means(matches('x|z')) +d \%>\% + row_max(matches('x|y')) + row_apply( d , everything(), diff --git a/tests/testthat/test_cat_by.R b/tests/testthat/test_cat_by.R index 2dafa3b..db8de68 100644 --- a/tests/testthat/test_cat_by.R +++ b/tests/testthat/test_cat_by.R @@ -1,7 +1,7 @@ context('test cat_by') set.seed(1) -df1 <- tibble( +df1 <- dplyr::tibble( g1 = factor(sample(1:2, 50, replace = TRUE), labels=c('a','b')), g2 = sample(1:4, 50, replace = TRUE), a = rnorm(50), diff --git a/tests/testthat/test_combn_2_col.R b/tests/testthat/test_combn_2_col.R index ab555d6..1d66cbd 100644 --- a/tests/testthat/test_combn_2_col.R +++ b/tests/testthat/test_combn_2_col.R @@ -22,7 +22,7 @@ test_that('combn_2_col will fail with no data', { }) test_that('combn_2_col handles factors, NAs, other separators', { - init <- tibble(id = 1:5, + init <- dplyr::tibble(id = 1:5, labs = factor(c('AB', 'B/C/D/E', 'A/E', 'D/E', NA))) expect_s3_class(combn_2_col(data=d, var='labs', sep='/', max_m=3), 'data.frame') diff --git a/tests/testthat/test_describe_all.R b/tests/testthat/test_describe_all.R index 2cb55ff..51947ee 100644 --- a/tests/testthat/test_describe_all.R +++ b/tests/testthat/test_describe_all.R @@ -2,7 +2,7 @@ context('test describe_all') set.seed(1234) -df1 <- tibble( +df1 <- dplyr::tibble( g1 = factor(sample(1:2, 50, replace = TRUE), labels=c('a','b')), g2 = sample(1:4, 50, replace = TRUE), g3 = ordered(sample(letters[1:3], 50, replace = TRUE)), diff --git a/tests/testthat/test_num_by.R b/tests/testthat/test_num_by.R index d79afd2..17df04d 100644 --- a/tests/testthat/test_num_by.R +++ b/tests/testthat/test_num_by.R @@ -1,6 +1,6 @@ context('test num_by') -df1 <- tibble( +df1 <- dplyr::tibble( g1 = factor(sample(1:2, 50, replace = TRUE), labels = c('a', 
'b')), g2 = sample(1:4, 50, replace = TRUE), a = rnorm(50), diff --git a/tests/testthat/test_num_summary.R b/tests/testthat/test_num_summary.R index 69fe49f..df05e52 100644 --- a/tests/testthat/test_num_summary.R +++ b/tests/testthat/test_num_summary.R @@ -1,6 +1,6 @@ context('test num_summary') -df1 <- tibble( +df1 <- dplyr::tibble( g1 = factor(sample(1:2, 50, replace = TRUE), labels = c('a', 'b')), g2 = sample(1:4, 50, replace = TRUE), a = rnorm(50), diff --git a/tests/testthat/test_pre_process.R b/tests/testthat/test_pre_process.R index 453fe1b..68fd23f 100644 --- a/tests/testthat/test_pre_process.R +++ b/tests/testthat/test_pre_process.R @@ -2,7 +2,8 @@ context('test pre_process') set.seed(1234) -df1 <- tibble( + +df1 <- dplyr::tibble( g1 = factor(sample(1:2, 50, replace = TRUE), labels=c('a','b')), g2 = sample(1:4, 50, replace = TRUE), a = rnorm(50), diff --git a/tests/testthat/test_row_sums.R b/tests/testthat/test_row_sums.R index 2b860de..c08f4d8 100644 --- a/tests/testthat/test_row_sums.R +++ b/tests/testthat/test_row_sums.R @@ -52,25 +52,49 @@ test_that('row_means returns a data frame', { }) test_that('row_means returns correct values', { - expect_equal(d %>% row_means(x:z) %>% pull(mean), c(4, 5, 6)) + expect_equal(d %>% row_means(x:z) %>% pull(mean), 4:6) }) test_that('row_means handles NA', { - expect_equal(d %>% row_means(x:z, na_rm = TRUE) %>% pull(mean), c(4, 5, 6)) + expect_equal(d %>% row_means(x:q, na_rm = TRUE) %>% pull(mean), 4:6) }) test_that('row_means takes varname', { - expect_equal(d %>% row_means(x:z, varname = 'blah') %>% pull(blah), - c(4, 5, 6)) + expect_equal(d %>% row_means(x:z, varname = 'blah') %>% pull(blah), 4:6) }) test_that('row_means handles select helpers', { - expect_equal(d %>% row_means(dplyr::matches('x|z')) %>% pull(mean), - c(4, 5, 6)) + expect_equal(d %>% row_means(dplyr::matches('x|z')) %>% pull(mean), 4:6) }) +# row_min_max ------------------------------------------------------------- + +test_that('row_max returns a 
data frame', { + expect_s3_class(d %>% row_max(x:z), 'data.frame') + expect_s3_class(d %>% row_min(x:z), 'data.frame') +}) + +test_that('row_max returns correct values', { + expect_equal(d %>% row_max(x:z) %>% pull(max), 7:9) + expect_equal(d %>% row_min(x:z) %>% pull(min), 1:3) +}) + +test_that('row_max handles NA', { + expect_equal(d %>% row_max(x:q, na_rm = TRUE) %>% pull(max), 7:9) + expect_equal(d %>% row_min(x:q, na_rm = TRUE) %>% pull(min), 1:3) +}) + +test_that('row_max takes varname', { + expect_equal(d %>% row_max(x:z, varname = 'blah') %>% pull(blah), 7:9) + expect_equal(d %>% row_min(x:z, varname = 'blah') %>% pull(blah), 1:3) +}) + +test_that('row_max handles select helpers', { + expect_equal(d %>% row_max(dplyr::matches('x|z')) %>% pull(max), 7:9) + expect_equal(d %>% row_min(dplyr::matches('x|z')) %>% pull(min), 1:3) +}) # row_apply --------------------------------------------------------------- @@ -83,7 +107,7 @@ test_that('row_apply returns a data frame', { }) test_that('row_apply returns correct values', { - expect_equal(d %>% row_apply(x:z, .fun = fun_med) %>% pull(var), c(4, 5, 6)) + expect_equal(d %>% row_apply(x:z, .fun = fun_med) %>% pull(var), 4:6) }) test_that('row_apply returns correct values', { @@ -95,7 +119,7 @@ test_that('row_means takes varname', { expect_equal(d %>% row_apply(x:z, .fun = fun_med, varname = 'blah') %>% pull(blah), - c(4, 5, 6)) + 4:6) }) @@ -103,5 +127,5 @@ test_that('row_means handles select helpers', { expect_equal(d %>% row_apply(dplyr::matches('x|z'), .fun = fun_med) %>% pull(var), - c(4, 5, 6)) + 4:6) }) diff --git a/tests/testthat/test_sum_NA.R b/tests/testthat/test_sum_NA.R index e6d07d7..6ae5c0b 100644 --- a/tests/testthat/test_sum_NA.R +++ b/tests/testthat/test_sum_NA.R @@ -1,6 +1,6 @@ context('test sum_na') -df1 <- tibble( +df1 <- dplyr::tibble( g1 = factor(sample(1:2, 50, replace = TRUE), labels=c('a','b')), g2 = sample(1:4, 50, replace = TRUE), a = rnorm(50), diff --git a/vignettes/introduction.Rmd 
b/vignettes/introduction.Rmd index 2a815d4..ab84c53 100644 --- a/vignettes/introduction.Rmd +++ b/vignettes/introduction.Rmd @@ -31,15 +31,18 @@ knitr::opts_chunk$set(echo = T, message=F, warning=F, error=F, collapse = TRUE, To begin, we can load up the tidyverse and this package. I'll also create some data that will be useful for demonstration. ```{r packages} -library(tidyverse); library(tidyext) +library(tidyverse) +library(tidyext) + set.seed(8675309) + df1 <- tibble( - g1 = factor(sample(1:2, 50, replace = TRUE), labels=c('a','b')), + g1 = factor(sample(1:2, 50, replace = TRUE), labels = c('a', 'b')), g2 = sample(1:4, 50, replace = TRUE), a = rnorm(50), b = rpois(50, 10), - c = sample(letters, 50, replace=TRUE), - d = sample(c(T,F), 50, replace=TRUE) + c = sample(letters, 50, replace = TRUE), + d = sample(c(T, F), 50, replace = TRUE) ) df_miss = df1 @@ -189,88 +192,30 @@ Note that center/standardizing is done to any numeric variables *not* chosen for Here's a specific function you will probably never need, but will be glad to have if you do. Some data columns have multiple entries for each observation/cell. While it's understandable why someone would do this, it's not very good practice. This will split out the entries, or any particular combination of them, into their own indicator column. ```{r combn, message=TRUE} -d = data.frame(id = 1:4, - labs = c('A-B', 'B-C-D-E', 'A-E', 'D-E')) -combn_2_col(data=d, var='labs', max_m=2, sep = '-', collapse = ':', toInteger = T) - -combn_2_col(data=d, var='labs', max_m=2, sparse = T) -``` - -In addition, there is a function that makes tidyr's spread work like it should when you don't have unique identifiers. I spent a lot of time coming up with the name for this one. 
- -```{r spread2, error=TRUE} -# initial example from spread -stocks_init <- data.frame( - time = as.Date('2009-01-01') + 0:9, - X = rnorm(10, 0, 1), - Y = rnorm(10, 0, 2), - Z = rnorm(10, 0, 4) +d = data.frame(id = 1:4, labs = c('A-B', 'B-C-D-E', 'A-E', 'D-E')) + +combn_2_col( + data = d, + var = 'labs', + max_m = 2, + sep = '-', + collapse = ':', + toInteger = T ) -# a very common situation -stocks <- data.frame( - X = rnorm(10, 0, 1), - Y = rnorm(10, 0, 2), - Z = rnorm(10, 0, 4) +combn_2_col( + data = d, + var = 'labs', + max_m = 2, + sparse = T ) - - -stocksm_init <- stocks_init %>% - gather(stock, price, -time) -head(stocksm_init) - -stocksm_init %>% - spread(stock, price) %>% - head() # works fine - -# no time -stocksm <- stocks %>% - gather(stock, price) -head(stocksm) - -stocksm %>% - spread(stock, price) # annoying - -stocksm %>% - spread2(stock, price) # works fine -``` - -However, with unbalanced data the result can be interpreted in different ways, so you'll have a choice to make. - -```{r spread2compact} -stocksm$price[sample(1:nrow(stocksm), 5)] = NA -stocksm %>% - spread2(stock, price) - -stocksm %>% - spread2(stock, price, compact = FALSE) ``` -I can speak from experience that having longitudinal data in the `compact = FALSE` format is about the worst way you could keep such data. - -Use gather_multi to extend `tidyr::gather` to deal with multiple sets of variables. - -```{r gather_multi} -demo_data_wide = data.frame(id = 1:10, - X = matrix(rnorm(40), ncol = 4), - Y = matrix(sample(0:1, 40, replace = T), ncol = 4), - Z = matrix(rpois(40, 5), ncol = 4)) -head(demo_data_wide) - -gather_multi(demo_data_wide, - key = wave, - values = vars(X, Y, Z), - varlist = vars(starts_with('X'), - starts_with('Y'), - starts_with('Z')), - -id) -``` -Note that this only makes sense with balanced data, though it will work if NAs are present. Also, you may want to change the key in some fashion. 
Oftentimes I need to create a column that represents the total scores or means of just a few columns. This is a slight annoyance in the tidyverse, and there isn't much support behind the dplyr:rowwise function. As such, tidyext has a couple simple wrappers for row_sums, row_means, and row_apply. -```{r} +```{r rowapply} d = data.frame(x = 1:3, y = 4:6, z = 7:9,