Skip to content

Commit

Permalink
add row_min, row_max, minor cleanup elsewhere
Browse files Browse the repository at this point in the history
  • Loading branch information
m-clark committed Jul 13, 2020
1 parent 4ad02fe commit 87df6da
Show file tree
Hide file tree
Showing 58 changed files with 1,617 additions and 1,577 deletions.
5 changes: 2 additions & 3 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: tidyext
Type: Package
Title: Tidy Extensions for Data Processing
Version: 0.3.5
Version: 0.3.6
Authors@R: person("Michael", "Clark", role = c("aut", "cre"), email = "micl@umich.edu")
Maintainer: Michael Clark <micl@umich.edu>
Description: Common data processing and summary functions to extend your tidy ways.
Expand All @@ -16,10 +16,8 @@ Depends:
Imports:
dplyr (>= 1.0.0),
purrr,
magrittr,
rlang,
scales,
tibble,
tidyr (>= 1.0.0)
Suggests:
ggplot2,
Expand All @@ -28,5 +26,6 @@ Suggests:
stringi,
stringr,
testthat,
tibble,
covr
RoxygenNote: 7.1.1
7 changes: 2 additions & 5 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
# Generated by roxygen2: do not edit by hand

export("%>%")
export(cat_by)
export(combn_2_col)
export(create_prediction_data)
Expand All @@ -15,17 +14,16 @@ export(onehot)
export(pre_process)
export(rnd)
export(row_apply)
export(row_max)
export(row_means)
export(row_min)
export(row_sums)
export(spread2)
export(sum_NA)
export(sum_NaN)
export(sum_blank)
export(vars)
import(dplyr)
importFrom(dplyr,quo_name)
importFrom(dplyr,vars)
importFrom(magrittr,"%>%")
importFrom(purrr,map)
importFrom(purrr,map_df)
importFrom(purrr,map_dfr)
Expand All @@ -39,7 +37,6 @@ importFrom(stats,model.matrix)
importFrom(stats,na.omit)
importFrom(stats,quantile)
importFrom(stats,sd)
importFrom(tibble,rowid_to_column)
importFrom(tidyr,gather)
importFrom(tidyr,nest)
importFrom(tidyr,unnest)
Expand Down
6 changes: 5 additions & 1 deletion NEWS.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
# tidyext 0.3.6

Added `row_min` and `row_max`. Removed some dependencies.

# tidyext 0.3.5

Cleanup and update for R 4.0 and dplyr 1.0. Deprecate gather_multi and spread2.
Cleanup and update for R 4.0 and dplyr 1.0. Deprecate `gather_multi` and `spread2`.

# tidyext 0.3.1

Expand Down
108 changes: 37 additions & 71 deletions R/combn_2_col.R
Original file line number Diff line number Diff line change
Expand Up @@ -47,92 +47,57 @@
#' only the indicator columns.
#' @examples
#' library(tidyext)
#' d = data.frame(id = 1:4,
#' labs = c('A/B', 'B/C/D/E', 'A/E', 'D/E'))
#' test = combn_2_col(data=d, var='labs', max_m=3)
#'
#' d = data.frame(id = 1:4, labs = c('A/B', 'B/C/D/E', 'A/E', 'D/E'))
#' test = combn_2_col(data = d, var = 'labs', max_m = 3)
#' test
#' str(test)
#'
#' d$labs = c('A B', 'B C D E', 'A E', 'D E')
#' combn_2_col(data=d, var='labs', max_m=1)
#' combn_2_col(data = d, var = 'labs', max_m = 1)
#'
#' d$labs = c('Tom, Dick & Harriet', "J'Sean", "OBG, Andreas", NA)
#' combn_2_col(data=d, var='labs', sep=',', max_m=2, collapse='-')
#'
#' \dontrun{
#' # requires at least tidytext
#' tidy_dtm <- function(data, var, sep='-', max_m=3) {
#' init = stringr::str_split(data[[var]], pattern = sep) # creates a list of separated letters
#'
#' # the following gets the combos with a dot separating drugs in a given combo
#' # this first lapply could be parallelized if need be and is probably slowest
#' # probably want to change to m = min(c(4, m)) so as to only limit to 4
#' # see also, combinat::combn which is slightly faster than base R below
#' observation_combos = init %>%
#' lapply(function(x)
#' sapply(seq_along(x), function(m)
#' utils::combn(x, min(max_m, m), FUN=paste, collapse = '_')))
#'
#' # now we have a standard text analysis problem in need of a document term
#' matrix
#' documents = observation_combos %>% lapply(unlist)
#'
#' # create a 'tidy' form of documents and terms; each term (i.e. combo) only
#' occurs once in a document
#' doc_df = data.frame(id=rep(data$id, sapply(documents, length)),
#' combos=unlist(documents),
#' count=1) # each term only occurs once in the document
#' doc_df %>%
#' tidytext::cast_dfm(document=id, term=combos, value=count)
#' }
#'
#' # requires at least text2vec
#' ttv <- function(data, var, sep='-', max_m=3) {
#' docs = sapply(stringr::str_split(data[[var]], pattern=sep),
#' function(str_vec)
#' sapply(seq_along(str_vec),
#' function(m)
#' combn(str_vec,
#' m = min(max_m, m),
#' FUN = paste,
#' collapse = '_')
#' ) %>% unlist()
#' )
#'
#' toks = itoken(docs, progressbar = FALSE)
#' vocab = create_vocabulary(toks)
#' create_dtm(toks, vectorizer = vocab_vectorizer(vocab), progressbar = FALSE) %>%
#' as.matrix() %>%
#' cbind(data,.)
#' }
#'
#' }
#'
#' combn_2_col(
#' data = d,
#' var = 'labs',
#' sep = ',',
#' max_m = 2,
#' collapse = '-'
#' )
#'
#'
#'
#'
#' @export
combn_2_col <- function(data,
var,
sep='[^[:alnum:]]+',
max_m=1,
collapse = '_',
toInteger=FALSE,
sparse=FALSE) {
combn_2_col <- function(
data,
var,
sep = '[^[:alnum:]]+',
max_m = 1,
collapse = '_',
toInteger = FALSE,
sparse = FALSE
) {

if (is.null(data) | is.null(var))
stop('Need data and variable name to continue.')

if (max_m < 1) stop('Need positive value for max_m.')

data$combo <-
map(stringr::str_split(data[[var]], pattern=sep),
function(str_vec)
map(seq_along(str_vec),
function(m)
combn(str_vec,
m = min(max_m, m),
FUN = paste,
collapse = collapse)
) %>% unlist()
map(stringr::str_split(data[[var]], pattern = sep),
function(str_vec)
map(seq_along(str_vec),
function(m)
combn(str_vec,
m = min(max_m, m),
FUN = paste,
collapse = collapse)
) %>%
unlist()
)

combo_cols <- unique(unlist(data$combo))

if (sparse) {
Expand All @@ -142,7 +107,7 @@ combn_2_col <- function(data,
do.call(rbind,.) %>%
Matrix::Matrix(sparse = TRUE,
dimnames = list(rownames(data), combo_cols))
)
)
}

if (toInteger) {
Expand All @@ -156,6 +121,7 @@ combn_2_col <- function(data,
map(function(x) combo_cols %in% x) %>%
do.call(rbind,.)
}

data
}

4 changes: 2 additions & 2 deletions R/gather_multi.R
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ gather_multi <- function(data,
na.rm = na.rm,
convert = convert,
factor_key = factor_key) %>%
rowid_to_column()
mutate(rowid = 1:nrow(.)) %>% # changed to get rid of tibble requirement while deprecated

for (i in 2:length(varlist)) {
data_long <- data %>%
Expand All @@ -144,7 +144,7 @@ gather_multi <- function(data,
na.rm = na.rm,
convert = convert,
factor_key = factor_key) %>%
rowid_to_column()%>%
mutate(rowid = 1:nrow(.)) %>% # changed to get rid of tibble requirement while deprecated
select(rowid, !!values[[i]]) %>%
left_join(data_long, ., by='rowid')
}
Expand Down
2 changes: 1 addition & 1 deletion R/globals.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@ utils::globalVariables(c(".", 'Max', 'Mean', 'Median', 'Min', 'Missing', 'N',
'Q1', 'Q3', 'SD', 'Variable', 'X1st.Qu.', 'X3rd.Qu.',
'perc', 'result', 'results', 'target', 'value',
'x.Freq', 'x.x', 'y.Freq', 'y.x', '%', 'rowid',
'Group', 'Frequency'))
'Group', 'Frequency', 'rn'))
4 changes: 2 additions & 2 deletions R/head_tail.R
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@
#' @examples
#'
#' library(tidyext)
#' as.matrix(mtcars) %>%
#' head_tail(6)
#'
#' head_tail(mtcars)
#'
head_tail = function(data, n_slice = 6) {
# initial checks
Expand Down
2 changes: 1 addition & 1 deletion R/num_summary.R
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ num_summary <- function(x, digits = 1, extra = FALSE) {

x <- as.numeric(x)

d <- tibble(
d <- dplyr::tibble(
N = length(na.omit(x)),
data.frame(t(c(summary(x)))),
SD = sd(x, na.rm = TRUE),
Expand Down
11 changes: 7 additions & 4 deletions R/pre_process.R
Original file line number Diff line number Diff line change
Expand Up @@ -33,14 +33,17 @@
#' them.
#' @importFrom scales rescale
#' @return A data frame that has been processed
#' @export
#'
#' @examples
#' library(tidyext)
#' library(dplyr)
#'
#' pre_process(mtcars)
#' pre_process(mtcars, log_vars=vars(mpg, wt))
#' pre_process(mtcars, zero_start=vars(cyl, gear))
#' pre_process(mtcars, zero_one=vars(mpg))
#' pre_process(mtcars, log_vars = vars(mpg, wt))
#' pre_process(mtcars, zero_start = vars(cyl, gear))
#' pre_process(mtcars, zero_one = vars(mpg))
#'
#' @export
pre_process <- function(
data,
std = TRUE,
Expand Down
29 changes: 28 additions & 1 deletion R/row_sums.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
#' @param ... The columns to sum, take the mean of, etc. \emph{Required}.
#' @param .fun The function to apply.
#' @param na_rm Whether to remove \code{NA} values or not. Default is \code{FALSE}.
#' @param varname The column name of the sums means etc.
#' @param varname The column name of the sums means etc. as a character string.
#'
#' @details Simple wrappers for applying rowwise operations only for selected
#' columns within the tidyverse approach to data processing. The
Expand All @@ -28,6 +28,9 @@
#' d %>%
#' row_means(matches('x|z'))
#'
#' d %>%
#' row_max(matches('x|y'))
#'
#' row_apply(
#' d ,
#' everything(),
Expand All @@ -37,6 +40,8 @@
#'
#' @export
row_sums <- function(data, ..., na_rm = FALSE, varname = 'sum') {
# note: dplyr 1.0 included rowwise operations, but it wasn't obvious what
# advantage there would be for these functions except for min and max
dplyr::mutate(data, !!varname := rowSums(select(data, ...), na.rm = na_rm))
}

Expand All @@ -46,6 +51,28 @@ row_means <- function(data, ..., na_rm = FALSE, varname = 'mean') {
dplyr::mutate(data, !!varname := rowMeans(select(data, ...), na.rm = na_rm))
}

#' @export
#' @rdname row_sums
row_min <- function(data, ..., na_rm = FALSE, varname = 'min') {
dplyr::select(data, ...) %>%
dplyr::mutate(rn = 1:nrow(.)) %>%
dplyr::rowwise(rn) %>%
dplyr::mutate(!!varname := min(dplyr::c_across(...), na.rm = na_rm)) %>%
dplyr::ungroup() %>%
dplyr::select(-rn)
}

#' @export
#' @rdname row_sums
row_max <- function(data, ..., na_rm = FALSE, varname = 'max') {
dplyr::select(data, ...) %>%
dplyr::mutate(rn = 1:nrow(.)) %>%
dplyr::rowwise(rn) %>%
dplyr::mutate(!!varname := max(dplyr::c_across(...), na.rm = na_rm)) %>%
dplyr::ungroup() %>%
dplyr::select(-rn)
}

#' @export
#' @rdname row_sums
row_apply <- function(data, ..., .fun, varname = 'var') {
Expand Down
3 changes: 1 addition & 2 deletions R/spread2.R
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@
#'
#' @return A data frame with 'wide' format.
#' @seealso \code{\link[tidyr]{spread}}
#' @importFrom tibble rowid_to_column
#' @examples
#' \dontrun{
#' library(tidyext); library(tidyr)
Expand Down Expand Up @@ -90,7 +89,7 @@ spread2 <- function(data,
} else {
data <- data %>%
bind_cols(data %>%
tibble::rowid_to_column() %>%
mutate(rowid = 1:nrow(.)) %>% # changed to get rid of tibble requirement while deprecated
select(rowid)
)
}
Expand Down
10 changes: 1 addition & 9 deletions R/utils.R
Original file line number Diff line number Diff line change
@@ -1,9 +1 @@
#' @importFrom magrittr %>%
#' @export
magrittr::`%>%`



#' @importFrom dplyr vars
#' @export
dplyr::vars
#' @importFrom dplyr vars `%>%` tibble
4 changes: 3 additions & 1 deletion _pkgdown.yml
Original file line number Diff line number Diff line change
Expand Up @@ -40,14 +40,16 @@ reference:
- create_prediction_data
- gather_multi
- onehot
# - pre_process
- pre_process
# - spread2
- row_sums
- title: "Miscellaneous"
desc: >
Miscellaneous functions.
contents:
- select_not
- head_tail
- rnd
- tidyext
figures:
dev: svglite::svglite
Expand Down
Loading

0 comments on commit 87df6da

Please sign in to comment.