diff --git a/R/duplicates.R b/R/duplicates.R index c517b8d..9cb7355 100644 --- a/R/duplicates.R +++ b/R/duplicates.R @@ -1,16 +1,24 @@ -#' Find duplicate rows in a data frame +#' Keep non-unique rows in a data frame #' -#' Finds duplicate rows within a data frame. Column names can optionally be -#' provided. If column names aren't provided, the entire data frame (all -#' columns) will be searched for duplicate rows. If column names are provided, -#' only the specified columns will be searched for duplicate rows. +#' Keeps only non-unique rows within a data frame. #' #' @param .data A data.frame. -#' @param ... Optional column names to use when searching for duplicate rows in specific columns. -#' @return A tibble containing only duplicate rows. +#' @param ... Optional variables to use when determining non-uniqueness. +#' If omitted, will use all variables in the data frame. +#' @param .keep_all A flag specifying whether to keep all variables in .data. +#' @return The original data frame with only non-unique rows. #' @export -duplicates <- function(.data, ...) { +#' @examples +#' data <- tibble::tibble(x = c(1,2,1,1), y = c(1,1,1,5)) +#' +#' duplicates(data) +#' duplicates(data, x) +#' duplicates(data, y) +#' duplicates(data, x, y) +#' duplicates(data, y, .keep_all = FALSE) +duplicates <- function(.data, ..., .keep_all = TRUE) { check_data(.data) + chk_flag(.keep_all) col <- rlang::ensyms(...) if (length(col) == 0) { @@ -27,12 +35,12 @@ duplicates <- function(.data, ...) { return(.data) } .data_dup <- dplyr::select(.data, dplyr::all_of(col_names)) - print(.data_dup) .data_dup <- .data_dup[duplicated(.data_dup), , drop = FALSE] - print(.data_dup) .data_dup <- unique(.data_dup) - print(.data_dup) - .data <- merge(.data, .data_dup, by = col_names) + .data <- dplyr::inner_join(.data, .data_dup, by = col_names) + if(!(.keep_all)) { + .data <- dplyr::select(.data, dplyr::all_of(col_names)) + } .data <- dplyr::as_tibble(.data) .data } diff --git a/man/duplicates.Rd b/man/duplicates.Rd index 77896c1..87a8022 100644 --- a/man/duplicates.Rd +++ b/man/duplicates.Rd @@ -2,21 +2,30 @@ % Please edit documentation in R/duplicates.R \name{duplicates} \alias{duplicates} -\title{Find duplicate rows in a data frame} +\title{Keep non-unique rows in a data frame} \usage{ -duplicates(.data, ...) +duplicates(.data, ..., .keep_all = TRUE) } \arguments{ \item{.data}{A data.frame.} -\item{...}{Optional column names to use when searching for duplicate rows in specific columns.} +\item{...}{Optional variables to use when determining non-uniqueness. +If omitted, will use all variables in the data frame.} + +\item{.keep_all}{A flag specifying whether to keep all variables in .data.} } \value{ -A tibble containing only duplicate rows. +The original data frame with only non-unique rows. } \description{ -Finds duplicate rows within a data frame. Column names can optionally be -provided. If column names aren't provided, the entire data frame (all -columns) will be searched for duplicate rows. If column names are provided, -only the specified columns will be searched for duplicate rows. +Keeps only non-unique rows within a data frame. +} +\examples{ +data <- tibble::tibble(x = c(1,2,1,1), y = c(1,1,1,5)) + +duplicates(data) +duplicates(data, x) +duplicates(data, y) +duplicates(data, x, y) +duplicates(data, y, .keep_all = FALSE) } diff --git a/tests/testthat/test-duplicates.R b/tests/testthat/test-duplicates.R index 1a2222b..e038d51 100644 --- a/tests/testthat/test-duplicates.R +++ b/tests/testthat/test-duplicates.R @@ -3,23 +3,89 @@ test_that("returns only duplicated rows of selected columns", { expect_identical(duplicates(tib), tib[c(1, 3), ]) expect_identical(duplicates(as.data.frame(tib)), tib[c(1, 3), ]) - expect_equal( + expect_identical( duplicates(data.frame(x = c(1, 2, 1), y = 1:3), x), - dplyr::tibble(x = c(1, 1), y = c(1, 3)) + dplyr::tibble(x = c(1, 1), y = c(1L, 3L)) ) - expect_equal( + expect_identical( duplicates(data.frame(x = c(1, 2, 1), y = 1:3), x, y), - dplyr::tibble(x = double(0), y = double(0)) + dplyr::tibble(x = double(0), y = integer(0)) ) }) -test_that("errors when no input argument is supplied", { - expect_error( - duplicates(), - 'argument ".data" is missing, with no default', - fixed = TRUE +test_that("keep_all working", { + data <- tibble::tibble(x = c(1,2,1,1), y = c(1,1,1,5)) + expect_identical(duplicates(data, y), + tibble::tibble(x = c(1,2,1), y = c(1,1,1))) + expect_identical(duplicates(data, x, y), + tibble::tibble(x = c(1,1), y = c(1,1))) + expect_identical(duplicates(data, y, x), + tibble::tibble(x = c(1,1), y = c(1,1))) + expect_identical(duplicates(data), + tibble::tibble(x = c(1,1), y = c(1,1))) + expect_identical(duplicates(data, y, .keep_all = FALSE), + tibble::tibble(y = c(1,1,1))) +}) + + +test_that("handles data frame with no rows", { + data <- dplyr::tibble(x = integer(), y = integer()) + + expect_equal( + duplicates(data), + data ) + + expect_equal( + duplicates(data, x), + data + ) + + expect_equal( + duplicates(data, x, .keep_all = FALSE), + dplyr::tibble(x = integer()) + ) +}) + +test_that("handles data frame with no columns", { + data <- dplyr::tibble() + + expect_identical( + duplicates(data), + data + ) + + expect_identical( + duplicates(data, .keep_all = FALSE), + data + ) +}) + +test_that("handles columns with missing values", { + data <- tibble::tibble(x = c(1,2,NA,1,1), y = c(1,1,NA,NA,NA)) + + expect_identical( + duplicates(data), + tibble::tibble(x = c(1, 1), y = as.double(c(NA, NA))) + ) + + expect_identical( + duplicates(data, y), + data + ) + + expect_identical( + duplicates(data, y, .keep_all = FALSE), + tibble::tibble(y = c(1, 1, NA, NA, NA)) + ) +}) + +test_that("handles data set with no duplicates", { + data <- tibble::tibble(x = c(1,2,NA), z = 1:3) + expect_identical(duplicates(data), tibble::tibble(x = double(), z = integer())) + expect_identical(duplicates(data, x), tibble::tibble(x = double(), z = integer())) + expect_identical(duplicates(data, x, .keep_all = FALSE), tibble::tibble(x = double())) }) test_that("errors when input argument is not a data.frame", {