Skip to content

Commit

Permalink
Merge pull request #19 from poissonconsulting/duplicate2
Browse files Browse the repository at this point in the history
- Added `.keep_all` argument to `duplicates()`
  • Loading branch information
dunkenwg authored Aug 1, 2024
2 parents 7d65cb3 + 8f770fe commit 689a12d
Show file tree
Hide file tree
Showing 3 changed files with 112 additions and 29 deletions.
32 changes: 20 additions & 12 deletions R/duplicates.R
Original file line number Diff line number Diff line change
@@ -1,16 +1,24 @@
#' Find duplicate rows in a data frame
#' Keep non-unique rows in a data frame
#'
#' Finds duplicate rows within a data frame. Column names can optionally be
#' provided. If column names aren't provided, the entire data frame (all
#' columns) will be searched for duplicate rows. If column names are provided,
#' only the specified columns will be searched for duplicate rows.
#' Keeps only non-unique rows within a data frame.
#'
#' @param .data A data.frame.
#' @param ... Optional column names to use when searching for duplicate rows in specific columns.
#' @return A tibble containing only duplicate rows.
#' @param ... Optional variables to use when determining non-uniqueness.
#' If omitted, will use all variables in the data frame.
#' @param .keep_all A flag specifying whether to keep all variables in `.data`.
#'   If `FALSE`, only the variables used to determine non-uniqueness are kept.
#' @return A tibble containing only the non-unique rows of `.data`
#'   (restricted to the selected variables when `.keep_all = FALSE`).
#' @export
duplicates <- function(.data, ...) {
#' @examples
#' data <- tibble::tibble(x = c(1,2,1,1), y = c(1,1,1,5))
#'
#' duplicates(data)
#' duplicates(data, x)
#' duplicates(data, y)
#' duplicates(data, x, y)
#' duplicates(data, y, .keep_all = FALSE)
duplicates <- function(.data, ..., .keep_all = TRUE) {
check_data(.data)
chk_flag(.keep_all)

col <- rlang::ensyms(...)
if (length(col) == 0) {
Expand All @@ -27,12 +35,12 @@ duplicates <- function(.data, ...) {
return(.data)
}
.data_dup <- dplyr::select(.data, dplyr::all_of(col_names))
print(.data_dup)
.data_dup <- .data_dup[duplicated(.data_dup), , drop = FALSE]
print(.data_dup)
.data_dup <- unique(.data_dup)
print(.data_dup)
.data <- merge(.data, .data_dup, by = col_names)
.data <- dplyr::inner_join(.data, .data_dup, by = col_names)
if(!(.keep_all)) {
.data <- dplyr::select(.data, dplyr::all_of(col_names))
}
.data <- dplyr::as_tibble(.data)
.data
}
25 changes: 17 additions & 8 deletions man/duplicates.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

84 changes: 75 additions & 9 deletions tests/testthat/test-duplicates.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,23 +3,89 @@ test_that("returns only duplicated rows of selected columns", {
expect_identical(duplicates(tib), tib[c(1, 3), ])
expect_identical(duplicates(as.data.frame(tib)), tib[c(1, 3), ])

expect_equal(
expect_identical(
duplicates(data.frame(x = c(1, 2, 1), y = 1:3), x),
dplyr::tibble(x = c(1, 1), y = c(1, 3))
dplyr::tibble(x = c(1, 1), y = c(1L, 3L))
)

expect_equal(
expect_identical(
duplicates(data.frame(x = c(1, 2, 1), y = 1:3), x, y),
dplyr::tibble(x = double(0), y = double(0))
dplyr::tibble(x = double(0), y = integer(0))
)
})

test_that("errors when no input argument is supplied", {
expect_error(
duplicates(),
'argument ".data" is missing, with no default',
fixed = TRUE
test_that("keep_all working", {
  # Rows 1 and 3 are fully identical; rows 1-3 share the same `y` value.
  input <- tibble::tibble(x = c(1, 2, 1, 1), y = c(1, 1, 1, 5))
  dup_on_xy <- tibble::tibble(x = c(1, 1), y = c(1, 1))

  # With the default `.keep_all = TRUE`, every column is returned.
  expect_identical(
    duplicates(input, y),
    tibble::tibble(x = c(1, 2, 1), y = c(1, 1, 1))
  )

  # Naming both columns (in either order) gives the same result as naming none.
  expect_identical(duplicates(input, x, y), dup_on_xy)
  expect_identical(duplicates(input, y, x), dup_on_xy)
  expect_identical(duplicates(input), dup_on_xy)

  # `.keep_all = FALSE` returns only the column used for matching.
  expect_identical(
    duplicates(input, y, .keep_all = FALSE),
    tibble::tibble(y = c(1, 1, 1))
  )
})


test_that("handles data frame with no rows", {
  # Zero-row input with two integer columns.
  empty <- dplyr::tibble(x = integer(), y = integer())

  # With all columns kept, the zero-row input comes back unchanged.
  expect_equal(duplicates(empty), empty)
  expect_equal(duplicates(empty, x), empty)

  # Dropping the other columns leaves only the zero-row `x` column.
  expect_equal(
    duplicates(empty, x, .keep_all = FALSE),
    dplyr::tibble(x = integer())
  )
})

test_that("handles data frame with no columns", {
  # A tibble with neither rows nor columns.
  no_cols <- dplyr::tibble()

  # The input is returned as-is regardless of `.keep_all`.
  expect_identical(duplicates(no_cols), no_cols)
  expect_identical(duplicates(no_cols, .keep_all = FALSE), no_cols)
})

test_that("handles columns with missing values", {
  # Rows 4 and 5 are both (1, NA); every `y` value occurs more than once.
  with_na <- tibble::tibble(
    x = c(1, 2, NA, 1, 1),
    y = c(1, 1, NA, NA, NA)
  )

  # Across all columns, only the two (1, NA) rows are expected back.
  expect_identical(
    duplicates(with_na),
    tibble::tibble(x = c(1, 1), y = rep(NA_real_, 2))
  )

  # Matching on `y` alone, every row is expected back, NAs included.
  expect_identical(duplicates(with_na, y), with_na)

  expect_identical(
    duplicates(with_na, y, .keep_all = FALSE),
    tibble::tibble(y = c(1, 1, NA, NA, NA))
  )
})

test_that("handles data set with no duplicates", {
  # Every row is distinct, both on `x` alone and on (x, z).
  all_unique <- tibble::tibble(x = c(1, 2, NA), z = 1:3)
  none_both_cols <- tibble::tibble(x = double(), z = integer())

  # Expect a zero-row tibble that keeps the column types of the input.
  expect_identical(duplicates(all_unique), none_both_cols)
  expect_identical(duplicates(all_unique, x), none_both_cols)
  expect_identical(
    duplicates(all_unique, x, .keep_all = FALSE),
    tibble::tibble(x = double())
  )
})

test_that("errors when input argument is not a data.frame", {
Expand Down

0 comments on commit 689a12d

Please sign in to comment.