From 6ac9d79783048b1919c839955414b8dbe1fa6b3f Mon Sep 17 00:00:00 2001 From: Stephanie Reinders Date: Tue, 26 Nov 2024 13:28:02 -0600 Subject: [PATCH] Changed `get_cluster_fill_counts()` to use `tidyselect` --- DESCRIPTION | 1 + R/cluster_format.R | 45 +++++++++++++--------------------- man/get_cluster_fill_counts.Rd | 14 +++++++---- 3 files changed, 27 insertions(+), 33 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index cf79788a..8017da30 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -33,6 +33,7 @@ Imports: rjags, stringr, tidyr, + tidyselect Suggests: knitr, rmarkdown, diff --git a/R/cluster_format.R b/R/cluster_format.R index a3cf6e64..10638fd3 100644 --- a/R/cluster_format.R +++ b/R/cluster_format.R @@ -65,51 +65,40 @@ format_template_data <- function(template) { } -#' get_cluster_fill_counts +#' Get Cluster Fill Counts #' #' `get_cluster_fill_counts()` creates a data frame that shows the number of #' graphs in each cluster for each input document. #' -#' @param df A data frame with columns `writer`, `doc`, and `cluster`. Each -#' row corresponding to a graph and lists the writer of that graph, the document -#' from which the graph was obtained, and the cluster to which that graph is assigned. -#' @return A dataframe of cluster fill counts for each document in the input data frame. -#' +#' @param df A data frame with columns `docname` and `cluster`. Each row +#' corresponds to a graph and lists the document from which the graph was +#' obtained, and the cluster to which that graph is assigned. Optionally, the +#' data frame might also have `writer` and `doc` columns. If present, `writer` +#' lists the writer ID of each document and `doc` is an identifier to +#' distinguish between different documents from the same writer. +#' @return A dataframe of cluster fill counts for each document in the input +#' data frame. 
+#' #' @examples -#' writer <- c(rep(1, 20), rep(2, 20), rep(3, 20)) #' docname <- c(rep('doc1',20), rep('doc2', 20), rep('doc3', 20)) +#' writer <- c(rep(1, 20), rep(2, 20), rep(3, 20)) #' doc <- c(rep(1, 20), rep(2, 20), rep(3, 20)) #' cluster <- sample(3, 60, replace=TRUE) #' df <- data.frame(docname, writer, doc, cluster) #' get_cluster_fill_counts(df) -#' +#' #' @export #' @md get_cluster_fill_counts <- function(df) { docname <- writer <- doc <- cluster <- n <- NULL - if (('writer' %in% colnames(df)) && ('doc' %in% colnames(df))) { - # count number of graphs in each cluster for each writer - cluster_fill_counts <- df %>% - dplyr::group_by(docname, writer, doc, cluster) %>% - dplyr::summarise(n = dplyr::n()) %>% - dplyr::mutate(n = as.integer(n)) %>% - tidyr::pivot_wider(names_from = cluster, values_from = n, values_fill = 0) - - # sort columns - cols <- c(colnames(cluster_fill_counts[, c(1, 2, 3)]), sort(as.numeric(colnames(cluster_fill_counts[, -c(1, 2, 3)])))) - cluster_fill_counts <- cluster_fill_counts[, cols] - } else { - cluster_fill_counts <- df %>% - dplyr::group_by(docname, cluster) %>% + # count graphs in each cluster per document; names_sort keeps cluster columns ascending + cluster_fill_counts <- df %>% + dplyr::group_by(dplyr::pick(tidyselect::any_of(c("docname", "writer", "doc", "cluster")))) %>% dplyr::summarise(n = dplyr::n()) %>% dplyr::mutate(n = as.integer(n)) %>% - tidyr::pivot_wider(names_from = cluster, values_from = n, values_fill = 0) - - # sort columns - cols <- c(colnames(cluster_fill_counts[, c(1)]), sort(as.numeric(colnames(cluster_fill_counts[, -c(1)])))) - cluster_fill_counts <- cluster_fill_counts[, cols] - } + tidyr::pivot_wider(names_from = cluster, values_from = n, values_fill = 0, names_sort = TRUE) %>% + dplyr::select(tidyselect::any_of(c("docname", "writer", "doc")), tidyselect::everything()) return(cluster_fill_counts) } diff --git a/man/get_cluster_fill_counts.Rd b/man/get_cluster_fill_counts.Rd index 1d15ea4a..203c6478 100644 --- a/man/get_cluster_fill_counts.Rd +++ 
b/man/get_cluster_fill_counts.Rd @@ -7,20 +7,24 @@ get_cluster_fill_counts(df) } \arguments{ -\item{df}{A data frame with columns \code{writer}, \code{doc}, and \code{cluster}. Each -row corresponding to a graph and lists the writer of that graph, the document -from which the graph was obtained, and the cluster to which that graph is assigned.} +\item{df}{A data frame with columns \code{docname} and \code{cluster}. Each row +corresponds to a graph and lists the document from which the graph was +obtained, and the cluster to which that graph is assigned. Optionally, the +data frame might also have \code{writer} and \code{doc} columns. If present, \code{writer} +lists the writer ID of each document and \code{doc} is an identifier to +distinguish between different documents from the same writer.} } \value{ -A dataframe of cluster fill counts for each document in the input data frame. +A dataframe of cluster fill counts for each document in the input +data frame. } \description{ \code{get_cluster_fill_counts()} creates a data frame that shows the number of graphs in each cluster for each input document. } \examples{ -writer <- c(rep(1, 20), rep(2, 20), rep(3, 20)) docname <- c(rep('doc1',20), rep('doc2', 20), rep('doc3', 20)) +writer <- c(rep(1, 20), rep(2, 20), rep(3, 20)) doc <- c(rep(1, 20), rep(2, 20), rep(3, 20)) cluster <- sample(3, 60, replace=TRUE) df <- data.frame(docname, writer, doc, cluster)