From 20ce4acc86ac316b035212426c009fe66e2df419 Mon Sep 17 00:00:00 2001 From: Stephanie Reinders Date: Fri, 9 Feb 2024 09:32:59 -0600 Subject: [PATCH 1/8] Created new exported function to get cluster assignments The internal function `get_clusterassignment()` is setup to wrangle the cluster assignments into the proper format for either `fit_model()` or `analyze_questioned_documents()`. Instead of adapting `get_clusterassignment()` to work on other types of data as well, I think it makes more sense to make a separate exported function that gets cluster assignments for docs in a user defined folder exported in a general format. --- NAMESPACE | 1 + NEWS.md | 9 ++ R/ClusterModeling_clusterassignment.R | 116 ++++++++++++++++++++++++++ man/get_clusters_batch.Rd | 36 ++++++++ 4 files changed, 162 insertions(+) create mode 100644 man/get_clusters_batch.Rd diff --git a/NAMESPACE b/NAMESPACE index 10601ef7..ab9f2faa 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -9,6 +9,7 @@ export(drop_burnin) export(extractGraphs) export(fit_model) export(format_template_data) +export(get_clusters_batch) export(get_credible_intervals) export(get_posterior_probabilities) export(make_clustering_templates) diff --git a/NEWS.md b/NEWS.md index 26b25f31..c839444b 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,14 @@ # handwriter (development version) +<<<<<<< Updated upstream +======= +* Make `get_clusterassignment()` available to users. + +# handwriter 2.0.3 + +* Fix initializer warning with arma::vec. Apparently different versions of gcc were raising warnings with the assignment of values in neighbors. The recommended fix was to use an initializer list. + +>>>>>>> Stashed changes # handwriter 2.0.2 ## New features diff --git a/R/ClusterModeling_clusterassignment.R b/R/ClusterModeling_clusterassignment.R index cb70a791..41da3038 100644 --- a/R/ClusterModeling_clusterassignment.R +++ b/R/ClusterModeling_clusterassignment.R @@ -15,6 +15,122 @@ # along with this program. If not, see . +#' get_clusters_batch +#' +#' @param template A cluster template created with [`make_clustering_templates`] +#' @param input_dir A directory containing graphs created with [`process_batch_dir`] +#' @param output_dir Output directory for cluster assignments +#' @param writer_indices Vector of start and end indices for the writer id in +#' the graph file names +#' @param doc_indices Vector of start and end indices for the document id in the +#' graph file names +#' @param num_cores Integer number of cores to use for parallel processing +#' +#' @return A list of cluster assignments +#' +#' @export +#' @md +get_clusters_batch = function(template, input_dir, output_dir, writer_indices, doc_indices, num_cores){ + # bind global variables to fix check() note + i <- outliercut <- docname <- NULL + + message('Starting cluster assginment...') + # make output directory + if ( !dir.exists(output_dir) ){ dir.create(output_dir) } + + # list files in input dir + proclist = list.files(input_dir, full.names = TRUE) + + my_cluster = parallel::makeCluster(num_cores, outfile="") + doParallel::registerDoParallel(my_cluster) + + proclist <- foreach::foreach(i = 1:length(proclist), + .combine = 'rbind', + .export = c("AddLetterImages", "MakeLetterListLetterSpecific", "centeredImage", "makeassignment", "angle")) %dopar% { # for each document i + + # out_proclist = list() + # for (i in 1:length(proclist)){ + # load doc + message(paste(' Loading graphs for', basename(proclist[i]))) + doc <- readRDS(proclist[i]) + + # check that doc$docname is not blank + if (!("docname" %in% names(doc))){ + if ( !dir.exists(file.path(output_dir, 'problem_files'))){ + dir.create(file.path(output_dir, 'problem_files')) + } + file.copy(proclist[i], file.path(output_dir, 'problem_files', basename(proclist[i]))) + message(paste("docname is NULL for", proclist[i], '\n')) + # next + return() + } + + # load outfile if it already exists + outfile <- file.path(output_dir, paste0(stringr::str_replace(doc$docname, ".png", ""), ".rds")) + if ( file.exists(outfile) ){ + message(paste(' Cluster assignments already exist for', doc$docname, '\n')) + df <- readRDS(outfile) + return(df) + # out_proclist[[i]] <- df + # next + } + + # extra processing + doc$process$letterList = AddLetterImages(doc$process$letterList, dim(doc$image)) + doc$process$letterList = MakeLetterListLetterSpecific(doc$process$letterList, dim(doc$image)) ### THIS SCREWS UP PLOTLETTER AND OTHER PLOTTING!!! + + imagesList = list() + imagesList = c(imagesList, lapply(doc$process$letterList, function(x){centeredImage(x)})) + imagesList = lapply(imagesList, function(x){ + x$nodesrc = cbind(((x$nodes-1) %/% dim(x$image)[1]) + 1, dim(x$image)[1] - ((x$nodes-1) %% dim(x$image)[1])) + x$nodesrc = x$nodesrc - matrix(rep(x$centroid, each = dim(x$nodesrc)[1]), ncol = 2) + x$pathEndsrc = lapply(x$allPaths, function(z){cbind(((z[c(1,length(z))]-1) %/% dim(x$image)[1]) + 1, dim(x$image)[1] - ((z[c(1,length(z))]-1) %% dim(x$image)[1]))}) + x$pathEndsrc = lapply(x$pathEndsrc, function(z){z - matrix(rep(x$centroid, each = 2), ncol = 2)}) + return(x) + }) + + # get cluster assignments + message(paste(' Getting cluster assignments for', doc$docname)) + cluster_assign = sapply(imagesList, makeassignment, templateCenterList = template$centers, outliercut = outliercut) + df = data.frame(cluster = cluster_assign) + + # add docname, writer, doc, slope, xvar, yvar, and covar + df$docname <- doc$docname + df$writer <- as.integer(sapply(df$docname, function(x) substr(x, start = writer_indices[1], stop = writer_indices[2]))) + df$doc <- sapply(df$docname, function(x) substr(x, start = doc_indices[1], stop = doc_indices[2]), USE.NAMES = FALSE) + df$slope <- sapply(doc$process$letterList, function(x) x$characterFeatures$slope) + df$xvar <- sapply(doc$process$letterList, function(x) x$characterFeatures$xvar) + df$yvar <- sapply(doc$process$letterList, function(x) x$characterFeatures$yvar) + df$covar <- sapply(doc$process$letterList, function(x) x$characterFeatures$covar) + + # calculate pc rotation angle and wrapped pc rotation angle + get_pc_rotation <- function(x){ + xv <- as.numeric(x['xvar']) + yv <- as.numeric(x['yvar']) + cv <- as.numeric(x['covar']) + eig <- eigen(cbind(c(xv, cv), c(cv, yv)), symmetric = TRUE) + return(angle(t(as.matrix(eig$vectors[, 1])), as.matrix(c(1, 0)))) + } + df$pc_rotation <- apply(df, 1, get_pc_rotation) + df$pc_wrapped <- 2 * df$pc_rotation + + # sort columns + df <- df[,c('docname', 'writer', 'doc', 'cluster', 'slope', 'xvar', 'yvar', 'covar', 'pc_rotation', 'pc_wrapped')] + + saveRDS(df, file = outfile) + message(paste(' Saving cluster assignments for ', doc$docname, '\n')) + + return(df) + # out_proclist[[i]] <- df + } + + # save clusters + saveRDS(proclist, file.path(output_dir, "all_clusters.rds")) + + return(proclist) +} + + # Internal Functions ------------------------------------------------------ diff --git a/man/get_clusters_batch.Rd b/man/get_clusters_batch.Rd new file mode 100644 index 00000000..91f21ea4 --- /dev/null +++ b/man/get_clusters_batch.Rd @@ -0,0 +1,36 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ClusterModeling_clusterassignment.R +\name{get_clusters_batch} +\alias{get_clusters_batch} +\title{get_clusters_batch} +\usage{ +get_clusters_batch( + template, + input_dir, + output_dir, + writer_indices, + doc_indices, + num_cores +) +} +\arguments{ +\item{template}{A cluster template created with \code{\link{make_clustering_templates}}} + +\item{input_dir}{A directory containing graphs created with \code{\link{process_batch_dir}}} + +\item{output_dir}{Output directory for cluster assignments} + +\item{writer_indices}{Vector of start and end indices for the writer id in +the graph file names} + +\item{doc_indices}{Vector of start and end indices for the document id in the +graph file names} + +\item{num_cores}{Integer number of cores to use for parallel processing} +} +\value{ +A list of cluster assignments +} +\description{ +get_clusters_batch +} From 9c7c09762f2e94c9c28ed2c2eb45ad17d73a0be2 Mon Sep 17 00:00:00 2001 From: Stephanie Reinders Date: Fri, 9 Feb 2024 10:50:11 -0600 Subject: [PATCH 2/8] Trying to fix out of memory error in `get_clusters_batch()` ## The problem When I run `get_cluster_batch()` on my Macbook, Rstudio runs out of memory and crashes even if I only use two cores for parallel processing. ## Changes - If num_cores = 1, now the code will execute sequentially. - I turned off printing to console in the foreach loop in case this is part of the problem. --- R/ClusterModeling_clusterassignment.R | 527 ++++++++++++++++---------- 1 file changed, 321 insertions(+), 206 deletions(-) diff --git a/R/ClusterModeling_clusterassignment.R b/R/ClusterModeling_clusterassignment.R index 41da3038..f4c802c9 100644 --- a/R/ClusterModeling_clusterassignment.R +++ b/R/ClusterModeling_clusterassignment.R @@ -1,16 +1,16 @@ -# The handwriter R package performs writership analysis of handwritten documents. +# The handwriter R package performs writership analysis of handwritten documents. # Copyright (C) 2021 Iowa State University of Science and Technology on behalf of its Center for Statistics and Applications in Forensic Evidence -# +# # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. -# +# # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. -# +# # You should have received a copy of the GNU General Public License # along with this program. If not, see . @@ -30,100 +30,194 @@ #' #' @export #' @md -get_clusters_batch = function(template, input_dir, output_dir, writer_indices, doc_indices, num_cores){ +get_clusters_batch <- function(template, input_dir, output_dir, writer_indices, doc_indices, num_cores = 1) { # bind global variables to fix check() note i <- outliercut <- docname <- NULL - - message('Starting cluster assginment...') + + # check num_cores + if (!is.integer(num_cores)) { + stop("num_cores is not an integer") + } else if (num_cores < 1) { + stop("num_cores is not an integer greater than or equal to 1") + } + + message("Starting cluster assginment...") # make output directory - if ( !dir.exists(output_dir) ){ dir.create(output_dir) } - + if (!dir.exists(output_dir)) { + dir.create(output_dir) + } + # list files in input dir - proclist = list.files(input_dir, full.names = TRUE) - - my_cluster = parallel::makeCluster(num_cores, outfile="") - doParallel::registerDoParallel(my_cluster) - - proclist <- foreach::foreach(i = 1:length(proclist), - .combine = 'rbind', - .export = c("AddLetterImages", "MakeLetterListLetterSpecific", "centeredImage", "makeassignment", "angle")) %dopar% { # for each document i - - # out_proclist = list() - # for (i in 1:length(proclist)){ - # load doc - message(paste(' Loading graphs for', basename(proclist[i]))) - doc <- readRDS(proclist[i]) - - # check that doc$docname is not blank - if (!("docname" %in% names(doc))){ - if ( !dir.exists(file.path(output_dir, 'problem_files'))){ - dir.create(file.path(output_dir, 'problem_files')) - } - file.copy(proclist[i], file.path(output_dir, 'problem_files', basename(proclist[i]))) - message(paste("docname is NULL for", proclist[i], '\n')) - # next - return() - } - - # load outfile if it already exists - outfile <- file.path(output_dir, paste0(stringr::str_replace(doc$docname, ".png", ""), ".rds")) - if ( file.exists(outfile) ){ - message(paste(' Cluster assignments already exist for', doc$docname, '\n')) - df <- readRDS(outfile) - return(df) - # out_proclist[[i]] <- df - # next - } - - # extra processing - doc$process$letterList = AddLetterImages(doc$process$letterList, dim(doc$image)) - doc$process$letterList = MakeLetterListLetterSpecific(doc$process$letterList, dim(doc$image)) ### THIS SCREWS UP PLOTLETTER AND OTHER PLOTTING!!! - - imagesList = list() - imagesList = c(imagesList, lapply(doc$process$letterList, function(x){centeredImage(x)})) - imagesList = lapply(imagesList, function(x){ - x$nodesrc = cbind(((x$nodes-1) %/% dim(x$image)[1]) + 1, dim(x$image)[1] - ((x$nodes-1) %% dim(x$image)[1])) - x$nodesrc = x$nodesrc - matrix(rep(x$centroid, each = dim(x$nodesrc)[1]), ncol = 2) - x$pathEndsrc = lapply(x$allPaths, function(z){cbind(((z[c(1,length(z))]-1) %/% dim(x$image)[1]) + 1, dim(x$image)[1] - ((z[c(1,length(z))]-1) %% dim(x$image)[1]))}) - x$pathEndsrc = lapply(x$pathEndsrc, function(z){z - matrix(rep(x$centroid, each = 2), ncol = 2)}) - return(x) - }) - - # get cluster assignments - message(paste(' Getting cluster assignments for', doc$docname)) - cluster_assign = sapply(imagesList, makeassignment, templateCenterList = template$centers, outliercut = outliercut) - df = data.frame(cluster = cluster_assign) - - # add docname, writer, doc, slope, xvar, yvar, and covar - df$docname <- doc$docname - df$writer <- as.integer(sapply(df$docname, function(x) substr(x, start = writer_indices[1], stop = writer_indices[2]))) - df$doc <- sapply(df$docname, function(x) substr(x, start = doc_indices[1], stop = doc_indices[2]), USE.NAMES = FALSE) - df$slope <- sapply(doc$process$letterList, function(x) x$characterFeatures$slope) - df$xvar <- sapply(doc$process$letterList, function(x) x$characterFeatures$xvar) - df$yvar <- sapply(doc$process$letterList, function(x) x$characterFeatures$yvar) - df$covar <- sapply(doc$process$letterList, function(x) x$characterFeatures$covar) - - # calculate pc rotation angle and wrapped pc rotation angle - get_pc_rotation <- function(x){ - xv <- as.numeric(x['xvar']) - yv <- as.numeric(x['yvar']) - cv <- as.numeric(x['covar']) - eig <- eigen(cbind(c(xv, cv), c(cv, yv)), symmetric = TRUE) - return(angle(t(as.matrix(eig$vectors[, 1])), as.matrix(c(1, 0)))) - } - df$pc_rotation <- apply(df, 1, get_pc_rotation) - df$pc_wrapped <- 2 * df$pc_rotation - - # sort columns - df <- df[,c('docname', 'writer', 'doc', 'cluster', 'slope', 'xvar', 'yvar', 'covar', 'pc_rotation', 'pc_wrapped')] - - saveRDS(df, file = outfile) - message(paste(' Saving cluster assignments for ', doc$docname, '\n')) - - return(df) - # out_proclist[[i]] <- df - } - + proclist <- list.files(input_dir, full.names = TRUE) + + if (num_cores > 1) { # run in parallel + my_cluster <- parallel::makeCluster(num_cores) + doParallel::registerDoParallel(my_cluster) + + proclist <- foreach::foreach( + i = 1:length(proclist), + .combine = "rbind", + .export = c("AddLetterImages", "MakeLetterListLetterSpecific", "centeredImage", "makeassignment", "angle") + ) %dopar% { # for each document i + + message(paste(" Loading graphs for", basename(proclist[i]))) + doc <- readRDS(proclist[i]) + + # check that doc$docname is not blank + if (!("docname" %in% names(doc))) { + if (!dir.exists(file.path(output_dir, "problem_files"))) { + dir.create(file.path(output_dir, "problem_files")) + } + file.copy(proclist[i], file.path(output_dir, "problem_files", basename(proclist[i]))) + message(paste("docname is NULL for", proclist[i], "\n")) + return() + } + + # load outfile if it already exists + outfile <- file.path(output_dir, paste0(stringr::str_replace(doc$docname, ".png", ""), ".rds")) + if (file.exists(outfile)) { + message(paste(" Cluster assignments already exist for", doc$docname, "\n")) + df <- readRDS(outfile) + return(df) + } + + # extra processing + doc$process$letterList <- AddLetterImages(doc$process$letterList, dim(doc$image)) + doc$process$letterList <- MakeLetterListLetterSpecific(doc$process$letterList, dim(doc$image)) ### THIS SCREWS UP PLOTLETTER AND OTHER PLOTTING!!! + + imagesList <- list() + imagesList <- c(imagesList, lapply(doc$process$letterList, function(x) { + centeredImage(x) + })) + imagesList <- lapply(imagesList, function(x) { + x$nodesrc <- cbind(((x$nodes - 1) %/% dim(x$image)[1]) + 1, dim(x$image)[1] - ((x$nodes - 1) %% dim(x$image)[1])) + x$nodesrc <- x$nodesrc - matrix(rep(x$centroid, each = dim(x$nodesrc)[1]), ncol = 2) + x$pathEndsrc <- lapply(x$allPaths, function(z) { + cbind(((z[c(1, length(z))] - 1) %/% dim(x$image)[1]) + 1, dim(x$image)[1] - ((z[c(1, length(z))] - 1) %% dim(x$image)[1])) + }) + x$pathEndsrc <- lapply(x$pathEndsrc, function(z) { + z - matrix(rep(x$centroid, each = 2), ncol = 2) + }) + return(x) + }) + + # get cluster assignments + message(paste(" Getting cluster assignments for", doc$docname)) + cluster_assign <- sapply(imagesList, makeassignment, templateCenterList = template$centers, outliercut = outliercut) + df <- data.frame(cluster = cluster_assign) + + # add docname, writer, doc, slope, xvar, yvar, and covar + df$docname <- doc$docname + df$writer <- as.integer(sapply(df$docname, function(x) substr(x, start = writer_indices[1], stop = writer_indices[2]))) + df$doc <- sapply(df$docname, function(x) substr(x, start = doc_indices[1], stop = doc_indices[2]), USE.NAMES = FALSE) + df$slope <- sapply(doc$process$letterList, function(x) x$characterFeatures$slope) + df$xvar <- sapply(doc$process$letterList, function(x) x$characterFeatures$xvar) + df$yvar <- sapply(doc$process$letterList, function(x) x$characterFeatures$yvar) + df$covar <- sapply(doc$process$letterList, function(x) x$characterFeatures$covar) + + # calculate pc rotation angle and wrapped pc rotation angle + get_pc_rotation <- function(x) { + xv <- as.numeric(x["xvar"]) + yv <- as.numeric(x["yvar"]) + cv <- as.numeric(x["covar"]) + eig <- eigen(cbind(c(xv, cv), c(cv, yv)), symmetric = TRUE) + return(angle(t(as.matrix(eig$vectors[, 1])), as.matrix(c(1, 0)))) + } + df$pc_rotation <- apply(df, 1, get_pc_rotation) + df$pc_wrapped <- 2 * df$pc_rotation + + # sort columns + df <- df[, c("docname", "writer", "doc", "cluster", "slope", "xvar", "yvar", "covar", "pc_rotation", "pc_wrapped")] + + saveRDS(df, file = outfile) + message(paste(" Saving cluster assignments for ", doc$docname, "\n")) + + return(df) + } + } else { # run sequentially + out_proclist <- list() + for (i in 1:length(proclist)) { + # load doc + message(paste(" Loading graphs for", basename(proclist[i]))) + doc <- readRDS(proclist[i]) + + # check that doc$docname is not blank + if (!("docname" %in% names(doc))) { + if (!dir.exists(file.path(output_dir, "problem_files"))) { + dir.create(file.path(output_dir, "problem_files")) + } + message(paste("docname is NULL for", proclist[i], "\n")) + # copy file to problem files folder + file.copy(proclist[i], file.path(output_dir, "problem_files", basename(proclist[i]))) + next + } + + # load outfile if it already exists + outfile <- file.path(output_dir, paste0(stringr::str_replace(doc$docname, ".png", ""), ".rds")) + if (file.exists(outfile)) { + message(paste(" Cluster assignments already exist for", doc$docname, "\n")) + df <- readRDS(outfile) + out_proclist[[i]] <- df + next + } + + # extra processing + doc$process$letterList <- AddLetterImages(doc$process$letterList, dim(doc$image)) + doc$process$letterList <- MakeLetterListLetterSpecific(doc$process$letterList, dim(doc$image)) ### THIS SCREWS UP PLOTLETTER AND OTHER PLOTTING!!! + + imagesList <- list() + imagesList <- c(imagesList, lapply(doc$process$letterList, function(x) { + centeredImage(x) + })) + imagesList <- lapply(imagesList, function(x) { + x$nodesrc <- cbind(((x$nodes - 1) %/% dim(x$image)[1]) + 1, dim(x$image)[1] - ((x$nodes - 1) %% dim(x$image)[1])) + x$nodesrc <- x$nodesrc - matrix(rep(x$centroid, each = dim(x$nodesrc)[1]), ncol = 2) + x$pathEndsrc <- lapply(x$allPaths, function(z) { + cbind(((z[c(1, length(z))] - 1) %/% dim(x$image)[1]) + 1, dim(x$image)[1] - ((z[c(1, length(z))] - 1) %% dim(x$image)[1])) + }) + x$pathEndsrc <- lapply(x$pathEndsrc, function(z) { + z - matrix(rep(x$centroid, each = 2), ncol = 2) + }) + return(x) + }) + + # get cluster assignments + message(paste(" Getting cluster assignments for", doc$docname)) + cluster_assign <- sapply(imagesList, makeassignment, templateCenterList = template$centers, outliercut = outliercut) + df <- data.frame(cluster = cluster_assign) + + # add docname, writer, doc, slope, xvar, yvar, and covar + df$docname <- doc$docname + df$writer <- as.integer(sapply(df$docname, function(x) substr(x, start = writer_indices[1], stop = writer_indices[2]))) + df$doc <- sapply(df$docname, function(x) substr(x, start = doc_indices[1], stop = doc_indices[2]), USE.NAMES = FALSE) + df$slope <- sapply(doc$process$letterList, function(x) x$characterFeatures$slope) + df$xvar <- sapply(doc$process$letterList, function(x) x$characterFeatures$xvar) + df$yvar <- sapply(doc$process$letterList, function(x) x$characterFeatures$yvar) + df$covar <- sapply(doc$process$letterList, function(x) x$characterFeatures$covar) + + # calculate pc rotation angle and wrapped pc rotation angle + get_pc_rotation <- function(x) { + xv <- as.numeric(x["xvar"]) + yv <- as.numeric(x["yvar"]) + cv <- as.numeric(x["covar"]) + eig <- eigen(cbind(c(xv, cv), c(cv, yv)), symmetric = TRUE) + return(angle(t(as.matrix(eig$vectors[, 1])), as.matrix(c(1, 0)))) + } + df$pc_rotation <- apply(df, 1, get_pc_rotation) + df$pc_wrapped <- 2 * df$pc_rotation + + # sort columns + df <- df[, c("docname", "writer", "doc", "cluster", "slope", "xvar", "yvar", "covar", "pc_rotation", "pc_wrapped")] + + saveRDS(df, file = outfile) + message(paste(" Saving cluster assignments for ", doc$docname, "\n")) + + out_proclist[[i]] <- df + } + # rename + proclist <- out_proclist + } + # save clusters saveRDS(proclist, file.path(output_dir, "all_clusters.rds")) @@ -134,9 +228,13 @@ get_clusters_batch = function(template, input_dir, output_dir, writer_indices, d # Internal Functions ------------------------------------------------------ -makeassignment = function(imageListElement, templateCenterList, outliercut){ - dist = min(unlist(lapply(templateCenterList, function(x){getGraphDistance(imageList1 = imageListElement, imageList2 = x, isProto2 = TRUE)$matching_weight}))) - cluster = which.min(unlist(lapply(templateCenterList, function(x){getGraphDistance(imageList1 = imageListElement, imageList2 = x, isProto2 = TRUE)$matching_weight}))) +makeassignment <- function(imageListElement, templateCenterList, outliercut) { + dist <- min(unlist(lapply(templateCenterList, function(x) { + getGraphDistance(imageList1 = imageListElement, imageList2 = x, isProto2 = TRUE)$matching_weight + }))) + cluster <- which.min(unlist(lapply(templateCenterList, function(x) { + getGraphDistance(imageList1 = imageListElement, imageList2 = x, isProto2 = TRUE)$matching_weight + }))) return(cluster) } @@ -145,7 +243,7 @@ makeassignment = function(imageListElement, templateCenterList, outliercut){ #' #' @param template_dir Directory containing a cluster template created with `make_clustering_templates` #' @param input_type `model` or `questioned` -#' @param num_graphs 'All' or integer number of graphs to randomly select from each document. +#' @param num_graphs 'All' or integer number of graphs to randomly select from each document. #' @param writer_indices Vector of start and end indices for the writer id in #' the document names #' @param doc_indices Vector of start and end indices for the document id in the @@ -155,83 +253,93 @@ makeassignment = function(imageListElement, templateCenterList, outliercut){ #' @return list of processed handwriting with cluster assignments for each graph #' #' @noRd -get_clusterassignment = function(template_dir, input_type, num_graphs = "All", writer_indices, doc_indices, num_cores){ +get_clusterassignment <- function(template_dir, input_type, num_graphs = "All", writer_indices, doc_indices, num_cores) { # bind global variables to fix check() note i <- outliercut <- docname <- NULL - + # load cluster file if it already exists - if ( input_type == "model" ){ + if (input_type == "model") { cluster_file <- file.path(template_dir, "data", "model_clusters.rds") - } else if ( input_type == "questioned" ) { + } else if (input_type == "questioned") { cluster_file <- file.path(template_dir, "data", "questioned_clusters.rds") } else { stop("Unknown input type. Use model or questioned.") } - if ( file.exists(cluster_file) ){ + if (file.exists(cluster_file)) { proclist <- readRDS(cluster_file) return(proclist) } - + # load template - if ( file.exists(file.path(template_dir, "data", "template.rds")) ){ + if (file.exists(file.path(template_dir, "data", "template.rds"))) { template <- readRDS(file.path(template_dir, "data", "template.rds")) } else { stop(paste("There is no cluster template in", template_dir)) } - + # get input directory - if ( input_type == "model" ){ + if (input_type == "model") { input_dir <- file.path(template_dir, "data", "model_graphs") } else { input_dir <- file.path(template_dir, "data", "questioned_graphs") - } - + } + # make output directory - if ( input_type == "model" ){ + if (input_type == "model") { output_dir <- file.path(template_dir, "data", "model_clusters") } else { output_dir <- file.path(template_dir, "data", "questioned_clusters") - } - if ( !dir.exists(output_dir) ){ dir.create(output_dir) } - + } + if (!dir.exists(output_dir)) { + dir.create(output_dir) + } + # list files in input dir - proclist = list.files(input_dir, full.names = TRUE) - - my_cluster = parallel::makeCluster(num_cores) + proclist <- list.files(input_dir, full.names = TRUE) + + my_cluster <- parallel::makeCluster(num_cores) doParallel::registerDoParallel(my_cluster) - - proclist <- foreach::foreach(i = 1:length(proclist), - .combine = 'rbind', - .export = c("AddLetterImages", "MakeLetterListLetterSpecific", "centeredImage", "makeassignment", "angle")) %dopar% { # for each document i - + + proclist <- foreach::foreach( + i = 1:length(proclist), + .combine = "rbind", + .export = c("AddLetterImages", "MakeLetterListLetterSpecific", "centeredImage", "makeassignment", "angle") + ) %dopar% { # for each document i + # load doc - doc <- readRDS(proclist[i]) - + doc <- readRDS(proclist[i]) + # load outfile if it already exists outfile <- file.path(output_dir, paste0(doc$docname, ".rds")) - if ( file.exists(outfile) ){ + if (file.exists(outfile)) { df <- readRDS(outfile) return(df) } - + # extra processing - doc$process$letterList = AddLetterImages(doc$process$letterList, dim(doc$image)) - doc$process$letterList = MakeLetterListLetterSpecific(doc$process$letterList, dim(doc$image)) ### THIS SCREWS UP PLOTLETTER AND OTHER PLOTTING!!! - - imagesList = list() - imagesList = c(imagesList, lapply(doc$process$letterList, function(x){centeredImage(x)})) - imagesList = lapply(imagesList, function(x){ - x$nodesrc = cbind(((x$nodes-1) %/% dim(x$image)[1]) + 1, dim(x$image)[1] - ((x$nodes-1) %% dim(x$image)[1])) - x$nodesrc = x$nodesrc - matrix(rep(x$centroid, each = dim(x$nodesrc)[1]), ncol = 2) - x$pathEndsrc = lapply(x$allPaths, function(z){cbind(((z[c(1,length(z))]-1) %/% dim(x$image)[1]) + 1, dim(x$image)[1] - ((z[c(1,length(z))]-1) %% dim(x$image)[1]))}) - x$pathEndsrc = lapply(x$pathEndsrc, function(z){z - matrix(rep(x$centroid, each = 2), ncol = 2)}) + doc$process$letterList <- AddLetterImages(doc$process$letterList, dim(doc$image)) + doc$process$letterList <- MakeLetterListLetterSpecific(doc$process$letterList, dim(doc$image)) ### THIS SCREWS UP PLOTLETTER AND OTHER PLOTTING!!! + + imagesList <- list() + imagesList <- c(imagesList, lapply(doc$process$letterList, function(x) { + centeredImage(x) + })) + imagesList <- lapply(imagesList, function(x) { + x$nodesrc <- cbind(((x$nodes - 1) %/% dim(x$image)[1]) + 1, dim(x$image)[1] - ((x$nodes - 1) %% dim(x$image)[1])) + x$nodesrc <- x$nodesrc - matrix(rep(x$centroid, each = dim(x$nodesrc)[1]), ncol = 2) + x$pathEndsrc <- lapply(x$allPaths, function(z) { + cbind(((z[c(1, length(z))] - 1) %/% dim(x$image)[1]) + 1, dim(x$image)[1] - ((z[c(1, length(z))] - 1) %% dim(x$image)[1])) + }) + x$pathEndsrc <- lapply(x$pathEndsrc, function(z) { + z - matrix(rep(x$centroid, each = 2), ncol = 2) + }) return(x) }) - + # get cluster assignments - cluster_assign = sapply(imagesList, makeassignment, templateCenterList = template$centers, outliercut = outliercut) - df = data.frame(cluster = cluster_assign) - + cluster_assign <- sapply(imagesList, makeassignment, templateCenterList = template$centers, outliercut = outliercut) + df <- data.frame(cluster = cluster_assign) + # add docname, writer, doc, slope, xvar, yvar, and covar df$docname <- doc$docname df$writer <- as.integer(sapply(df$docname, function(x) substr(x, start = writer_indices[1], stop = writer_indices[2]))) @@ -240,104 +348,111 @@ get_clusterassignment = function(template_dir, input_type, num_graphs = "All", w df$xvar <- sapply(doc$process$letterList, function(x) x$characterFeatures$xvar) df$yvar <- sapply(doc$process$letterList, function(x) x$characterFeatures$yvar) df$covar <- sapply(doc$process$letterList, function(x) x$characterFeatures$covar) - + # calculate pc rotation angle and wrapped pc rotation angle - get_pc_rotation <- function(x){ - xv <- as.numeric(x['xvar']) - yv <- as.numeric(x['yvar']) - cv <- as.numeric(x['covar']) + get_pc_rotation <- function(x) { + xv <- as.numeric(x["xvar"]) + yv <- as.numeric(x["yvar"]) + cv <- as.numeric(x["covar"]) eig <- eigen(cbind(c(xv, cv), c(cv, yv)), symmetric = TRUE) return(angle(t(as.matrix(eig$vectors[, 1])), as.matrix(c(1, 0)))) } df$pc_rotation <- apply(df, 1, get_pc_rotation) df$pc_wrapped <- 2 * df$pc_rotation - - # sample graphs - if (num_graphs != "All"){ - df <- df %>% + + # sample graphs + if (num_graphs != "All") { + df <- df %>% dplyr::group_by(docname) %>% dplyr::slice_sample(n = num_graphs) } - + # sort columns - df <- df[,c('docname', 'writer', 'doc', 'cluster', 'slope', 'xvar', 'yvar', 'covar', 'pc_rotation', 'pc_wrapped')] + df <- df[, c("docname", "writer", "doc", "cluster", "slope", "xvar", "yvar", "covar", "pc_rotation", "pc_wrapped")] saveRDS(df, file = file.path(output_dir, paste0(stringr::str_replace(doc$docname, ".png", ""), ".rds"))) - + return(df) } # save clusters - if ( input_type == "model" ){ + if (input_type == "model") { saveRDS(proclist, file.path(template_dir, "data", "model_clusters.rds")) } else { saveRDS(proclist, file.path(template_dir, "data", "questioned_clusters.rds")) - } - + } + return(proclist) } -GetImageMatrix = function(letterList, maxImageSize = 50) -{ imagesList = list() - imagesList = c(imagesList, lapply(letterList, function(x){centeredImage(x)})) - +GetImageMatrix <- function(letterList, maxImageSize = 50) { + imagesList <- list() + imagesList <- c(imagesList, lapply(letterList, function(x) { + centeredImage(x) + })) + # letterList = unlist(letListFull,recursive=FALSE) - - for(i in 1:length(imagesList)){ - l = imagesList[[i]]$centroid[1] - r = dim(imagesList[[i]]$image)[2] - imagesList[[i]]$centroid[1] + 1 - t = dim(imagesList[[i]]$image)[1] - imagesList[[i]]$centroid[2] + 1 - b = imagesList[[i]]$centroid[2] - if(l > r){ - imagesList[[i]]$image = cbind(imagesList[[i]]$image, matrix(1, ncol = l-r, nrow = dim(imagesList[[i]]$image)[1])) - } - else if(l < r){ - imagesList[[i]]$image = cbind(matrix(1, ncol = r-l, nrow = dim(imagesList[[i]]$image)[1]), imagesList[[i]]$image) - } - if(t > b){ - imagesList[[i]]$image = rbind(imagesList[[i]]$image, matrix(1, nrow = t-b, ncol = dim(imagesList[[i]]$image)[2])) + + for (i in 1:length(imagesList)) { + l <- imagesList[[i]]$centroid[1] + r <- dim(imagesList[[i]]$image)[2] - imagesList[[i]]$centroid[1] + 1 + t <- dim(imagesList[[i]]$image)[1] - imagesList[[i]]$centroid[2] + 1 + b <- imagesList[[i]]$centroid[2] + if (l > r) { + imagesList[[i]]$image <- cbind(imagesList[[i]]$image, matrix(1, ncol = l - r, nrow = dim(imagesList[[i]]$image)[1])) + } else if (l < r) { + imagesList[[i]]$image <- cbind(matrix(1, ncol = r - l, nrow = dim(imagesList[[i]]$image)[1]), imagesList[[i]]$image) } - else if(t < b){ - imagesList[[i]]$image = rbind(matrix(1, nrow = b-t, ncol = dim(imagesList[[i]]$image)[2]), imagesList[[i]]$image) + if (t > b) { + imagesList[[i]]$image <- rbind(imagesList[[i]]$image, matrix(1, nrow = t - b, ncol = dim(imagesList[[i]]$image)[2])) + } else if (t < b) { + imagesList[[i]]$image <- rbind(matrix(1, nrow = b - t, ncol = dim(imagesList[[i]]$image)[2]), imagesList[[i]]$image) } } - - for(i in 1:length(imagesList)){ - if(any(dim(imagesList[[i]]$image) > maxImageSize)){ - imagesList[[i]]$image = imagesList[[i]]$image %>% as.raster() %>% magick::image_read() %>% magick::image_resize(paste0(maxImageSize,"x",maxImageSize)) %>% magick::image_quantize(max = 2, dither = FALSE, colorspace = "gray") %>% `[[`(1) %>% as.numeric() %>% `[`(,,1) - imagesList[[i]]$image = rbind(1,cbind(1,imagesList[[i]]$image,1),1) - thinned = thinImage(imagesList[[i]]$image) - imagesList[[i]]$image[] = 1 - imagesList[[i]]$image[thinned] = 0 - - imagesList[[i]]$image = imagesList[[i]]$image[-c(1,dim(imagesList[[i]]$image)[1]),-c(1,dim(imagesList[[i]]$image)[2])] - #print(plotImage(imagesList[[i]]$image) + theme_bw()) + + for (i in 1:length(imagesList)) { + if (any(dim(imagesList[[i]]$image) > maxImageSize)) { + imagesList[[i]]$image <- imagesList[[i]]$image %>% + as.raster() %>% + magick::image_read() %>% + magick::image_resize(paste0(maxImageSize, "x", maxImageSize)) %>% + magick::image_quantize(max = 2, dither = FALSE, colorspace = "gray") %>% + `[[`(1) %>% + as.numeric() %>% + `[`(, , 1) + imagesList[[i]]$image <- rbind(1, cbind(1, imagesList[[i]]$image, 1), 1) + thinned <- thinImage(imagesList[[i]]$image) + imagesList[[i]]$image[] <- 1 + imagesList[[i]]$image[thinned] <- 0 + + imagesList[[i]]$image <- imagesList[[i]]$image[-c(1, dim(imagesList[[i]]$image)[1]), -c(1, dim(imagesList[[i]]$image)[2])] + # print(plotImage(imagesList[[i]]$image) + theme_bw()) cat(i, " ") } } - - for(i in 1:length(imagesList)){ - dims = dim(imagesList[[i]]$image) - lrPad = maxImageSize + 2 - dims[2] - tbPad = maxImageSize + 2 - dims[1] - l = floor(lrPad/2) - r = ceiling(lrPad/2) - b = ceiling(tbPad/2) - t = floor(tbPad/2) - - imagesList[[i]]$image = rbind(matrix(1,nrow = t, ncol = dims[2]), imagesList[[i]]$image, matrix(1,nrow = b, ncol = dims[2])) - imagesList[[i]]$image = cbind(matrix(1,ncol = l, nrow = dim(imagesList[[i]]$image)[1]), imagesList[[i]]$image, matrix(1,ncol = r, nrow = dim(imagesList[[i]]$image)[1])) - - imagesList[[i]]$image = imagesList[[i]]$image[,-c(1,maxImageSize + 2)] - imagesList[[i]]$image = imagesList[[i]]$image[-c(1,maxImageSize + 2),] + + for (i in 1:length(imagesList)) { + dims <- dim(imagesList[[i]]$image) + lrPad <- maxImageSize + 2 - dims[2] + tbPad <- maxImageSize + 2 - dims[1] + l <- floor(lrPad / 2) + r <- ceiling(lrPad / 2) + b <- ceiling(tbPad / 2) + t <- floor(tbPad / 2) + + imagesList[[i]]$image <- rbind(matrix(1, nrow = t, ncol = dims[2]), imagesList[[i]]$image, matrix(1, nrow = b, ncol = dims[2])) + imagesList[[i]]$image <- cbind(matrix(1, ncol = l, nrow = dim(imagesList[[i]]$image)[1]), imagesList[[i]]$image, matrix(1, ncol = r, nrow = dim(imagesList[[i]]$image)[1])) + + imagesList[[i]]$image <- imagesList[[i]]$image[, -c(1, maxImageSize + 2)] + imagesList[[i]]$image <- imagesList[[i]]$image[-c(1, maxImageSize + 2), ] } - #apply(matrix(unlist(lapply(imagesList, function(x){dim(x$image)})), ncol = 2, byrow = TRUE), 2, function(x){all(x == maxImageSize)}) - - - images = array(NA, c(maxImageSize, maxImageSize, length(imagesList))) - for(i in 1:length(imagesList)){ - images[,,i] = imagesList[[i]]$image + # apply(matrix(unlist(lapply(imagesList, function(x){dim(x$image)})), ncol = 2, byrow = TRUE), 2, function(x){all(x == maxImageSize)}) + + + images <- array(NA, c(maxImageSize, maxImageSize, length(imagesList))) + for (i in 1:length(imagesList)) { + images[, , i] <- imagesList[[i]]$image } return(images) } From 93edee4ee861df5875562fb56e03b3021e175951 Mon Sep 17 00:00:00 2001 From: Stephanie Reinders Date: Fri, 9 Feb 2024 11:18:32 -0600 Subject: [PATCH 3/8] Fixed bug in checking that num_cores is an integer I didn't realize that is.integer(5) returns FALSE! I replaced is.integer with x %% 1 == 0 test. --- R/ClusterModeling_clusterassignment.R | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/R/ClusterModeling_clusterassignment.R b/R/ClusterModeling_clusterassignment.R index f4c802c9..108781a8 100644 --- a/R/ClusterModeling_clusterassignment.R +++ b/R/ClusterModeling_clusterassignment.R @@ -35,10 +35,14 @@ get_clusters_batch <- function(template, input_dir, output_dir, writer_indices, i <- outliercut <- docname <- NULL # check num_cores - if (!is.integer(num_cores)) { + if (len(num_cores) > 1){ + stop("num_cores is longer than 1") + } else if (!is.numeric(num_cores)){ + stop("num_cores is not numeric") + } else if (x %% 1 != 0) { stop("num_cores is not an integer") } else if (num_cores < 1) { - stop("num_cores is not an integer greater than or equal to 1") + stop("num_cores is not greater than or equal to 1") } message("Starting cluster assginment...") From d3d508b8765b307bc9ad8488575019a963428edb Mon Sep 17 00:00:00 2001 From: Stephanie Reinders Date: Fri, 9 Feb 2024 11:20:46 -0600 Subject: [PATCH 4/8] Fixed bug caused by typo in get_clusters_batch I had used Python "len" instead of R "length" --- R/ClusterModeling_clusterassignment.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/ClusterModeling_clusterassignment.R b/R/ClusterModeling_clusterassignment.R index 108781a8..96186e3b 100644 --- a/R/ClusterModeling_clusterassignment.R +++ b/R/ClusterModeling_clusterassignment.R @@ -35,7 +35,7 @@ get_clusters_batch <- function(template, input_dir, output_dir, writer_indices, i <- outliercut <- docname <- NULL # check num_cores - if (len(num_cores) > 1){ + if (length(num_cores) > 1){ stop("num_cores is longer than 1") } else if (!is.numeric(num_cores)){ stop("num_cores is not numeric") From 3c862bb0b945bd121ecc6acad92818c86ca2050c Mon Sep 17 00:00:00 2001 From: Stephanie Reinders Date: Fri, 9 Feb 2024 11:22:14 -0600 Subject: [PATCH 5/8] Fixed bug in get_clusters_batch --- R/ClusterModeling_clusterassignment.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/ClusterModeling_clusterassignment.R b/R/ClusterModeling_clusterassignment.R index 96186e3b..d9135e30 100644 --- a/R/ClusterModeling_clusterassignment.R +++ b/R/ClusterModeling_clusterassignment.R @@ -39,7 +39,7 @@ get_clusters_batch <- function(template, input_dir, output_dir, writer_indices, stop("num_cores is longer than 1") } else if (!is.numeric(num_cores)){ stop("num_cores is not numeric") - } else if (x %% 1 != 0) { + } else if (num_cores %% 1 != 0) { stop("num_cores is not an integer") } else if (num_cores < 1) { stop("num_cores is not greater than or equal to 1") From 76ef362a3d7b86fa04f344d3ee9b192dffab3887 Mon Sep 17 00:00:00 2001 From: Stephanie Reinders Date: Fri, 9 Feb 2024 15:04:25 -0600 Subject: [PATCH 6/8] Changed `get_clusters_batch()` to allow creation of nested output folders Changed dir.create(output_dir) to dir.create(output_dir, recursive = TRUE) so that if the the output dir is dir1 > dir2 > dir3 but only dir1 exists, then dir2 and dir3 will be created. --- R/ClusterModeling_clusterassignment.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/ClusterModeling_clusterassignment.R b/R/ClusterModeling_clusterassignment.R index d9135e30..1330ec1d 100644 --- a/R/ClusterModeling_clusterassignment.R +++ b/R/ClusterModeling_clusterassignment.R @@ -48,7 +48,7 @@ get_clusters_batch <- function(template, input_dir, output_dir, writer_indices, message("Starting cluster assginment...") # make output directory if (!dir.exists(output_dir)) { - dir.create(output_dir) + dir.create(output_dir, recursive = TRUE) } # list files in input dir From 963505966bbe5a3b60e50f52edb0be83670c19be Mon Sep 17 00:00:00 2001 From: Stephanie Reinders Date: Mon, 12 Feb 2024 09:43:14 -0600 Subject: [PATCH 7/8] Exported `get_cluster_fill_counts` Changed `get_cluster_fill_counts` to an exported function so that handwriterFetch can use it. --- NAMESPACE | 1 + R/ClusterModeling_datafunctions.R | 67 +++++++++++++++++-------------- man/get_cluster_fill_counts.Rd | 29 +++++++++++++ 3 files changed, 67 insertions(+), 30 deletions(-) create mode 100644 man/get_cluster_fill_counts.Rd diff --git a/NAMESPACE b/NAMESPACE index ab9f2faa..0fbccdc6 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -9,6 +9,7 @@ export(drop_burnin) export(extractGraphs) export(fit_model) export(format_template_data) +export(get_cluster_fill_counts) export(get_clusters_batch) export(get_credible_intervals) export(get_posterior_probabilities) diff --git a/R/ClusterModeling_datafunctions.R b/R/ClusterModeling_datafunctions.R index 2911f45c..bda82492 100644 --- a/R/ClusterModeling_datafunctions.R +++ b/R/ClusterModeling_datafunctions.R @@ -69,6 +69,43 @@ format_template_data <- function(template) { } +#' get_cluster_fill_counts +#' +#' `get_cluster_fill_counts()` creates a data frame that shows the number of +#' graphs in each cluster for each input document. +#' +#' @param df A data frame with columns `writer`, `doc`, and `cluster`. Each +#' row corresponding to a graph and lists the writer of that graph, the document +#' from which the graph was obtained, and the cluster to which that graph is assigned. +#' @return A dataframe of cluster fill counts for each document in the input data frame. +#' +#' @examples +#' writer <- c(rep(1, 20), rep(2, 20), rep(3, 20)) +#' docname <- c(rep('doc1',20), rep('doc2', 20), rep('doc3', 20)) +#' doc <- c(rep(1, 20), rep(2, 20), rep(3, 20)) +#' cluster <- sample(3, 60, replace=TRUE) +#' df <- data.frame(docname, writer, doc, cluster) +#' get_cluster_fill_counts(df) +#' +#' @export +#' @md +get_cluster_fill_counts <- function(df) { + docname <- writer <- doc <- cluster <- n <- NULL + + # count number of graphs in each cluster for each writer + cluster_fill_counts <- df %>% + dplyr::group_by(docname, writer, doc, cluster) %>% + dplyr::summarise(n = dplyr::n()) %>% + dplyr::mutate(n = as.integer(n)) %>% + tidyr::pivot_wider(names_from = cluster, values_from = n, values_fill = 0) + + # sort columns + cols <- c(colnames(cluster_fill_counts[, c(1, 2, 3)]), sort(as.numeric(colnames(cluster_fill_counts[, -c(1, 2, 3)])))) + cluster_fill_counts <- cluster_fill_counts[, cols] + + return(cluster_fill_counts) +} + # Internal Functions ------------------------------------------------------ @@ -201,36 +238,6 @@ format_questioned_data <- function(model, questioned_clusters, writer_indices, d return(data) } - -#' get_cluster_fill_counts -#' -#' `get_cluster_fill_counts()` creates a data frame that shows the number of -#' graphs in each cluster for each input document. -#' -#' @param df A data frame with columns `writer`, `doc`, and `cluster`. Each -#' row corresponding to a graph and lists the writer of that graph, the document -#' from which the graph was obtained, and the cluster to which that graph is assigned. -#' @return A dataframe of cluster fill counts for each document in the input data frame. -#' -#' @noRd -get_cluster_fill_counts <- function(df) { - docname <- writer <- doc <- cluster <- n <- NULL - - # count number of graphs in each cluster for each writer - cluster_fill_counts <- df %>% - dplyr::group_by(docname, writer, doc, cluster) %>% - dplyr::summarise(n = dplyr::n()) %>% - dplyr::mutate(n = as.integer(n)) %>% - tidyr::pivot_wider(names_from = cluster, values_from = n, values_fill = 0) - - # sort columns - cols <- c(colnames(cluster_fill_counts[, c(1, 2, 3)]), sort(as.numeric(colnames(cluster_fill_counts[, -c(1, 2, 3)])))) - cluster_fill_counts <- cluster_fill_counts[, cols] - - return(cluster_fill_counts) -} - - #' angle #' #' `angle()` gives a value in (-pi,pi), where negative values come from unit vectors below the x axis (kinda weird/not traditional) diff --git a/man/get_cluster_fill_counts.Rd b/man/get_cluster_fill_counts.Rd new file mode 100644 index 00000000..60525609 --- /dev/null +++ b/man/get_cluster_fill_counts.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ClusterModeling_datafunctions.R +\name{get_cluster_fill_counts} +\alias{get_cluster_fill_counts} +\title{get_cluster_fill_counts} +\usage{ +get_cluster_fill_counts(df) +} +\arguments{ +\item{df}{A data frame with columns \code{writer}, \code{doc}, and \code{cluster}. Each +row corresponding to a graph and lists the writer of that graph, the document +from which the graph was obtained, and the cluster to which that graph is assigned.} +} +\value{ +A dataframe of cluster fill counts for each document in the input data frame. +} +\description{ +\code{get_cluster_fill_counts()} creates a data frame that shows the number of +graphs in each cluster for each input document. +} +\examples{ +writer <- c(rep(1, 20), rep(2, 20), rep(3, 20)) +docname <- c(rep('doc1',20), rep('doc2', 20), rep('doc3', 20)) +doc <- c(rep(1, 20), rep(2, 20), rep(3, 20)) +cluster <- sample(3, 60, replace=TRUE) +df <- data.frame(docname, writer, doc, cluster) +get_cluster_fill_counts(df) + +} From 6f54b5a8d86c207b26c2692ee793e53ed2cd1220 Mon Sep 17 00:00:00 2001 From: Stephanie Reinders Date: Mon, 12 Feb 2024 09:44:17 -0600 Subject: [PATCH 8/8] Added examples to documentation for `get_clusters_batch` --- R/ClusterModeling_clusterassignment.R | 12 +++++++++++- man/get_clusters_batch.Rd | 13 ++++++++++++- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/R/ClusterModeling_clusterassignment.R b/R/ClusterModeling_clusterassignment.R index 1330ec1d..dca20a53 100644 --- a/R/ClusterModeling_clusterassignment.R +++ b/R/ClusterModeling_clusterassignment.R @@ -27,7 +27,17 @@ #' @param num_cores Integer number of cores to use for parallel processing #' #' @return A list of cluster assignments -#' +#' +#' @examples +#' \dontrun{ +#' template <- readRDS('path/to/template.rds') +#' get_clusters_batch(template=template, input_dir='path/to/dir', output_dir='path/to/dir', +#' writer_indices=c(2,5), doc_indices=c(7,18), num_cores=1) +#' +#' get_clusters_batch(template=template, input_dir='path/to/dir', output_dir='path/to/dir', +#' writer_indices=c(1,4), doc_indices=c(5,10), num_cores=5) +#' } +#' #' @export #' @md get_clusters_batch <- function(template, input_dir, output_dir, writer_indices, doc_indices, num_cores = 1) { diff --git a/man/get_clusters_batch.Rd b/man/get_clusters_batch.Rd index 91f21ea4..acc30b99 100644 --- a/man/get_clusters_batch.Rd +++ b/man/get_clusters_batch.Rd @@ -10,7 +10,7 @@ get_clusters_batch( output_dir, writer_indices, doc_indices, - num_cores + num_cores = 1 ) } \arguments{ @@ -34,3 +34,14 @@ A list of cluster assignments \description{ get_clusters_batch } +\examples{ +\dontrun{ +template <- readRDS('path/to/template.rds') +get_clusters_batch(template=template, input_dir='path/to/dir', output_dir='path/to/dir', +writer_indices=c(2,5), doc_indices=c(7,18), num_cores=1) + +get_clusters_batch(template=template, input_dir='path/to/dir', output_dir='path/to/dir', +writer_indices=c(1,4), doc_indices=c(5,10), num_cores=5) +} + +}