From 20ce4acc86ac316b035212426c009fe66e2df419 Mon Sep 17 00:00:00 2001
From: Stephanie Reinders <reinders.stephanie@gmail.com>
Date: Fri, 9 Feb 2024 09:32:59 -0600
Subject: [PATCH 1/8] Created new exported function to get cluster assignments

The internal function `get_clusterassignment()` is setup to wrangle the cluster assignments into the proper format for either `fit_model()` or `analyze_questioned_documents()`. Instead of adapting `get_clusterassignment()` to work on other types of data as well, I think it makes more sense to make a separate exported function that gets cluster assignments for docs in a user defined folder exported in a general format.
---
 NAMESPACE                             |   1 +
 NEWS.md                               |   9 ++
 R/ClusterModeling_clusterassignment.R | 116 ++++++++++++++++++++++++++
 man/get_clusters_batch.Rd             |  36 ++++++++
 4 files changed, 162 insertions(+)
 create mode 100644 man/get_clusters_batch.Rd

diff --git a/NAMESPACE b/NAMESPACE
index 10601ef7..ab9f2faa 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -9,6 +9,7 @@ export(drop_burnin)
 export(extractGraphs)
 export(fit_model)
 export(format_template_data)
+export(get_clusters_batch)
 export(get_credible_intervals)
 export(get_posterior_probabilities)
 export(make_clustering_templates)
diff --git a/NEWS.md b/NEWS.md
index 26b25f31..c839444b 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,5 +1,14 @@
 # handwriter (development version)
 
+<<<<<<< Updated upstream
+=======
+* Make `get_clusterassignment()` available to users.
+
+# handwriter 2.0.3
+
+* Fix initializer warning with arma::vec. Apparently different versions of gcc were raising warnings with the assignment of values in neighbors. The recommended fix was to use an initializer list.
+
+>>>>>>> Stashed changes
 # handwriter 2.0.2
 
 ## New features
diff --git a/R/ClusterModeling_clusterassignment.R b/R/ClusterModeling_clusterassignment.R
index cb70a791..41da3038 100644
--- a/R/ClusterModeling_clusterassignment.R
+++ b/R/ClusterModeling_clusterassignment.R
@@ -15,6 +15,122 @@
 # along with this program.  If not, see <https://www.gnu.org/licenses/>.
 
 
+#' get_clusters_batch
+#'
+#' @param template A cluster template created with [`make_clustering_templates`]
+#' @param input_dir A directory containing graphs created with [`process_batch_dir`]
+#' @param output_dir Output directory for cluster assignments
+#' @param writer_indices Vector of start and end indices for the writer id in
+#'   the graph file names
+#' @param doc_indices Vector of start and end indices for the document id in the
+#'   graph file names
+#' @param num_cores Integer number of cores to use for parallel processing
+#'
+#' @return A list of cluster assignments
+#'
+#' @export
+#' @md
+get_clusters_batch = function(template, input_dir, output_dir, writer_indices, doc_indices, num_cores){
+  # bind global variables to fix check() note
+  i <- outliercut <- docname <- NULL
+  
+  message('Starting cluster assginment...')
+  # make output directory
+  if ( !dir.exists(output_dir) ){ dir.create(output_dir) }
+  
+  # list files in input dir
+  proclist = list.files(input_dir, full.names = TRUE)
+  
+  my_cluster = parallel::makeCluster(num_cores, outfile="")
+  doParallel::registerDoParallel(my_cluster)
+  
+  proclist <- foreach::foreach(i = 1:length(proclist),
+                               .combine = 'rbind',
+                               .export = c("AddLetterImages", "MakeLetterListLetterSpecific", "centeredImage", "makeassignment", "angle")) %dopar% {  # for each document i
+
+  # out_proclist = list()
+  # for (i in 1:length(proclist)){
+                                 # load doc
+                                 message(paste('     Loading graphs for', basename(proclist[i])))
+                                 doc <- readRDS(proclist[i])                          
+                                 
+                                 # check that doc$docname is not blank
+                                 if (!("docname" %in% names(doc))){
+                                   if ( !dir.exists(file.path(output_dir, 'problem_files'))){
+                                     dir.create(file.path(output_dir, 'problem_files'))
+                                   }
+                                   file.copy(proclist[i], file.path(output_dir, 'problem_files', basename(proclist[i])))
+                                   message(paste("docname is NULL for", proclist[i], '\n'))
+                                   # next
+                                   return()
+                                 }
+                                 
+                                 # load outfile if it already exists
+                                 outfile <- file.path(output_dir, paste0(stringr::str_replace(doc$docname, ".png", ""), ".rds"))
+                                 if ( file.exists(outfile) ){
+                                   message(paste('     Cluster assignments already exist for', doc$docname, '\n'))
+                                   df <- readRDS(outfile)
+                                   return(df)
+                                   # out_proclist[[i]] <- df
+                                   # next
+                                 }
+                                 
+                                 # extra processing
+                                 doc$process$letterList = AddLetterImages(doc$process$letterList, dim(doc$image))
+                                 doc$process$letterList = MakeLetterListLetterSpecific(doc$process$letterList, dim(doc$image)) ### THIS SCREWS UP PLOTLETTER AND OTHER PLOTTING!!!
+                                 
+                                 imagesList = list()
+                                 imagesList = c(imagesList, lapply(doc$process$letterList, function(x){centeredImage(x)}))
+                                 imagesList = lapply(imagesList, function(x){
+                                   x$nodesrc = cbind(((x$nodes-1) %/% dim(x$image)[1]) + 1, dim(x$image)[1] - ((x$nodes-1) %% dim(x$image)[1]))
+                                   x$nodesrc = x$nodesrc - matrix(rep(x$centroid, each = dim(x$nodesrc)[1]), ncol = 2)
+                                   x$pathEndsrc = lapply(x$allPaths, function(z){cbind(((z[c(1,length(z))]-1) %/% dim(x$image)[1]) + 1, dim(x$image)[1] - ((z[c(1,length(z))]-1) %% dim(x$image)[1]))})
+                                   x$pathEndsrc = lapply(x$pathEndsrc, function(z){z - matrix(rep(x$centroid, each = 2), ncol = 2)})
+                                   return(x)
+                                 })
+                                 
+                                 # get cluster assignments
+                                 message(paste('     Getting cluster assignments for', doc$docname))
+                                 cluster_assign = sapply(imagesList, makeassignment, templateCenterList = template$centers, outliercut = outliercut)
+                                 df = data.frame(cluster = cluster_assign)
+                                 
+                                 # add docname, writer, doc, slope, xvar, yvar, and covar
+                                 df$docname <- doc$docname
+                                 df$writer <- as.integer(sapply(df$docname, function(x) substr(x, start = writer_indices[1], stop = writer_indices[2])))
+                                 df$doc <- sapply(df$docname, function(x) substr(x, start = doc_indices[1], stop = doc_indices[2]), USE.NAMES = FALSE)
+                                 df$slope <- sapply(doc$process$letterList, function(x) x$characterFeatures$slope)
+                                 df$xvar <- sapply(doc$process$letterList, function(x) x$characterFeatures$xvar)
+                                 df$yvar <- sapply(doc$process$letterList, function(x) x$characterFeatures$yvar)
+                                 df$covar <- sapply(doc$process$letterList, function(x) x$characterFeatures$covar)
+                                 
+                                 # calculate pc rotation angle and wrapped pc rotation angle
+                                 get_pc_rotation <- function(x){
+                                   xv <- as.numeric(x['xvar'])
+                                   yv <- as.numeric(x['yvar'])
+                                   cv <- as.numeric(x['covar'])
+                                   eig <- eigen(cbind(c(xv, cv), c(cv, yv)), symmetric = TRUE)
+                                   return(angle(t(as.matrix(eig$vectors[, 1])), as.matrix(c(1, 0))))
+                                 }
+                                 df$pc_rotation <- apply(df, 1, get_pc_rotation)
+                                 df$pc_wrapped <- 2 * df$pc_rotation
+                                 
+                                 # sort columns
+                                 df <- df[,c('docname', 'writer', 'doc', 'cluster', 'slope', 'xvar', 'yvar', 'covar', 'pc_rotation', 'pc_wrapped')]
+                                 
+                                 saveRDS(df, file = outfile)
+                                 message(paste('     Saving cluster assignments for ', doc$docname, '\n'))
+                                 
+                                 return(df)
+                                 # out_proclist[[i]] <- df
+                               }
+  
+  # save clusters
+  saveRDS(proclist, file.path(output_dir, "all_clusters.rds"))
+
+  return(proclist)
+}
+
+
 # Internal Functions ------------------------------------------------------
 
 
diff --git a/man/get_clusters_batch.Rd b/man/get_clusters_batch.Rd
new file mode 100644
index 00000000..91f21ea4
--- /dev/null
+++ b/man/get_clusters_batch.Rd
@@ -0,0 +1,36 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/ClusterModeling_clusterassignment.R
+\name{get_clusters_batch}
+\alias{get_clusters_batch}
+\title{get_clusters_batch}
+\usage{
+get_clusters_batch(
+  template,
+  input_dir,
+  output_dir,
+  writer_indices,
+  doc_indices,
+  num_cores
+)
+}
+\arguments{
+\item{template}{A cluster template created with \code{\link{make_clustering_templates}}}
+
+\item{input_dir}{A directory containing graphs created with \code{\link{process_batch_dir}}}
+
+\item{output_dir}{Output directory for cluster assignments}
+
+\item{writer_indices}{Vector of start and end indices for the writer id in
+the graph file names}
+
+\item{doc_indices}{Vector of start and end indices for the document id in the
+graph file names}
+
+\item{num_cores}{Integer number of cores to use for parallel processing}
+}
+\value{
+A list of cluster assignments
+}
+\description{
+get_clusters_batch
+}

From 9c7c09762f2e94c9c28ed2c2eb45ad17d73a0be2 Mon Sep 17 00:00:00 2001
From: Stephanie Reinders <reinders.stephanie@gmail.com>
Date: Fri, 9 Feb 2024 10:50:11 -0600
Subject: [PATCH 2/8] Trying to fix out of memory error in
 `get_clusters_batch()`

## The problem
When I run `get_cluster_batch()` on my Macbook, Rstudio runs out of memory and crashes even if I only use two cores for parallel processing.

## Changes
- If num_cores = 1, now the code will execute sequentially.
- I turned off printing to console in the foreach loop in case this is part of the problem.
---
 R/ClusterModeling_clusterassignment.R | 527 ++++++++++++++++----------
 1 file changed, 321 insertions(+), 206 deletions(-)

diff --git a/R/ClusterModeling_clusterassignment.R b/R/ClusterModeling_clusterassignment.R
index 41da3038..f4c802c9 100644
--- a/R/ClusterModeling_clusterassignment.R
+++ b/R/ClusterModeling_clusterassignment.R
@@ -1,16 +1,16 @@
-# The handwriter R package performs writership analysis of handwritten documents. 
+# The handwriter R package performs writership analysis of handwritten documents.
 # Copyright (C) 2021 Iowa State University of Science and Technology on behalf of its Center for Statistics and Applications in Forensic Evidence
-# 
+#
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
 # the Free Software Foundation, either version 3 of the License, or
 # (at your option) any later version.
-# 
+#
 # This program is distributed in the hope that it will be useful,
 # but WITHOUT ANY WARRANTY; without even the implied warranty of
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 # GNU General Public License for more details.
-# 
+#
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see <https://www.gnu.org/licenses/>.
 
@@ -30,100 +30,194 @@
 #'
 #' @export
 #' @md
-get_clusters_batch = function(template, input_dir, output_dir, writer_indices, doc_indices, num_cores){
+get_clusters_batch <- function(template, input_dir, output_dir, writer_indices, doc_indices, num_cores = 1) {
   # bind global variables to fix check() note
   i <- outliercut <- docname <- NULL
-  
-  message('Starting cluster assginment...')
+
+  # check num_cores
+  if (!is.integer(num_cores)) {
+    stop("num_cores is not an integer")
+  } else if (num_cores < 1) {
+    stop("num_cores is not an integer greater than or equal to 1")
+  }
+
+  message("Starting cluster assginment...")
   # make output directory
-  if ( !dir.exists(output_dir) ){ dir.create(output_dir) }
-  
+  if (!dir.exists(output_dir)) {
+    dir.create(output_dir)
+  }
+
   # list files in input dir
-  proclist = list.files(input_dir, full.names = TRUE)
-  
-  my_cluster = parallel::makeCluster(num_cores, outfile="")
-  doParallel::registerDoParallel(my_cluster)
-  
-  proclist <- foreach::foreach(i = 1:length(proclist),
-                               .combine = 'rbind',
-                               .export = c("AddLetterImages", "MakeLetterListLetterSpecific", "centeredImage", "makeassignment", "angle")) %dopar% {  # for each document i
-
-  # out_proclist = list()
-  # for (i in 1:length(proclist)){
-                                 # load doc
-                                 message(paste('     Loading graphs for', basename(proclist[i])))
-                                 doc <- readRDS(proclist[i])                          
-                                 
-                                 # check that doc$docname is not blank
-                                 if (!("docname" %in% names(doc))){
-                                   if ( !dir.exists(file.path(output_dir, 'problem_files'))){
-                                     dir.create(file.path(output_dir, 'problem_files'))
-                                   }
-                                   file.copy(proclist[i], file.path(output_dir, 'problem_files', basename(proclist[i])))
-                                   message(paste("docname is NULL for", proclist[i], '\n'))
-                                   # next
-                                   return()
-                                 }
-                                 
-                                 # load outfile if it already exists
-                                 outfile <- file.path(output_dir, paste0(stringr::str_replace(doc$docname, ".png", ""), ".rds"))
-                                 if ( file.exists(outfile) ){
-                                   message(paste('     Cluster assignments already exist for', doc$docname, '\n'))
-                                   df <- readRDS(outfile)
-                                   return(df)
-                                   # out_proclist[[i]] <- df
-                                   # next
-                                 }
-                                 
-                                 # extra processing
-                                 doc$process$letterList = AddLetterImages(doc$process$letterList, dim(doc$image))
-                                 doc$process$letterList = MakeLetterListLetterSpecific(doc$process$letterList, dim(doc$image)) ### THIS SCREWS UP PLOTLETTER AND OTHER PLOTTING!!!
-                                 
-                                 imagesList = list()
-                                 imagesList = c(imagesList, lapply(doc$process$letterList, function(x){centeredImage(x)}))
-                                 imagesList = lapply(imagesList, function(x){
-                                   x$nodesrc = cbind(((x$nodes-1) %/% dim(x$image)[1]) + 1, dim(x$image)[1] - ((x$nodes-1) %% dim(x$image)[1]))
-                                   x$nodesrc = x$nodesrc - matrix(rep(x$centroid, each = dim(x$nodesrc)[1]), ncol = 2)
-                                   x$pathEndsrc = lapply(x$allPaths, function(z){cbind(((z[c(1,length(z))]-1) %/% dim(x$image)[1]) + 1, dim(x$image)[1] - ((z[c(1,length(z))]-1) %% dim(x$image)[1]))})
-                                   x$pathEndsrc = lapply(x$pathEndsrc, function(z){z - matrix(rep(x$centroid, each = 2), ncol = 2)})
-                                   return(x)
-                                 })
-                                 
-                                 # get cluster assignments
-                                 message(paste('     Getting cluster assignments for', doc$docname))
-                                 cluster_assign = sapply(imagesList, makeassignment, templateCenterList = template$centers, outliercut = outliercut)
-                                 df = data.frame(cluster = cluster_assign)
-                                 
-                                 # add docname, writer, doc, slope, xvar, yvar, and covar
-                                 df$docname <- doc$docname
-                                 df$writer <- as.integer(sapply(df$docname, function(x) substr(x, start = writer_indices[1], stop = writer_indices[2])))
-                                 df$doc <- sapply(df$docname, function(x) substr(x, start = doc_indices[1], stop = doc_indices[2]), USE.NAMES = FALSE)
-                                 df$slope <- sapply(doc$process$letterList, function(x) x$characterFeatures$slope)
-                                 df$xvar <- sapply(doc$process$letterList, function(x) x$characterFeatures$xvar)
-                                 df$yvar <- sapply(doc$process$letterList, function(x) x$characterFeatures$yvar)
-                                 df$covar <- sapply(doc$process$letterList, function(x) x$characterFeatures$covar)
-                                 
-                                 # calculate pc rotation angle and wrapped pc rotation angle
-                                 get_pc_rotation <- function(x){
-                                   xv <- as.numeric(x['xvar'])
-                                   yv <- as.numeric(x['yvar'])
-                                   cv <- as.numeric(x['covar'])
-                                   eig <- eigen(cbind(c(xv, cv), c(cv, yv)), symmetric = TRUE)
-                                   return(angle(t(as.matrix(eig$vectors[, 1])), as.matrix(c(1, 0))))
-                                 }
-                                 df$pc_rotation <- apply(df, 1, get_pc_rotation)
-                                 df$pc_wrapped <- 2 * df$pc_rotation
-                                 
-                                 # sort columns
-                                 df <- df[,c('docname', 'writer', 'doc', 'cluster', 'slope', 'xvar', 'yvar', 'covar', 'pc_rotation', 'pc_wrapped')]
-                                 
-                                 saveRDS(df, file = outfile)
-                                 message(paste('     Saving cluster assignments for ', doc$docname, '\n'))
-                                 
-                                 return(df)
-                                 # out_proclist[[i]] <- df
-                               }
-  
+  proclist <- list.files(input_dir, full.names = TRUE)
+
+  if (num_cores > 1) { # run in parallel
+    my_cluster <- parallel::makeCluster(num_cores)
+    doParallel::registerDoParallel(my_cluster)
+
+    proclist <- foreach::foreach(
+      i = 1:length(proclist),
+      .combine = "rbind",
+      .export = c("AddLetterImages", "MakeLetterListLetterSpecific", "centeredImage", "makeassignment", "angle")
+    ) %dopar% { # for each document i
+
+      message(paste("     Loading graphs for", basename(proclist[i])))
+      doc <- readRDS(proclist[i])
+
+      # check that doc$docname is not blank
+      if (!("docname" %in% names(doc))) {
+        if (!dir.exists(file.path(output_dir, "problem_files"))) {
+          dir.create(file.path(output_dir, "problem_files"))
+        }
+        file.copy(proclist[i], file.path(output_dir, "problem_files", basename(proclist[i])))
+        message(paste("docname is NULL for", proclist[i], "\n"))
+        return()
+      }
+
+      # load outfile if it already exists
+      outfile <- file.path(output_dir, paste0(stringr::str_replace(doc$docname, ".png", ""), ".rds"))
+      if (file.exists(outfile)) {
+        message(paste("     Cluster assignments already exist for", doc$docname, "\n"))
+        df <- readRDS(outfile)
+        return(df)
+      }
+
+      # extra processing
+      doc$process$letterList <- AddLetterImages(doc$process$letterList, dim(doc$image))
+      doc$process$letterList <- MakeLetterListLetterSpecific(doc$process$letterList, dim(doc$image)) ### THIS SCREWS UP PLOTLETTER AND OTHER PLOTTING!!!
+
+      imagesList <- list()
+      imagesList <- c(imagesList, lapply(doc$process$letterList, function(x) {
+        centeredImage(x)
+      }))
+      imagesList <- lapply(imagesList, function(x) {
+        x$nodesrc <- cbind(((x$nodes - 1) %/% dim(x$image)[1]) + 1, dim(x$image)[1] - ((x$nodes - 1) %% dim(x$image)[1]))
+        x$nodesrc <- x$nodesrc - matrix(rep(x$centroid, each = dim(x$nodesrc)[1]), ncol = 2)
+        x$pathEndsrc <- lapply(x$allPaths, function(z) {
+          cbind(((z[c(1, length(z))] - 1) %/% dim(x$image)[1]) + 1, dim(x$image)[1] - ((z[c(1, length(z))] - 1) %% dim(x$image)[1]))
+        })
+        x$pathEndsrc <- lapply(x$pathEndsrc, function(z) {
+          z - matrix(rep(x$centroid, each = 2), ncol = 2)
+        })
+        return(x)
+      })
+
+      # get cluster assignments
+      message(paste("     Getting cluster assignments for", doc$docname))
+      cluster_assign <- sapply(imagesList, makeassignment, templateCenterList = template$centers, outliercut = outliercut)
+      df <- data.frame(cluster = cluster_assign)
+
+      # add docname, writer, doc, slope, xvar, yvar, and covar
+      df$docname <- doc$docname
+      df$writer <- as.integer(sapply(df$docname, function(x) substr(x, start = writer_indices[1], stop = writer_indices[2])))
+      df$doc <- sapply(df$docname, function(x) substr(x, start = doc_indices[1], stop = doc_indices[2]), USE.NAMES = FALSE)
+      df$slope <- sapply(doc$process$letterList, function(x) x$characterFeatures$slope)
+      df$xvar <- sapply(doc$process$letterList, function(x) x$characterFeatures$xvar)
+      df$yvar <- sapply(doc$process$letterList, function(x) x$characterFeatures$yvar)
+      df$covar <- sapply(doc$process$letterList, function(x) x$characterFeatures$covar)
+
+      # calculate pc rotation angle and wrapped pc rotation angle
+      get_pc_rotation <- function(x) {
+        xv <- as.numeric(x["xvar"])
+        yv <- as.numeric(x["yvar"])
+        cv <- as.numeric(x["covar"])
+        eig <- eigen(cbind(c(xv, cv), c(cv, yv)), symmetric = TRUE)
+        return(angle(t(as.matrix(eig$vectors[, 1])), as.matrix(c(1, 0))))
+      }
+      df$pc_rotation <- apply(df, 1, get_pc_rotation)
+      df$pc_wrapped <- 2 * df$pc_rotation
+
+      # sort columns
+      df <- df[, c("docname", "writer", "doc", "cluster", "slope", "xvar", "yvar", "covar", "pc_rotation", "pc_wrapped")]
+
+      saveRDS(df, file = outfile)
+      message(paste("     Saving cluster assignments for ", doc$docname, "\n"))
+
+      return(df)
+    }
+  } else { # run sequentially
+    out_proclist <- list()
+    for (i in 1:length(proclist)) {
+      # load doc
+      message(paste("     Loading graphs for", basename(proclist[i])))
+      doc <- readRDS(proclist[i])
+
+      # check that doc$docname is not blank
+      if (!("docname" %in% names(doc))) {
+        if (!dir.exists(file.path(output_dir, "problem_files"))) {
+          dir.create(file.path(output_dir, "problem_files"))
+        }
+        message(paste("docname is NULL for", proclist[i], "\n"))
+        # copy file to problem files folder
+        file.copy(proclist[i], file.path(output_dir, "problem_files", basename(proclist[i])))
+        next
+      }
+
+      # load outfile if it already exists
+      outfile <- file.path(output_dir, paste0(stringr::str_replace(doc$docname, ".png", ""), ".rds"))
+      if (file.exists(outfile)) {
+        message(paste("     Cluster assignments already exist for", doc$docname, "\n"))
+        df <- readRDS(outfile)
+        out_proclist[[i]] <- df
+        next
+      }
+
+      # extra processing
+      doc$process$letterList <- AddLetterImages(doc$process$letterList, dim(doc$image))
+      doc$process$letterList <- MakeLetterListLetterSpecific(doc$process$letterList, dim(doc$image)) ### THIS SCREWS UP PLOTLETTER AND OTHER PLOTTING!!!
+
+      imagesList <- list()
+      imagesList <- c(imagesList, lapply(doc$process$letterList, function(x) {
+        centeredImage(x)
+      }))
+      imagesList <- lapply(imagesList, function(x) {
+        x$nodesrc <- cbind(((x$nodes - 1) %/% dim(x$image)[1]) + 1, dim(x$image)[1] - ((x$nodes - 1) %% dim(x$image)[1]))
+        x$nodesrc <- x$nodesrc - matrix(rep(x$centroid, each = dim(x$nodesrc)[1]), ncol = 2)
+        x$pathEndsrc <- lapply(x$allPaths, function(z) {
+          cbind(((z[c(1, length(z))] - 1) %/% dim(x$image)[1]) + 1, dim(x$image)[1] - ((z[c(1, length(z))] - 1) %% dim(x$image)[1]))
+        })
+        x$pathEndsrc <- lapply(x$pathEndsrc, function(z) {
+          z - matrix(rep(x$centroid, each = 2), ncol = 2)
+        })
+        return(x)
+      })
+
+      # get cluster assignments
+      message(paste("     Getting cluster assignments for", doc$docname))
+      cluster_assign <- sapply(imagesList, makeassignment, templateCenterList = template$centers, outliercut = outliercut)
+      df <- data.frame(cluster = cluster_assign)
+
+      # add docname, writer, doc, slope, xvar, yvar, and covar
+      df$docname <- doc$docname
+      df$writer <- as.integer(sapply(df$docname, function(x) substr(x, start = writer_indices[1], stop = writer_indices[2])))
+      df$doc <- sapply(df$docname, function(x) substr(x, start = doc_indices[1], stop = doc_indices[2]), USE.NAMES = FALSE)
+      df$slope <- sapply(doc$process$letterList, function(x) x$characterFeatures$slope)
+      df$xvar <- sapply(doc$process$letterList, function(x) x$characterFeatures$xvar)
+      df$yvar <- sapply(doc$process$letterList, function(x) x$characterFeatures$yvar)
+      df$covar <- sapply(doc$process$letterList, function(x) x$characterFeatures$covar)
+
+      # calculate pc rotation angle and wrapped pc rotation angle
+      get_pc_rotation <- function(x) {
+        xv <- as.numeric(x["xvar"])
+        yv <- as.numeric(x["yvar"])
+        cv <- as.numeric(x["covar"])
+        eig <- eigen(cbind(c(xv, cv), c(cv, yv)), symmetric = TRUE)
+        return(angle(t(as.matrix(eig$vectors[, 1])), as.matrix(c(1, 0))))
+      }
+      df$pc_rotation <- apply(df, 1, get_pc_rotation)
+      df$pc_wrapped <- 2 * df$pc_rotation
+
+      # sort columns
+      df <- df[, c("docname", "writer", "doc", "cluster", "slope", "xvar", "yvar", "covar", "pc_rotation", "pc_wrapped")]
+
+      saveRDS(df, file = outfile)
+      message(paste("     Saving cluster assignments for ", doc$docname, "\n"))
+
+      out_proclist[[i]] <- df
+    }
+    # rename
+    proclist <- out_proclist
+  }
+
   # save clusters
   saveRDS(proclist, file.path(output_dir, "all_clusters.rds"))
 
@@ -134,9 +228,13 @@ get_clusters_batch = function(template, input_dir, output_dir, writer_indices, d
 # Internal Functions ------------------------------------------------------
 
 
-makeassignment = function(imageListElement, templateCenterList, outliercut){
-  dist = min(unlist(lapply(templateCenterList, function(x){getGraphDistance(imageList1 = imageListElement, imageList2 = x, isProto2 = TRUE)$matching_weight})))
-  cluster = which.min(unlist(lapply(templateCenterList, function(x){getGraphDistance(imageList1 = imageListElement, imageList2 = x, isProto2 = TRUE)$matching_weight})))
+makeassignment <- function(imageListElement, templateCenterList, outliercut) {
+  dist <- min(unlist(lapply(templateCenterList, function(x) {
+    getGraphDistance(imageList1 = imageListElement, imageList2 = x, isProto2 = TRUE)$matching_weight
+  })))
+  cluster <- which.min(unlist(lapply(templateCenterList, function(x) {
+    getGraphDistance(imageList1 = imageListElement, imageList2 = x, isProto2 = TRUE)$matching_weight
+  })))
   return(cluster)
 }
 
@@ -145,7 +243,7 @@ makeassignment = function(imageListElement, templateCenterList, outliercut){
 #'
 #' @param template_dir Directory containing a cluster template created with `make_clustering_templates`
 #' @param input_type `model` or `questioned`
-#' @param num_graphs 'All' or integer number of graphs to randomly select from each document. 
+#' @param num_graphs 'All' or integer number of graphs to randomly select from each document.
 #' @param writer_indices Vector of start and end indices for the writer id in
 #'   the document names
 #' @param doc_indices Vector of start and end indices for the document id in the
@@ -155,83 +253,93 @@ makeassignment = function(imageListElement, templateCenterList, outliercut){
 #' @return list of processed handwriting with cluster assignments for each graph
 #'
 #' @noRd
-get_clusterassignment = function(template_dir, input_type, num_graphs = "All", writer_indices, doc_indices, num_cores){
+get_clusterassignment <- function(template_dir, input_type, num_graphs = "All", writer_indices, doc_indices, num_cores) {
   # bind global variables to fix check() note
   i <- outliercut <- docname <- NULL
-  
+
   # load cluster file if it already exists
-  if ( input_type == "model" ){
+  if (input_type == "model") {
     cluster_file <- file.path(template_dir, "data", "model_clusters.rds")
-  } else if ( input_type == "questioned" ) {
+  } else if (input_type == "questioned") {
     cluster_file <- file.path(template_dir, "data", "questioned_clusters.rds")
   } else {
     stop("Unknown input type. Use model or questioned.")
   }
-  if ( file.exists(cluster_file) ){
+  if (file.exists(cluster_file)) {
     proclist <- readRDS(cluster_file)
     return(proclist)
   }
-  
+
   # load template
-  if ( file.exists(file.path(template_dir, "data", "template.rds")) ){
+  if (file.exists(file.path(template_dir, "data", "template.rds"))) {
     template <- readRDS(file.path(template_dir, "data", "template.rds"))
   } else {
     stop(paste("There is no cluster template in", template_dir))
   }
-  
+
   # get input directory
-  if ( input_type == "model" ){
+  if (input_type == "model") {
     input_dir <- file.path(template_dir, "data", "model_graphs")
   } else {
     input_dir <- file.path(template_dir, "data", "questioned_graphs")
-  } 
-  
+  }
+
   # make output directory
-  if ( input_type == "model" ){
+  if (input_type == "model") {
     output_dir <- file.path(template_dir, "data", "model_clusters")
   } else {
     output_dir <- file.path(template_dir, "data", "questioned_clusters")
-  } 
-  if ( !dir.exists(output_dir) ){ dir.create(output_dir) }
-  
+  }
+  if (!dir.exists(output_dir)) {
+    dir.create(output_dir)
+  }
+
   # list files in input dir
-  proclist = list.files(input_dir, full.names = TRUE)
-  
-  my_cluster = parallel::makeCluster(num_cores)
+  proclist <- list.files(input_dir, full.names = TRUE)
+
+  my_cluster <- parallel::makeCluster(num_cores)
   doParallel::registerDoParallel(my_cluster)
-  
-  proclist <- foreach::foreach(i = 1:length(proclist), 
-                               .combine = 'rbind',
-                               .export = c("AddLetterImages", "MakeLetterListLetterSpecific", "centeredImage", "makeassignment", "angle")) %dopar% {  # for each document i
-    
+
+  proclist <- foreach::foreach(
+    i = 1:length(proclist),
+    .combine = "rbind",
+    .export = c("AddLetterImages", "MakeLetterListLetterSpecific", "centeredImage", "makeassignment", "angle")
+  ) %dopar% { # for each document i
+
     # load doc
-    doc <- readRDS(proclist[i])                          
-                                 
+    doc <- readRDS(proclist[i])
+
     # load outfile if it already exists
     outfile <- file.path(output_dir, paste0(doc$docname, ".rds"))
-    if ( file.exists(outfile) ){
+    if (file.exists(outfile)) {
       df <- readRDS(outfile)
       return(df)
     }
-                                 
+
     # extra processing
-    doc$process$letterList = AddLetterImages(doc$process$letterList, dim(doc$image))
-    doc$process$letterList = MakeLetterListLetterSpecific(doc$process$letterList, dim(doc$image)) ### THIS SCREWS UP PLOTLETTER AND OTHER PLOTTING!!!
-    
-    imagesList = list()
-    imagesList = c(imagesList, lapply(doc$process$letterList, function(x){centeredImage(x)}))
-    imagesList = lapply(imagesList, function(x){
-      x$nodesrc = cbind(((x$nodes-1) %/% dim(x$image)[1]) + 1, dim(x$image)[1] - ((x$nodes-1) %% dim(x$image)[1]))
-      x$nodesrc = x$nodesrc - matrix(rep(x$centroid, each = dim(x$nodesrc)[1]), ncol = 2)
-      x$pathEndsrc = lapply(x$allPaths, function(z){cbind(((z[c(1,length(z))]-1) %/% dim(x$image)[1]) + 1, dim(x$image)[1] - ((z[c(1,length(z))]-1) %% dim(x$image)[1]))})
-      x$pathEndsrc = lapply(x$pathEndsrc, function(z){z - matrix(rep(x$centroid, each = 2), ncol = 2)})
+    doc$process$letterList <- AddLetterImages(doc$process$letterList, dim(doc$image))
+    doc$process$letterList <- MakeLetterListLetterSpecific(doc$process$letterList, dim(doc$image)) ### THIS SCREWS UP PLOTLETTER AND OTHER PLOTTING!!!
+
+    imagesList <- list()
+    imagesList <- c(imagesList, lapply(doc$process$letterList, function(x) {
+      centeredImage(x)
+    }))
+    imagesList <- lapply(imagesList, function(x) {
+      x$nodesrc <- cbind(((x$nodes - 1) %/% dim(x$image)[1]) + 1, dim(x$image)[1] - ((x$nodes - 1) %% dim(x$image)[1]))
+      x$nodesrc <- x$nodesrc - matrix(rep(x$centroid, each = dim(x$nodesrc)[1]), ncol = 2)
+      x$pathEndsrc <- lapply(x$allPaths, function(z) {
+        cbind(((z[c(1, length(z))] - 1) %/% dim(x$image)[1]) + 1, dim(x$image)[1] - ((z[c(1, length(z))] - 1) %% dim(x$image)[1]))
+      })
+      x$pathEndsrc <- lapply(x$pathEndsrc, function(z) {
+        z - matrix(rep(x$centroid, each = 2), ncol = 2)
+      })
       return(x)
     })
-    
+
     # get cluster assignments
-    cluster_assign = sapply(imagesList, makeassignment, templateCenterList = template$centers, outliercut = outliercut)
-    df = data.frame(cluster = cluster_assign)
-    
+    cluster_assign <- sapply(imagesList, makeassignment, templateCenterList = template$centers, outliercut = outliercut)
+    df <- data.frame(cluster = cluster_assign)
+
     # add docname, writer, doc, slope, xvar, yvar, and covar
     df$docname <- doc$docname
     df$writer <- as.integer(sapply(df$docname, function(x) substr(x, start = writer_indices[1], stop = writer_indices[2])))
@@ -240,104 +348,111 @@ get_clusterassignment = function(template_dir, input_type, num_graphs = "All", w
     df$xvar <- sapply(doc$process$letterList, function(x) x$characterFeatures$xvar)
     df$yvar <- sapply(doc$process$letterList, function(x) x$characterFeatures$yvar)
     df$covar <- sapply(doc$process$letterList, function(x) x$characterFeatures$covar)
-    
+
     # calculate pc rotation angle and wrapped pc rotation angle
-    get_pc_rotation <- function(x){
-      xv <- as.numeric(x['xvar'])
-      yv <- as.numeric(x['yvar'])
-      cv <- as.numeric(x['covar'])
+    get_pc_rotation <- function(x) {
+      xv <- as.numeric(x["xvar"])
+      yv <- as.numeric(x["yvar"])
+      cv <- as.numeric(x["covar"])
       eig <- eigen(cbind(c(xv, cv), c(cv, yv)), symmetric = TRUE)
       return(angle(t(as.matrix(eig$vectors[, 1])), as.matrix(c(1, 0))))
     }
     df$pc_rotation <- apply(df, 1, get_pc_rotation)
     df$pc_wrapped <- 2 * df$pc_rotation
-    
-    # sample graphs 
-    if (num_graphs != "All"){
-      df <- df %>% 
+
+    # sample graphs
+    if (num_graphs != "All") {
+      df <- df %>%
         dplyr::group_by(docname) %>%
         dplyr::slice_sample(n = num_graphs)
     }
-    
+
     # sort columns
-    df <- df[,c('docname', 'writer', 'doc', 'cluster', 'slope', 'xvar', 'yvar', 'covar', 'pc_rotation', 'pc_wrapped')]
+    df <- df[, c("docname", "writer", "doc", "cluster", "slope", "xvar", "yvar", "covar", "pc_rotation", "pc_wrapped")]
 
     saveRDS(df, file = file.path(output_dir, paste0(stringr::str_replace(doc$docname, ".png", ""), ".rds")))
-    
+
     return(df)
   }
 
   # save clusters
-  if ( input_type == "model" ){
+  if (input_type == "model") {
     saveRDS(proclist, file.path(template_dir, "data", "model_clusters.rds"))
   } else {
     saveRDS(proclist, file.path(template_dir, "data", "questioned_clusters.rds"))
-  } 
-  
+  }
+
   return(proclist)
 }
 
 
-GetImageMatrix = function(letterList, maxImageSize = 50)
-{ imagesList = list()
-  imagesList = c(imagesList, lapply(letterList, function(x){centeredImage(x)}))
-  
+GetImageMatrix <- function(letterList, maxImageSize = 50) {
+  imagesList <- list()
+  imagesList <- c(imagesList, lapply(letterList, function(x) {
+    centeredImage(x)
+  }))
+
   # letterList = unlist(letListFull,recursive=FALSE)
-  
-  for(i in 1:length(imagesList)){
-    l = imagesList[[i]]$centroid[1]
-    r = dim(imagesList[[i]]$image)[2] - imagesList[[i]]$centroid[1] + 1
-    t = dim(imagesList[[i]]$image)[1] - imagesList[[i]]$centroid[2] + 1
-    b = imagesList[[i]]$centroid[2]
-    if(l > r){
-      imagesList[[i]]$image = cbind(imagesList[[i]]$image, matrix(1, ncol = l-r, nrow = dim(imagesList[[i]]$image)[1]))
-    }
-    else if(l < r){
-      imagesList[[i]]$image = cbind(matrix(1, ncol = r-l, nrow = dim(imagesList[[i]]$image)[1]), imagesList[[i]]$image)
-    }
-    if(t > b){
-      imagesList[[i]]$image = rbind(imagesList[[i]]$image, matrix(1, nrow = t-b, ncol = dim(imagesList[[i]]$image)[2]))
+
+  for (i in 1:length(imagesList)) {
+    l <- imagesList[[i]]$centroid[1]
+    r <- dim(imagesList[[i]]$image)[2] - imagesList[[i]]$centroid[1] + 1
+    t <- dim(imagesList[[i]]$image)[1] - imagesList[[i]]$centroid[2] + 1
+    b <- imagesList[[i]]$centroid[2]
+    if (l > r) {
+      imagesList[[i]]$image <- cbind(imagesList[[i]]$image, matrix(1, ncol = l - r, nrow = dim(imagesList[[i]]$image)[1]))
+    } else if (l < r) {
+      imagesList[[i]]$image <- cbind(matrix(1, ncol = r - l, nrow = dim(imagesList[[i]]$image)[1]), imagesList[[i]]$image)
     }
-    else if(t < b){
-      imagesList[[i]]$image = rbind(matrix(1, nrow = b-t, ncol = dim(imagesList[[i]]$image)[2]), imagesList[[i]]$image)
+    if (t > b) {
+      imagesList[[i]]$image <- rbind(imagesList[[i]]$image, matrix(1, nrow = t - b, ncol = dim(imagesList[[i]]$image)[2]))
+    } else if (t < b) {
+      imagesList[[i]]$image <- rbind(matrix(1, nrow = b - t, ncol = dim(imagesList[[i]]$image)[2]), imagesList[[i]]$image)
     }
   }
-  
-  for(i in 1:length(imagesList)){
-    if(any(dim(imagesList[[i]]$image) > maxImageSize)){
-      imagesList[[i]]$image = imagesList[[i]]$image %>% as.raster() %>% magick::image_read() %>% magick::image_resize(paste0(maxImageSize,"x",maxImageSize)) %>% magick::image_quantize(max = 2, dither = FALSE, colorspace = "gray") %>% `[[`(1) %>% as.numeric() %>% `[`(,,1)
-      imagesList[[i]]$image = rbind(1,cbind(1,imagesList[[i]]$image,1),1)
-      thinned = thinImage(imagesList[[i]]$image)
-      imagesList[[i]]$image[] = 1
-      imagesList[[i]]$image[thinned] = 0
-      
-      imagesList[[i]]$image = imagesList[[i]]$image[-c(1,dim(imagesList[[i]]$image)[1]),-c(1,dim(imagesList[[i]]$image)[2])]
-      #print(plotImage(imagesList[[i]]$image) + theme_bw())
+
+  for (i in 1:length(imagesList)) {
+    if (any(dim(imagesList[[i]]$image) > maxImageSize)) {
+      imagesList[[i]]$image <- imagesList[[i]]$image %>%
+        as.raster() %>%
+        magick::image_read() %>%
+        magick::image_resize(paste0(maxImageSize, "x", maxImageSize)) %>%
+        magick::image_quantize(max = 2, dither = FALSE, colorspace = "gray") %>%
+        `[[`(1) %>%
+        as.numeric() %>%
+        `[`(, , 1)
+      imagesList[[i]]$image <- rbind(1, cbind(1, imagesList[[i]]$image, 1), 1)
+      thinned <- thinImage(imagesList[[i]]$image)
+      imagesList[[i]]$image[] <- 1
+      imagesList[[i]]$image[thinned] <- 0
+
+      imagesList[[i]]$image <- imagesList[[i]]$image[-c(1, dim(imagesList[[i]]$image)[1]), -c(1, dim(imagesList[[i]]$image)[2])]
+      # print(plotImage(imagesList[[i]]$image) + theme_bw())
       cat(i, " ")
     }
   }
-  
-  for(i in 1:length(imagesList)){
-    dims = dim(imagesList[[i]]$image)
-    lrPad = maxImageSize + 2 - dims[2]
-    tbPad = maxImageSize + 2 - dims[1]
-    l = floor(lrPad/2)
-    r = ceiling(lrPad/2)
-    b = ceiling(tbPad/2)
-    t = floor(tbPad/2)
-    
-    imagesList[[i]]$image = rbind(matrix(1,nrow = t, ncol = dims[2]), imagesList[[i]]$image, matrix(1,nrow = b, ncol = dims[2]))
-    imagesList[[i]]$image = cbind(matrix(1,ncol = l, nrow = dim(imagesList[[i]]$image)[1]), imagesList[[i]]$image, matrix(1,ncol = r, nrow = dim(imagesList[[i]]$image)[1]))
-    
-    imagesList[[i]]$image = imagesList[[i]]$image[,-c(1,maxImageSize + 2)]
-    imagesList[[i]]$image = imagesList[[i]]$image[-c(1,maxImageSize + 2),]
+
+  for (i in 1:length(imagesList)) {
+    dims <- dim(imagesList[[i]]$image)
+    lrPad <- maxImageSize + 2 - dims[2]
+    tbPad <- maxImageSize + 2 - dims[1]
+    l <- floor(lrPad / 2)
+    r <- ceiling(lrPad / 2)
+    b <- ceiling(tbPad / 2)
+    t <- floor(tbPad / 2)
+
+    imagesList[[i]]$image <- rbind(matrix(1, nrow = t, ncol = dims[2]), imagesList[[i]]$image, matrix(1, nrow = b, ncol = dims[2]))
+    imagesList[[i]]$image <- cbind(matrix(1, ncol = l, nrow = dim(imagesList[[i]]$image)[1]), imagesList[[i]]$image, matrix(1, ncol = r, nrow = dim(imagesList[[i]]$image)[1]))
+
+    imagesList[[i]]$image <- imagesList[[i]]$image[, -c(1, maxImageSize + 2)]
+    imagesList[[i]]$image <- imagesList[[i]]$image[-c(1, maxImageSize + 2), ]
   }
-  #apply(matrix(unlist(lapply(imagesList, function(x){dim(x$image)})), ncol = 2, byrow = TRUE), 2, function(x){all(x == maxImageSize)})
-  
-  
-  images = array(NA, c(maxImageSize, maxImageSize, length(imagesList)))
-  for(i in 1:length(imagesList)){
-    images[,,i] = imagesList[[i]]$image
+  # apply(matrix(unlist(lapply(imagesList, function(x){dim(x$image)})), ncol = 2, byrow = TRUE), 2, function(x){all(x == maxImageSize)})
+
+
+  images <- array(NA, c(maxImageSize, maxImageSize, length(imagesList)))
+  for (i in 1:length(imagesList)) {
+    images[, , i] <- imagesList[[i]]$image
   }
   return(images)
 }

From 93edee4ee861df5875562fb56e03b3021e175951 Mon Sep 17 00:00:00 2001
From: Stephanie Reinders <reinders.stephanie@gmail.com>
Date: Fri, 9 Feb 2024 11:18:32 -0600
Subject: [PATCH 3/8] Fixed bug in checking that num_cores is an integer

I didn't realize that is.integer(5) returns FALSE! I replaced is.integer with x %% 1 == 0 test.
---
 R/ClusterModeling_clusterassignment.R | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/R/ClusterModeling_clusterassignment.R b/R/ClusterModeling_clusterassignment.R
index f4c802c9..108781a8 100644
--- a/R/ClusterModeling_clusterassignment.R
+++ b/R/ClusterModeling_clusterassignment.R
@@ -35,10 +35,14 @@ get_clusters_batch <- function(template, input_dir, output_dir, writer_indices,
   i <- outliercut <- docname <- NULL
 
   # check num_cores
-  if (!is.integer(num_cores)) {
+  if (len(num_cores) > 1){
+    stop("num_cores is longer than 1")
+  } else if (!is.numeric(num_cores)){ 
+    stop("num_cores is not numeric")
+  } else if (x %% 1 != 0) {
     stop("num_cores is not an integer")
   } else if (num_cores < 1) {
-    stop("num_cores is not an integer greater than or equal to 1")
+    stop("num_cores is not greater than or equal to 1")
   }
 
   message("Starting cluster assginment...")

From d3d508b8765b307bc9ad8488575019a963428edb Mon Sep 17 00:00:00 2001
From: Stephanie Reinders <reinders.stephanie@gmail.com>
Date: Fri, 9 Feb 2024 11:20:46 -0600
Subject: [PATCH 4/8] Fixed bug caused by typo in get_clusters_batch

I had used Python "len" instead of R "length"
---
 R/ClusterModeling_clusterassignment.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/ClusterModeling_clusterassignment.R b/R/ClusterModeling_clusterassignment.R
index 108781a8..96186e3b 100644
--- a/R/ClusterModeling_clusterassignment.R
+++ b/R/ClusterModeling_clusterassignment.R
@@ -35,7 +35,7 @@ get_clusters_batch <- function(template, input_dir, output_dir, writer_indices,
   i <- outliercut <- docname <- NULL
 
   # check num_cores
-  if (len(num_cores) > 1){
+  if (length(num_cores) > 1){
     stop("num_cores is longer than 1")
   } else if (!is.numeric(num_cores)){ 
     stop("num_cores is not numeric")

From 3c862bb0b945bd121ecc6acad92818c86ca2050c Mon Sep 17 00:00:00 2001
From: Stephanie Reinders <reinders.stephanie@gmail.com>
Date: Fri, 9 Feb 2024 11:22:14 -0600
Subject: [PATCH 5/8] Fixed bug in get_clusters_batch

---
 R/ClusterModeling_clusterassignment.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/ClusterModeling_clusterassignment.R b/R/ClusterModeling_clusterassignment.R
index 96186e3b..d9135e30 100644
--- a/R/ClusterModeling_clusterassignment.R
+++ b/R/ClusterModeling_clusterassignment.R
@@ -39,7 +39,7 @@ get_clusters_batch <- function(template, input_dir, output_dir, writer_indices,
     stop("num_cores is longer than 1")
   } else if (!is.numeric(num_cores)){ 
     stop("num_cores is not numeric")
-  } else if (x %% 1 != 0) {
+  } else if (num_cores %% 1 != 0) {
     stop("num_cores is not an integer")
   } else if (num_cores < 1) {
     stop("num_cores is not greater than or equal to 1")

From 76ef362a3d7b86fa04f344d3ee9b192dffab3887 Mon Sep 17 00:00:00 2001
From: Stephanie Reinders <reinders.stephanie@gmail.com>
Date: Fri, 9 Feb 2024 15:04:25 -0600
Subject: [PATCH 6/8] Changed `get_clusters_batch()` to allow creation of
 nested output folders

Changed dir.create(output_dir) to dir.create(output_dir, recursive = TRUE) so that if the the output dir is dir1 > dir2 > dir3 but only dir1 exists, then dir2  and dir3 will be created.
---
 R/ClusterModeling_clusterassignment.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/ClusterModeling_clusterassignment.R b/R/ClusterModeling_clusterassignment.R
index d9135e30..1330ec1d 100644
--- a/R/ClusterModeling_clusterassignment.R
+++ b/R/ClusterModeling_clusterassignment.R
@@ -48,7 +48,7 @@ get_clusters_batch <- function(template, input_dir, output_dir, writer_indices,
   message("Starting cluster assginment...")
   # make output directory
   if (!dir.exists(output_dir)) {
-    dir.create(output_dir)
+    dir.create(output_dir, recursive = TRUE)
   }
 
   # list files in input dir

From 963505966bbe5a3b60e50f52edb0be83670c19be Mon Sep 17 00:00:00 2001
From: Stephanie Reinders <reinders.stephanie@gmail.com>
Date: Mon, 12 Feb 2024 09:43:14 -0600
Subject: [PATCH 7/8] Exported `get_cluster_fill_counts`

Changed `get_cluster_fill_counts` to an exported function so that handwriterFetch can use it.
---
 NAMESPACE                         |  1 +
 R/ClusterModeling_datafunctions.R | 67 +++++++++++++++++--------------
 man/get_cluster_fill_counts.Rd    | 29 +++++++++++++
 3 files changed, 67 insertions(+), 30 deletions(-)
 create mode 100644 man/get_cluster_fill_counts.Rd

diff --git a/NAMESPACE b/NAMESPACE
index ab9f2faa..0fbccdc6 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -9,6 +9,7 @@ export(drop_burnin)
 export(extractGraphs)
 export(fit_model)
 export(format_template_data)
+export(get_cluster_fill_counts)
 export(get_clusters_batch)
 export(get_credible_intervals)
 export(get_posterior_probabilities)
diff --git a/R/ClusterModeling_datafunctions.R b/R/ClusterModeling_datafunctions.R
index 2911f45c..bda82492 100644
--- a/R/ClusterModeling_datafunctions.R
+++ b/R/ClusterModeling_datafunctions.R
@@ -69,6 +69,43 @@ format_template_data <- function(template) {
 }
 
 
+#' get_cluster_fill_counts
+#'
+#' `get_cluster_fill_counts()` creates a data frame that shows the number of
+#' graphs in each cluster for each input document.
+#'
+#' @param df A data frame with columns `writer`, `doc`, and `cluster`. Each
+#'   row corresponding to a graph and lists the writer of that graph, the document
+#'   from which the graph was obtained, and the cluster to which that graph is assigned.
+#' @return A dataframe of cluster fill counts for each document in the input data frame.
+#' 
+#' @examples
+#' writer <- c(rep(1, 20), rep(2, 20), rep(3, 20))
+#' docname <- c(rep('doc1',20), rep('doc2', 20), rep('doc3', 20))
+#' doc <- c(rep(1, 20), rep(2, 20), rep(3, 20))
+#' cluster <- sample(3, 60, replace=TRUE)
+#' df <- data.frame(docname, writer, doc, cluster)
+#' get_cluster_fill_counts(df)
+#' 
+#' @export
+#' @md
+get_cluster_fill_counts <- function(df) {
+  docname <- writer <- doc <- cluster <- n <- NULL
+  
+  # count number of graphs in each cluster for each writer
+  cluster_fill_counts <- df %>%
+    dplyr::group_by(docname, writer, doc, cluster) %>%
+    dplyr::summarise(n = dplyr::n()) %>%
+    dplyr::mutate(n = as.integer(n)) %>%
+    tidyr::pivot_wider(names_from = cluster, values_from = n, values_fill = 0)
+  
+  # sort columns
+  cols <- c(colnames(cluster_fill_counts[, c(1, 2, 3)]), sort(as.numeric(colnames(cluster_fill_counts[, -c(1, 2, 3)]))))
+  cluster_fill_counts <- cluster_fill_counts[, cols]
+  
+  return(cluster_fill_counts)
+}
+
 # Internal Functions ------------------------------------------------------
 
 
@@ -201,36 +238,6 @@ format_questioned_data <- function(model, questioned_clusters, writer_indices, d
   return(data)
 }
 
-
-#' get_cluster_fill_counts
-#'
-#' `get_cluster_fill_counts()` creates a data frame that shows the number of
-#' graphs in each cluster for each input document.
-#'
-#' @param df A data frame with columns `writer`, `doc`, and `cluster`. Each
-#'   row corresponding to a graph and lists the writer of that graph, the document
-#'   from which the graph was obtained, and the cluster to which that graph is assigned.
-#' @return A dataframe of cluster fill counts for each document in the input data frame.
-#'
-#' @noRd
-get_cluster_fill_counts <- function(df) {
-  docname <- writer <- doc <- cluster <- n <- NULL
-  
-  # count number of graphs in each cluster for each writer
-  cluster_fill_counts <- df %>%
-    dplyr::group_by(docname, writer, doc, cluster) %>%
-    dplyr::summarise(n = dplyr::n()) %>%
-    dplyr::mutate(n = as.integer(n)) %>%
-    tidyr::pivot_wider(names_from = cluster, values_from = n, values_fill = 0)
-
-  # sort columns
-  cols <- c(colnames(cluster_fill_counts[, c(1, 2, 3)]), sort(as.numeric(colnames(cluster_fill_counts[, -c(1, 2, 3)]))))
-  cluster_fill_counts <- cluster_fill_counts[, cols]
-
-  return(cluster_fill_counts)
-}
-
-
 #' angle
 #'
 #' `angle()` gives a value in (-pi,pi), where negative values come from unit vectors below the x axis (kinda weird/not traditional)
diff --git a/man/get_cluster_fill_counts.Rd b/man/get_cluster_fill_counts.Rd
new file mode 100644
index 00000000..60525609
--- /dev/null
+++ b/man/get_cluster_fill_counts.Rd
@@ -0,0 +1,29 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/ClusterModeling_datafunctions.R
+\name{get_cluster_fill_counts}
+\alias{get_cluster_fill_counts}
+\title{get_cluster_fill_counts}
+\usage{
+get_cluster_fill_counts(df)
+}
+\arguments{
+\item{df}{A data frame with columns \code{writer}, \code{doc}, and \code{cluster}. Each
+row corresponding to a graph and lists the writer of that graph, the document
+from which the graph was obtained, and the cluster to which that graph is assigned.}
+}
+\value{
+A dataframe of cluster fill counts for each document in the input data frame.
+}
+\description{
+\code{get_cluster_fill_counts()} creates a data frame that shows the number of
+graphs in each cluster for each input document.
+}
+\examples{
+writer <- c(rep(1, 20), rep(2, 20), rep(3, 20))
+docname <- c(rep('doc1',20), rep('doc2', 20), rep('doc3', 20))
+doc <- c(rep(1, 20), rep(2, 20), rep(3, 20))
+cluster <- sample(3, 60, replace=TRUE)
+df <- data.frame(docname, writer, doc, cluster)
+get_cluster_fill_counts(df)
+
+}

From 6f54b5a8d86c207b26c2692ee793e53ed2cd1220 Mon Sep 17 00:00:00 2001
From: Stephanie Reinders <reinders.stephanie@gmail.com>
Date: Mon, 12 Feb 2024 09:44:17 -0600
Subject: [PATCH 8/8] Added examples to documentation for `get_clusters_batch`

---
 R/ClusterModeling_clusterassignment.R | 12 +++++++++++-
 man/get_clusters_batch.Rd             | 13 ++++++++++++-
 2 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/R/ClusterModeling_clusterassignment.R b/R/ClusterModeling_clusterassignment.R
index 1330ec1d..dca20a53 100644
--- a/R/ClusterModeling_clusterassignment.R
+++ b/R/ClusterModeling_clusterassignment.R
@@ -27,7 +27,17 @@
 #' @param num_cores Integer number of cores to use for parallel processing
 #'
 #' @return A list of cluster assignments
-#'
+#' 
+#' @examples
+#' \dontrun{
+#' template <- readRDS('path/to/template.rds')
+#' get_clusters_batch(template=template, input_dir='path/to/dir', output_dir='path/to/dir',
+#' writer_indices=c(2,5), doc_indices=c(7,18), num_cores=1)
+#' 
+#' get_clusters_batch(template=template, input_dir='path/to/dir', output_dir='path/to/dir',
+#' writer_indices=c(1,4), doc_indices=c(5,10), num_cores=5)
+#' }
+#' 
 #' @export
 #' @md
 get_clusters_batch <- function(template, input_dir, output_dir, writer_indices, doc_indices, num_cores = 1) {
diff --git a/man/get_clusters_batch.Rd b/man/get_clusters_batch.Rd
index 91f21ea4..acc30b99 100644
--- a/man/get_clusters_batch.Rd
+++ b/man/get_clusters_batch.Rd
@@ -10,7 +10,7 @@ get_clusters_batch(
   output_dir,
   writer_indices,
   doc_indices,
-  num_cores
+  num_cores = 1
 )
 }
 \arguments{
@@ -34,3 +34,14 @@ A list of cluster assignments
 \description{
 get_clusters_batch
 }
+\examples{
+\dontrun{
+template <- readRDS('path/to/template.rds')
+get_clusters_batch(template=template, input_dir='path/to/dir', output_dir='path/to/dir',
+writer_indices=c(2,5), doc_indices=c(7,18), num_cores=1)
+
+get_clusters_batch(template=template, input_dir='path/to/dir', output_dir='path/to/dir',
+writer_indices=c(1,4), doc_indices=c(5,10), num_cores=5)
+}
+
+}