Skip to content

Commit

Permalink
Merge pull request #202 from CSAFE-ISU/201-clusters-no-indices
Browse files Browse the repository at this point in the history
Changed writer and doc indices to optional in `get_clusters_batch()`
  • Loading branch information
stephaniereinders authored Nov 6, 2024
2 parents adc58ac + 6009443 commit e15d397
Show file tree
Hide file tree
Showing 17 changed files with 151 additions and 49 deletions.
45 changes: 33 additions & 12 deletions R/cluster_assignment.R
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,10 @@
#' @param input_dir A directory containing graphs created with
#' [`process_batch_dir`]
#' @param output_dir Output directory for cluster assignments
#' @param writer_indices Vector of start and end indices for the writer id in
#' the graph file names
#' @param doc_indices Vector of start and end indices for the document id in the
#' graph file names
#' @param writer_indices Optional. A Vector of start and end indices for the writer id in
#' the graph file names.
#' @param doc_indices Optional. Vector of start and end indices for the document id in the
#' graph file names.
#' @param num_cores Integer number of cores to use for parallel processing
#' @param save_master_file TRUE or FALSE. If TRUE, a master file named
#' 'all_clusters.rds' containing the cluster assignments for all documents in
Expand All @@ -46,7 +46,13 @@
#'
#' @export
#' @md
get_clusters_batch <- function(template, input_dir, output_dir, writer_indices, doc_indices, num_cores = 1, save_master_file = FALSE) {
get_clusters_batch <- function(template,
input_dir,
output_dir,
writer_indices = NULL,
doc_indices = NULL,
num_cores = 1,
save_master_file = FALSE) {
# bind global variables to fix check() note
i <- outliercut <- docname <- NULL

Expand Down Expand Up @@ -120,7 +126,10 @@ get_clusters_batch <- function(template, input_dir, output_dir, writer_indices,
# get cluster assignments
cluster_assign <- sapply(imagesList, makeassignment, templateCenterList = template$centers, outliercut = outliercut)

df <- make_clusters_df(cluster_assign, doc, writer_indices, doc_indices)
df <- make_clusters_df(cluster_assign = cluster_assign,
doc = doc,
writer_indices = writer_indices,
doc_indices = doc_indices)

saveRDS(df, file = outfile)

Expand Down Expand Up @@ -168,7 +177,10 @@ get_clusters_batch <- function(template, input_dir, output_dir, writer_indices,
message(paste("Getting cluster assignments for", doc$docname))
cluster_assign <- sapply(imagesList, makeassignment, templateCenterList = template$centers, outliercut = outliercut)

df <- make_clusters_df(cluster_assign, doc, writer_indices, doc_indices)
df <- make_clusters_df(cluster_assign = cluster_assign,
doc = doc,
writer_indices = writer_indices,
doc_indices = doc_indices)

saveRDS(df, file = outfile)
message(paste("Saving cluster assignments for ", doc$docname, "\n"))
Expand Down Expand Up @@ -411,7 +423,7 @@ MakeLetterListLetterSpecific = function(letterList, dims)
return(letterList)
}

make_clusters_df <- function(cluster_assign, doc, writer_indices, doc_indices) {
make_clusters_df <- function(cluster_assign, doc, writer_indices = NULL, doc_indices = NULL) {
# calculate pc rotation angle and wrapped pc rotation angle
# NOTE: foreach can't find get_pc_rotation unless it is nested in make_clusters_df
get_pc_rotation <- function(x) {
Expand All @@ -426,8 +438,12 @@ make_clusters_df <- function(cluster_assign, doc, writer_indices, doc_indices) {

# add docname, writer, doc, slope, xvar, yvar, and covar
df$docname <- doc$docname
df$writer <- sapply(df$docname, function(x) substr(x, start = writer_indices[1], stop = writer_indices[2]))
df$doc <- sapply(df$docname, function(x) substr(x, start = doc_indices[1], stop = doc_indices[2]), USE.NAMES = FALSE)

if (!is.null(writer_indices) && !is.null(doc_indices)){
df$writer <- sapply(df$docname, function(x) substr(x, start = writer_indices[1], stop = writer_indices[2]))
df$doc <- sapply(df$docname, function(x) substr(x, start = doc_indices[1], stop = doc_indices[2]), USE.NAMES = FALSE)
}

df$slope <- sapply(doc$process$letterList, function(x) x$characterFeatures$slope)
df$xvar <- sapply(doc$process$letterList, function(x) x$characterFeatures$xvar)
df$yvar <- sapply(doc$process$letterList, function(x) x$characterFeatures$yvar)
Expand All @@ -436,7 +452,11 @@ make_clusters_df <- function(cluster_assign, doc, writer_indices, doc_indices) {
df$pc_wrapped <- 2 * df$pc_rotation

# sort columns
df <- df[, c("docname", "writer", "doc", "cluster", "slope", "xvar", "yvar", "covar", "pc_rotation", "pc_wrapped")]
if (!is.null(writer_indices) && !is.null(doc_indices)){
df <- df[, c("docname", "writer", "doc", "cluster", "slope", "xvar", "yvar", "covar", "pc_rotation", "pc_wrapped")]
} else {
df <- df[, c("docname", "cluster", "slope", "xvar", "yvar", "covar", "pc_rotation", "pc_wrapped")]
}
return(df)
}

Expand Down Expand Up @@ -468,7 +488,8 @@ delete_graphs <- function(doc, max_edges = 30){
#' get_clusterassignment
#'
#' An internal function for getting cluster assignments for model or questioned
#' documents. This function runs 'get_clusters_batch'.
#' documents. This function runs 'get_clusters_batch'. Notice that this function
#' requires writer and doc indices even those 'get_clusters_batch' does not.
#'
#' @param main_dir Directory containing a cluster template created with
#' `make_clustering_template`
Expand Down
34 changes: 23 additions & 11 deletions R/cluster_format.R
Original file line number Diff line number Diff line change
Expand Up @@ -88,17 +88,29 @@ format_template_data <- function(template) {
get_cluster_fill_counts <- function(df) {
docname <- writer <- doc <- cluster <- n <- NULL

# count number of graphs in each cluster for each writer
cluster_fill_counts <- df %>%
dplyr::group_by(docname, writer, doc, cluster) %>%
dplyr::summarise(n = dplyr::n()) %>%
dplyr::mutate(n = as.integer(n)) %>%
tidyr::pivot_wider(names_from = cluster, values_from = n, values_fill = 0)

# sort columns
cols <- c(colnames(cluster_fill_counts[, c(1, 2, 3)]), sort(as.numeric(colnames(cluster_fill_counts[, -c(1, 2, 3)]))))
cluster_fill_counts <- cluster_fill_counts[, cols]

if (('writer' %in% colnames(df)) && ('doc' %in% colnames(df))) {
# count number of graphs in each cluster for each writer
cluster_fill_counts <- df %>%
dplyr::group_by(docname, writer, doc, cluster) %>%
dplyr::summarise(n = dplyr::n()) %>%
dplyr::mutate(n = as.integer(n)) %>%
tidyr::pivot_wider(names_from = cluster, values_from = n, values_fill = 0)

# sort columns
cols <- c(colnames(cluster_fill_counts[, c(1, 2, 3)]), sort(as.numeric(colnames(cluster_fill_counts[, -c(1, 2, 3)]))))
cluster_fill_counts <- cluster_fill_counts[, cols]
} else {
cluster_fill_counts <- df %>%
dplyr::group_by(docname, cluster) %>%
dplyr::summarise(n = dplyr::n()) %>%
dplyr::mutate(n = as.integer(n)) %>%
tidyr::pivot_wider(names_from = cluster, values_from = n, values_fill = 0)

# sort columns
cols <- c(colnames(cluster_fill_counts[, c(1)]), sort(as.numeric(colnames(cluster_fill_counts[, -c(1)]))))
cluster_fill_counts <- cluster_fill_counts[, cols]
}

return(cluster_fill_counts)
}

Expand Down
12 changes: 6 additions & 6 deletions man/get_clusters_batch.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
20 changes: 20 additions & 0 deletions tests/testthat/fixtures/processHandwriting/make_fixtures.R
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,23 @@ samples <- paste0("sample", 1:4)
for (sample in samples) {
save_proclist(sample)
}

unlink(tempdir(), recursive = TRUE)
clusters <- get_clusters_batch(
example_cluster_template,
testthat::test_path("fixtures", "processHandwriting", "graphs"),
tempdir())
saveRDS(clusters, testthat::test_path("fixtures", "processHandwriting", "clusters_wo_indices.rds"))
counts <- get_cluster_fill_counts(clusters)
saveRDS(counts, testthat::test_path("fixtures", "processHandwriting", "counts_wo_indices.rds"))

unlink(tempdir(), recursive = TRUE)
clusters <- get_clusters_batch(
template = example_cluster_template,
input_dir = testthat::test_path("fixtures", "processHandwriting", "graphs"),
output_dir = tempdir(),
writer_indices = c(2,5),
doc_indices = c(7,18))
saveRDS(clusters, testthat::test_path("fixtures", "processHandwriting", "clusters.rds"))
counts <- get_cluster_fill_counts(clusters)
saveRDS(counts, testthat::test_path("fixtures", "processHandwriting", "counts.rds"))
File renamed without changes.
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -43,3 +43,27 @@ test_that("get_clusters_batch works in parallel if cluster assignments already e
expected <- readRDS(testthat::test_path("fixtures", "clusters", "clusters.rds"))
expect_identical(actual, expected)
})

test_that("get_clusters_batch works sequentially without writer or doc indices", {
unlink(file.path(tempdir(), "clusters"), recursive = TRUE)
actual <- get_clusters_batch(
example_cluster_template,
testthat::test_path("fixtures", "processHandwriting", "graphs"),
file.path(tempdir(), "clusters"))

expected <- readRDS(testthat::test_path("fixtures", "processHandwriting", "clusters_wo_indices.rds"))

expect_identical(actual, expected)
})

test_that("get_clusters_batch works in parallel without writer or doc indices", {
unlink(file.path(tempdir(), "clusters"), recursive = TRUE)
actual <- get_clusters_batch(
example_cluster_template,
testthat::test_path("fixtures", "processHandwriting", "graphs"),
file.path(tempdir(), "clusters"))

expected <- readRDS(testthat::test_path("fixtures", "processHandwriting", "clusters_wo_indices.rds"))

expect_identical(actual, expected)
})
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -90,3 +90,27 @@ test_that("formatted questioned data is formatted correctly", {
# check cluster labels
expect_gte(min(data$graph_measurements$cluster), 1)
})


# get_cluster_fill_counts -------------------------------------------------

testthat::test_that("Get cluster fill counts works without writer or doc indices", {
clusters <- readRDS(testthat::test_path("fixtures", "processHandwriting", "clusters_wo_indices.rds"))
actual <- get_cluster_fill_counts(clusters)

expected <- readRDS(testthat::test_path("fixtures", "processHandwriting", "counts_wo_indices.rds"))

expect_equal(actual, expected)

})

testthat::test_that("Get cluster fill counts works with writer or doc indices", {
clusters <- readRDS(testthat::test_path("fixtures", "processHandwriting", "clusters.rds"))
actual <- get_cluster_fill_counts(clusters)

expected <- readRDS(testthat::test_path("fixtures", "processHandwriting", "counts.rds"))

expect_equal(actual, expected)

})

File renamed without changes.
File renamed without changes.
Original file line number Diff line number Diff line change
@@ -1,3 +1,24 @@
test_that("processDocument works on sample1.png", {
# use the same fixtures as "processHandwriting works on sample1.png"
actual <- processDocument(testthat::test_path("fixtures", "processHandwriting", "samples", "sample1.png"))
expected <- readRDS(testthat::test_path("fixtures", "processHandwriting", "graphs", "sample1_proclist.rds"))
expect_equal(actual, expected, tolerance = 1e-08)
})

test_that("processDocument works on sample2.png", {
# use the same fixtures as "processHandwriting works on sample2.png"
actual <- processDocument(testthat::test_path("fixtures", "processHandwriting", "samples", "sample2.png"))
expected <- readRDS(testthat::test_path("fixtures", "processHandwriting", "graphs", "sample2_proclist.rds"))
expect_equal(actual, expected, tolerance = 1e-08)
})

test_that("processDocument works on sample3.png", {
# use the same fixtures as "processHandwriting works on sample3.png"
actual <- processDocument(testthat::test_path("fixtures", "processHandwriting", "samples", "sample3.png"))
expected <- readRDS(testthat::test_path("fixtures", "processHandwriting", "graphs", "sample3_proclist.rds"))
expect_equal(actual, expected, tolerance = 1e-08)
})

test_that("processHandwriting works on sample1.png", {
actual <- list()
actual$image <- readPNGBinary(testthat::test_path("fixtures", "processHandwriting", "samples", "sample1.png"))
Expand Down
20 changes: 0 additions & 20 deletions tests/testthat/test-processDocument.R

This file was deleted.

0 comments on commit e15d397

Please sign in to comment.