Merge pull request #8 from CSAFE-ISU/7-release-handwriterrf-100

7 release handwriterrf 100
CSAFE-ISU · Oct 10, 2024 · 1755783 · 1755783
2 parents 31bf878 + cb43509
commit 1755783
Show file tree

Hide file tree

Showing 67 changed files with 805 additions and 931 deletions.
diff --git a/.Rbuildignore b/.Rbuildignore
@@ -12,3 +12,4 @@
 ^docs$
 ^altdoc$
 ^_quarto$
+^cran-comments\.md$
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,11 +1,11 @@
 Package: handwriterRF
 Type: Package
-Title: What the Package Does (Title Case)
+Title: Handwriting Analysis with Random Forests
 Version: 0.1.0
-Author: Who wrote it
-Maintainer: The package maintainer <yourself@somewhere.net>
-Description: More about what it does (maybe more than one line)
-    Use four spaces when indenting paragraphs within the Description.
+Authors@R: c(person("Iowa State University of Science and Technology on behalf of its Center for Statistics and Applications in Forensic Evidence", role = c("aut", "cph", "fnd")),
+          person("Stephanie", "Reinders", role = c("aut", "cre"), email = "reinders.stephanie@gmail.com"))
+Maintainer: Stephanie Reinders <reinders.stephanie@gmail.com>
+Description: Perform forensic handwriting analysis of two scanned handwritten documents. This package implements the statistical method described by Madeline Johnson and Danica Ommen (2021) <doi:10.1002/sam.11566>. Similarity measures and a random forest produce a score-based likelihood ratio that quantifies the strength of the evidence in favor of 'same writer' or 'different writer.'
 License: GPL (>= 3)
 Encoding: UTF-8
 LazyData: true
@@ -23,9 +23,10 @@ Imports:
     handwriter,
     magrittr,
     purrr,
-    randomForest,
     ranger,
     reshape2,
     tidyr,
     tidyselect
 Config/testthat/edition: 3
+URL: https://github.com/CSAFE-ISU/handwriterRF
+BugReports: https://github.com/CSAFE-ISU/handwriterRF/issues
diff --git a/NAMESPACE b/NAMESPACE
@@ -2,7 +2,6 @@
 
 export("%>%")
 export(calculate_slr)
-export(expand_docnames)
 export(get_cluster_fill_rates)
 export(get_csafe_train_set)
 export(get_distances)

diff --git a/R/clusters.R b/R/clusters.R
@@ -1,29 +1,29 @@
 #' Get Cluster Fill Rates
 #'
 #' Calculate cluster fill rates from a data frame of cluster fill counts created
-#' with 'handwriter::get_cluster_fill_counts'.
+#' with \code{\link[handwriter]{get_cluster_fill_counts}}.
 #'
-#' @param cfc A data frame of cluster fill rates created with
-#'   'handwriter::get_cluster_fill_counts'
+#' @param df A data frame of cluster fill counts created with
+#'   \code{\link[handwriter]{get_cluster_fill_counts}}.
 #'
-#' @return A data frame of cluster fill rates
+#' @return A data frame of cluster fill rates.
 #'
 #' @export
 #'
 #' @examples
-#' rates <- get_cluster_fill_rates(cfc)
+#' rates <- get_cluster_fill_rates(df = cfc)
 #'
-get_cluster_fill_rates <- function(cfc) {
+get_cluster_fill_rates <- function(df) {
   # Prevent note "no visible binding for global variable"
   docname <- writer <- doc <- NULL
 
   # drop label columns and calculate cluster fill rates: each row sums to 1.
-  cfc_clusters_only <- cfc %>%
+  df_clusters_only <- df %>%
     dplyr::ungroup() %>%
     dplyr::select(-docname, -writer, -doc)
-  cfc_clusters_only <- as.matrix(cfc_clusters_only)
-  total_graphs <- rowSums(cfc_clusters_only)
-  cfr <- diag(1 / total_graphs) %*% cfc_clusters_only
+  df_clusters_only <- as.matrix(df_clusters_only)
+  total_graphs <- rowSums(df_clusters_only)
+  cfr <- diag(1 / total_graphs) %*% df_clusters_only
 
   # add missing clusters
   missing_labels <- setdiff(1:40, colnames(cfr))
@@ -45,7 +45,7 @@ get_cluster_fill_rates <- function(cfc) {
   }
 
   # add label columns and total_graphs column
-  cfr <- cbind(cfc[, 1], data.frame(total_graphs = total_graphs), cfr)
+  cfr <- cbind(df[, 1], data.frame(total_graphs = total_graphs), cfr)
 
   return(cfr)
 }
diff --git a/R/data.R b/R/data.R
@@ -2,9 +2,9 @@
 #'
 #' A dataset containing cluster fill counts for 1,200 handwriting samples
 #' from the CSAFE Handwriting Database. The documents were split into graphs
-#' with 'handwriter::process_batch_dir'. The graphs were grouped into clusters
-#' with 'handwriter::get_clusterassignment'. The cluster fill counts were
-#' calculated with 'handwriter::get_cluster_fill_counts'.
+#' with \code{\link[handwriter]{process_batch_dir}}. The graphs were grouped into clusters
+#' with \code{\link[handwriter]{get_clusters_batch}}. The cluster fill counts were
+#' calculated with \code{\link[handwriter]{get_cluster_fill_counts}}.
 #'
 #' @format A data frame with 1200 rows and 41 variables:
 #' \describe{
@@ -18,115 +18,113 @@
 #'   and repetition number of the handwriting sample. There are 12 distinct
 #'   document codes. Each writer has a writing sample for each of the 12 document
 #'   codes.}
-#'   \item{1}{number of graphs in cluster 1}
-#'   \item{2}{number of graphs in cluster 2}
-#'   \item{3}{number of graphs in cluster 3}
-#'   \item{4}{number of graphs in cluster 4}
-#'   \item{5}{number of graphs in cluster 5}
-#'   \item{6}{number of graphs in cluster 6}
-#'   \item{7}{number of graphs in cluster 7}
-#'   \item{8}{number of graphs in cluster 8}
-#'   \item{9}{number of graphs in cluster 9}
-#'   \item{10}{number of graphs in cluster 10}
-#'   \item{11}{number of graphs in cluster 11}
-#'   \item{12}{number of graphs in cluster 12}
-#'   \item{13}{number of graphs in cluster 13}
-#'   \item{14}{number of graphs in cluster 14}
-#'   \item{15}{number of graphs in cluster 15}
-#'   \item{16}{number of graphs in cluster 16}
-#'   \item{17}{number of graphs in cluster 17}
-#'   \item{18}{number of graphs in cluster 18}
-#'   \item{19}{number of graphs in cluster 19}
-#'   \item{20}{number of graphs in cluster 20}
-#'   \item{21}{number of graphs in cluster 21}
-#'   \item{22}{number of graphs in cluster 22}
-#'   \item{23}{number of graphs in cluster 23}
-#'   \item{24}{number of graphs in cluster 24}
-#'   \item{25}{number of graphs in cluster 25}
-#'   \item{26}{number of graphs in cluster 26}
-#'   \item{27}{number of graphs in cluster 27}
-#'   \item{28}{number of graphs in cluster 28}
-#'   \item{29}{number of graphs in cluster 29}
-#'   \item{30}{number of graphs in cluster 30}
-#'   \item{31}{number of graphs in cluster 31}
-#'   \item{32}{number of graphs in cluster 32}
-#'   \item{33}{number of graphs in cluster 33}
-#'   \item{34}{number of graphs in cluster 34}
-#'   \item{35}{number of graphs in cluster 35}
-#'   \item{36}{number of graphs in cluster 36}
-#'   \item{37}{number of graphs in cluster 37}
-#'   \item{38}{number of graphs in cluster 38}
-#'   \item{39}{number of graphs in cluster 39}
-#'   \item{40}{number of graphs in cluster 40}
+#'   \item{1}{The number of graphs in cluster 1}
+#'   \item{2}{The number of graphs in cluster 2}
+#'   \item{3}{The number of graphs in cluster 3}
+#'   \item{4}{The number of graphs in cluster 4}
+#'   \item{5}{The number of graphs in cluster 5}
+#'   \item{6}{The number of graphs in cluster 6}
+#'   \item{7}{The number of graphs in cluster 7}
+#'   \item{8}{The number of graphs in cluster 8}
+#'   \item{9}{The number of graphs in cluster 9}
+#'   \item{10}{The number of graphs in cluster 10}
+#'   \item{11}{The number of graphs in cluster 11}
+#'   \item{12}{The number of graphs in cluster 12}
+#'   \item{13}{The number of graphs in cluster 13}
+#'   \item{14}{The number of graphs in cluster 14}
+#'   \item{15}{The number of graphs in cluster 15}
+#'   \item{16}{The number of graphs in cluster 16}
+#'   \item{17}{The number of graphs in cluster 17}
+#'   \item{18}{The number of graphs in cluster 18}
+#'   \item{19}{The number of graphs in cluster 19}
+#'   \item{20}{The number of graphs in cluster 20}
+#'   \item{21}{The number of graphs in cluster 21}
+#'   \item{22}{The number of graphs in cluster 22}
+#'   \item{23}{The number of graphs in cluster 23}
+#'   \item{24}{The number of graphs in cluster 24}
+#'   \item{25}{The number of graphs in cluster 25}
+#'   \item{26}{The number of graphs in cluster 26}
+#'   \item{27}{The number of graphs in cluster 27}
+#'   \item{28}{The number of graphs in cluster 28}
+#'   \item{29}{The number of graphs in cluster 29}
+#'   \item{30}{The number of graphs in cluster 30}
+#'   \item{31}{The number of graphs in cluster 31}
+#'   \item{32}{The number of graphs in cluster 32}
+#'   \item{33}{The number of graphs in cluster 33}
+#'   \item{34}{The number of graphs in cluster 34}
+#'   \item{35}{The number of graphs in cluster 35}
+#'   \item{36}{The number of graphs in cluster 36}
+#'   \item{37}{The number of graphs in cluster 37}
+#'   \item{38}{The number of graphs in cluster 38}
+#'   \item{39}{The number of graphs in cluster 39}
+#'   \item{40}{The number of graphs in cluster 40}
 #' }
 #' @source <https://forensicstats.org/handwritingdatabase/>
 "cfc"
 
 #' Cluster Fill Rates for 1200 CSAFE Handwriting Database Samples
 #'
-#' A dataset containing cluster fill rates for for 1,200 handwriting samples from
-#' the CSAFE Handwriting Database. The dataset was created by running
-#' 'get_cluster_fill_rates' on the cluster fill counts data frame 'cfc'. Cluster
-#' fill rates are calculated by calculating the proportion of total graphs assigned
-#' to each cluster.
+#' A dataset containing cluster fill rates for 1,200 handwriting samples
+#' from the CSAFE Handwriting Database. The dataset was created by running
+#' \code{\link{get_cluster_fill_rates}} on the cluster
+#' fill counts data frame cfc. Cluster fill rates are the proportion of total
+#' graphs assigned to each cluster.
 #'
 #' @format A data frame with 1200 rows and 42 variables:
 #' \describe{
 #'   \item{docname}{file name of the handwriting sample}
-#'   \item{total_graphs}{the total number of graphs in the handwriting sample}
-#'   \item{cluster1}{number of graphs in cluster 1}
-#'   \item{cluster2}{number of graphs in cluster 2}
-#'   \item{cluster3}{number of graphs in cluster 3}
-#'   \item{cluster4}{number of graphs in cluster 4}
-#'   \item{cluster5}{number of graphs in cluster 5}
-#'   \item{cluster6}{number of graphs in cluster 6}
-#'   \item{cluster7}{number of graphs in cluster 7}
-#'   \item{cluster8}{number of graphs in cluster 8}
-#'   \item{cluster9}{number of graphs in cluster 9}
-#'   \item{cluster10}{number of graphs in cluster 10}
-#'   \item{cluster11}{number of graphs in cluster 11}
-#'   \item{cluster12}{number of graphs in cluster 12}
-#'   \item{cluster13}{number of graphs in cluster 13}
-#'   \item{cluster14}{number of graphs in cluster 14}
-#'   \item{cluster15}{number of graphs in cluster 15}
-#'   \item{cluster16}{number of graphs in cluster 16}
-#'   \item{cluster17}{number of graphs in cluster 17}
-#'   \item{cluster18}{number of graphs in cluster 18}
-#'   \item{cluster19}{number of graphs in cluster 19}
-#'   \item{cluster20}{number of graphs in cluster 20}
-#'   \item{cluster21}{number of graphs in cluster 21}
-#'   \item{cluster22}{number of graphs in cluster 22}
-#'   \item{cluster23}{number of graphs in cluster 23}
-#'   \item{cluster24}{number of graphs in cluster 24}
-#'   \item{cluster25}{number of graphs in cluster 25}
-#'   \item{cluster26}{number of graphs in cluster 26}
-#'   \item{cluster27}{number of graphs in cluster 27}
-#'   \item{cluster28}{number of graphs in cluster 28}
-#'   \item{cluster29}{number of graphs in cluster 29}
-#'   \item{cluster30}{number of graphs in cluster 30}
-#'   \item{cluster31}{number of graphs in cluster 31}
-#'   \item{cluster32}{number of graphs in cluster 32}
-#'   \item{cluster33}{number of graphs in cluster 33}
-#'   \item{cluster34}{number of graphs in cluster 34}
-#'   \item{cluster35}{number of graphs in cluster 35}
-#'   \item{cluster36}{number of graphs in cluster 36}
-#'   \item{cluster37}{number of graphs in cluster 37}
-#'   \item{cluster38}{number of graphs in cluster 38}
-#'   \item{cluster39}{number of graphs in cluster 39}
-#'   \item{cluster40}{number of graphs in cluster 40}
+#'   \item{total_graphs}{The total number of graphs in the handwriting sample}
+#'   \item{cluster1}{The proportion of graphs in cluster 1}
+#'   \item{cluster2}{The proportion of graphs in cluster 2}
+#'   \item{cluster3}{The proportion of graphs in cluster 3}
+#'   \item{cluster4}{The proportion of graphs in cluster 4}
+#'   \item{cluster5}{The proportion of graphs in cluster 5}
+#'   \item{cluster6}{The proportion of graphs in cluster 6}
+#'   \item{cluster7}{The proportion of graphs in cluster 7}
+#'   \item{cluster8}{The proportion of graphs in cluster 8}
+#'   \item{cluster9}{The proportion of graphs in cluster 9}
+#'   \item{cluster10}{The proportion of graphs in cluster 10}
+#'   \item{cluster11}{The proportion of graphs in cluster 11}
+#'   \item{cluster12}{The proportion of graphs in cluster 12}
+#'   \item{cluster13}{The proportion of graphs in cluster 13}
+#'   \item{cluster14}{The proportion of graphs in cluster 14}
+#'   \item{cluster15}{The proportion of graphs in cluster 15}
+#'   \item{cluster16}{The proportion of graphs in cluster 16}
+#'   \item{cluster17}{The proportion of graphs in cluster 17}
+#'   \item{cluster18}{The proportion of graphs in cluster 18}
+#'   \item{cluster19}{The proportion of graphs in cluster 19}
+#'   \item{cluster20}{The proportion of graphs in cluster 20}
+#'   \item{cluster21}{The proportion of graphs in cluster 21}
+#'   \item{cluster22}{The proportion of graphs in cluster 22}
+#'   \item{cluster23}{The proportion of graphs in cluster 23}
+#'   \item{cluster24}{The proportion of graphs in cluster 24}
+#'   \item{cluster25}{The proportion of graphs in cluster 25}
+#'   \item{cluster26}{The proportion of graphs in cluster 26}
+#'   \item{cluster27}{The proportion of graphs in cluster 27}
+#'   \item{cluster28}{The proportion of graphs in cluster 28}
+#'   \item{cluster29}{The proportion of graphs in cluster 29}
+#'   \item{cluster30}{The proportion of graphs in cluster 30}
+#'   \item{cluster31}{The proportion of graphs in cluster 31}
+#'   \item{cluster32}{The proportion of graphs in cluster 32}
+#'   \item{cluster33}{The proportion of graphs in cluster 33}
+#'   \item{cluster34}{The proportion of graphs in cluster 34}
+#'   \item{cluster35}{The proportion of graphs in cluster 35}
+#'   \item{cluster36}{The proportion of graphs in cluster 36}
+#'   \item{cluster37}{The proportion of graphs in cluster 37}
+#'   \item{cluster38}{The proportion of graphs in cluster 38}
+#'   \item{cluster39}{The proportion of graphs in cluster 39}
+#'   \item{cluster40}{The proportion of graphs in cluster 40}
 #' }
 #' @source <https://forensicstats.org/handwritingdatabase/>
 "cfr"
 
 #' Cluster Template with 40 Clusters
 #'
-#' A cluster template created by 'handwriter' with K=40
-#' clusters. This template was created from 100 handwriting samples from the
-#' CSAFE Handwriting Database. This template is suitable for casework.
+#' A cluster template created by \pkg{handwriter} with 40 clusters. This template
+#' was created from 120 handwriting samples from the CSAFE Handwriting Database.
 #'
-#' 'handwriter' splits handwriting samples into component shapes
-#' called *graphs*. The graphs are sorted into 40 clusters with a K-Means
-#' algorithm. See 'handwriter' for more details.
+#' \pkg{handwriter} splits handwriting samples into component shapes called
+#' graphs. The graphs are sorted into 40 clusters with a K-Means algorithm.
 #'
 #' @format A list containing the contents of the cluster template.
 #' \describe{
@@ -150,7 +148,7 @@
 #' \item{wcd}{The within cluster
 #'   distances on the final iteration of the K-means algorithm. More specifically,
 #'   the distance between each graph and the center of the cluster to which it
-#'   was assigned  on each iteration. The output of 'handwriter::make_clustering_template' stores
+#'   was assigned on each iteration. The output of \code{\link[handwriter]{make_clustering_template}} stores
 #'   the within cluster distances on each iteration, but the previous iterations were removed here to reduce the file size.}
 #' \item{wcss}{A vector of the
 #'   within-cluster sum of squares on each iteration of the K-means algorithm.}}
@@ -169,28 +167,28 @@
 "templateK40"
 
 
-#' A 'ranger' Random Forest, Distances, and Densities
+#' A \pkg{ranger} Random Forest, Distances, and Densities
 #'
-#' A list that contains a trained random forest created with 'ranger', the data
+#' A list that contains a trained random forest created with \pkg{ranger}, the data
 #' frame of distances used to train the random forest, and two densities
 #' obtained from the random forest.
 #'
 #' @format A list with the following components:
 #' \describe{
-#' \item{rf}{A random forest created with 'ranger' with settings:
-#' importance = "permutation", scale.permutation.importance = TRUE, and num.trees = 200.}
 #' \item{dists}{The data frame used to train the random forest. The data frame has
 #' 600 rows. Each row contains the absolute and Euclidean distances between the
 #' cluster fill rates of two handwriting samples. If both handwriting samples are
 #' from the same writer, the class is 'same'. If the handwriting samples are from
 #' different writers, the class is 'different'. There are 300 'same' distances and
 #' 300 'different' distances in the data frame.}
+#' \item{rf}{A random forest created with \pkg{ranger} with settings:
+#' importance = 'permutation', scale.permutation.importance = TRUE, and num.trees = 200.}
 #' \item{densities}{A similarity score was obtained for each pair of handwriting samples in the
 #' training data frame, dists, by calculating the proportion of decision trees that voted 'same'
-#' class for the pair. The 'same_writer' density was created by applying the 'density' function
+#' class for the pair. The 'same_writer' density was created by applying \code{\link[stats]{density}}
 #' to the similarity scores for the 300 same writer pairs in dists. Similarly, the 'diff_writer'
-#' density was created by applying the 'density' function to the similarity scores for the 300
-#' different writer pairs in dists. The default settings were used with the 'density' function.}
+#' density was created by applying the \code{\link[stats]{density}} function to the similarity scores for the 300
+#' different writer pairs in dists. The default settings were used with \code{\link[stats]{density}}.}
 #' }
 #'
 #' @examples