diff --git a/NEWS.md b/NEWS.md index 431b65b4..cbcd4db9 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,17 @@ +# officer 0.6.3.9000 + +## Changes + +- the dataframe returned by `docx_comments()` gains a list column `para_id` containing +the id(s) of the parent paragraph(s). A possible breaking change is that the `commented_text` +is now returned as a list column to account for comments spanning multiple runs. +The docs gain a description of the columns of the returned dataframe. + +## Issues + +- Fix. Refactor `docx_comments()` to (properly) account for comments spanning +multiple paragraphs, multiple comments in the same paragraph and replies. Closes #541. + # officer 0.6.3 ## Features diff --git a/R/docx_comments.R b/R/docx_comments.R index b107058f..7add121d 100644 --- a/R/docx_comments.R +++ b/R/docx_comments.R @@ -1,6 +1,22 @@ #' @title Get comments in a Word document as a data.frame #' @description return a data.frame representing the comments in a Word document. #' @param x an rdocx object +#' @details +#' Each row of the returned data frame contains data for one comment. The +#' columns contain the following information: +#' * "comment_id" - unique comment id +#' * "author" - name of the comment author +#' * "initials" - initials of the comment author +#' * "date" - timestamp of the comment +#' * "text" - a list column of characters containing the comment text. Elements can +#' be vectors of length > 1 if a comment contains multiple paragraphs, +#' blocks or runs or of length 0 if the comment is empty. +#' * "para_id" - a list column of characters containing the parent paragraph IDs. +#' Elememts can be vectors of length > 1 if a comment spans multiple paragraphs +#' or of length 0 if the comment has no parent paragraph. +#' * "commented_text" - a list column of characters containing the +#' commented text. Elememts can be vectors of length > 1 if a comment +#' spans multiple paragraphs or runs or of length 0 if the commented text is empty. #' @examples #' bl <- block_list( #' fpar("Comment multiple words."), @@ -10,7 +26,7 @@ #' a_par <- fpar( #' "This paragraph contains", #' run_comment( -#' cmt = bl, +#' cmt = bl, #' run = ftext("a comment."), #' author = "Author Me", #' date = "2023-06-01" @@ -27,19 +43,33 @@ docx_comments <- function(x) { stopifnot(inherits(x, "rdocx")) - comment_nodes <- xml_find_all( - x$doc_obj$get(), "//*[self::w:p/w:commentRangeStart]" + comment_ids <- xml_attr( + xml_find_all( + x$doc_obj$get(), "/w:document/w:body//*[self::w:commentRangeStart]" + ), "id" ) - if (length(comment_nodes) > 0) { - data <- lapply(comment_nodes, comment_as_tibble) - data <- rbind_match_columns(data) - } else { - data <- data.frame( - comment_id = integer(0), - commented_text = character(0) + comment_text_runs <- lapply(comment_ids, \(id) { + xml_find_all( + x$doc_obj$get(), + paste0( + "/w:document/w:body//*[self::w:r[w:t and", + "preceding::w:commentRangeStart[@w:id=\'", id, "\']", + " and ", + "following::w:commentRangeEnd[@w:id=\'", id, "\']]]" + ) ) - } + }) + + data <- data.frame( + comment_id = comment_ids + ) + # Add parent paragraph id + data$para_id <- lapply( + comment_text_runs, + function(x) xml_attr(xml_parent(x), "paraId") + ) + data$commented_text <- lapply(comment_text_runs, xml_text) comments <- xml_find_all(x$comments$get(), "//w:comments/w:comment") @@ -57,30 +87,5 @@ docx_comments <- function(x) { ) data <- merge(out, data, by = "comment_id", all.x = TRUE) - - data -} - -comment_as_tibble <- function(node) { - node_name <- xml_name(node) - name_children <- xml_name(xml_children(node)) - - comment_range <- grep("commentRange", name_children) - - comment_data <- data.frame( - comment_id = xml_attr(xml_child(node, comment_range[[1]]), "id"), - stringsAsFactors = FALSE - ) - comment_range <- seq(comment_range[[1]] + 1, comment_range[[2]] - 1) - comment_data$commented_text <- - paste0( - vapply( - comment_range, - function(x) xml_text(xml_child(node, x)), - character(1) - ), - collapse = "" - ) - - comment_data + data[order(as.integer(data$comment_id)), ] } diff --git a/man/docx_comments.Rd b/man/docx_comments.Rd index e49bd1a6..60662e62 100644 --- a/man/docx_comments.Rd +++ b/man/docx_comments.Rd @@ -12,6 +12,25 @@ docx_comments(x) \description{ return a data.frame representing the comments in a Word document. } +\details{ +Each row of the returned data frame contains data for one comment. The +columns contain the following information: +\itemize{ +\item "comment_id" - unique comment id +\item "author" - name of the comment author +\item "initials" - initials of the comment author +\item "date" - timestamp of the comment +\item "text" - a list column of characters containing the comment text. Elements can +be vectors of length > 1 if a comment contains multiple paragraphs, +blocks or runs or of length 0 if the comment is empty. +\item "para_id" - a list column of characters containing the parent paragraph IDs. +Elememts can be vectors of length > 1 if a comment spans multiple paragraphs +or of length 0 if the comment has no parent paragraph. +\item "commented_text" - a list column of characters containing the +commented text. Elememts can be vectors of length > 1 if a comment +spans multiple paragraphs or runs or of length 0 if the commented text is empty. +} +} \examples{ bl <- block_list( fpar("Comment multiple words."), @@ -21,7 +40,7 @@ bl <- block_list( a_par <- fpar( "This paragraph contains", run_comment( - cmt = bl, + cmt = bl, run = ftext("a comment."), author = "Author Me", date = "2023-06-01" diff --git a/tests/testthat/docs_dir/test-docx_comments.docx b/tests/testthat/docs_dir/test-docx_comments.docx new file mode 100644 index 00000000..f4bbb3f2 Binary files /dev/null and b/tests/testthat/docs_dir/test-docx_comments.docx differ diff --git a/tests/testthat/test-docx-comments.R b/tests/testthat/test-docx-comments.R index 6ff56f64..82ce04ac 100644 --- a/tests/testthat/test-docx-comments.R +++ b/tests/testthat/test-docx-comments.R @@ -50,3 +50,262 @@ test_that("add comments", { expect_length(xml_children(comment1), 2) expect_length(xml_children(comment2), 1) }) + +test_that("docx_comments accounts for multiple comments in a paragraph", { + multi_comment_par <- fpar( + "This paragraph", + run_comment( + cmt = block_list( + fpar( + ftext("First Comment.") + ) + ), + run = ftext("contains"), + author = "Author Me", + date = "2023-06-01" + ), + "multiple", + run_comment( + cmt = block_list( + fpar( + ftext("Second Comment.") + ) + ), + run = ftext("comments"), + author = "Author Me", + date = "2023-06-01" + ) + ) + + doc <- read_docx() + doc <- body_add_fpar(doc, value = multi_comment_par, style = "Normal") + docx_file <- print(doc, target = tempfile(fileext = ".docx")) + + comments <- docx_comments(read_docx(docx_file)) + + expect_equal(nrow(comments), 2) + + expect_equal( + comments$text, + list("First Comment.", "Second Comment.") + ) + expect_equal( + comments$commented_text, + list("contains", "comments") + ) +}) + +test_that("docx_comments", { + example_docx <- "docs_dir/test-docx_comments.docx" + doc <- read_docx(path = example_docx) + + comments <- docx_comments(doc) + + # Doc includes 16 comments + expect_equal(nrow(comments), 16) + # No NAs in "commented_text" + expect_true( + all( + vapply( + comments[["commented_text"]], + function(x) all(!is.na(x)), + FUN.VALUE = logical(1) + ) + ) + ) + # Accounts for empty comments or multi line comments + expect_true( + all( + lengths(comments[["text"]][-c(1, 4)]) == 1 + ) + ) + ## Comment 1 has 2 lines + expect_equal( + length(comments[["text"]][[1]]), 2 + ) + ## Comment 4 is empty + expect_identical( + comments[["text"]][[4]], character(0) + ) +}) + +test_that("docx_comments accounts for comments spanning no or multiple paragraphs", { + example_docx <- "docs_dir/test-docx_comments.docx" + doc <- read_docx(path = example_docx) + + comments <- docx_comments(doc) + + expect_true( + all( + lengths(comments[["para_id"]][-c(5, 6)]) == 1 + ) + ) + ## Comment 5 spans no paragraph + expect_identical( + comments[["para_id"]][[5]], character(0) + ) + expect_equal( + paste(comments[["commented_text"]][[5]], collapse = " "), + "" + ) + ## Comment 6 spans 2 paragraphs + expect_equal( + length(comments[["para_id"]][[6]]), 2 + ) + expect_equal( + paste(comments[["commented_text"]][[6]], collapse = " "), + "a comment … … which spans multiple paragraphs." + ) +}) + +test_that("docx_comments accounts for comments spanning no or multiple runs", { + example_docx <- "docs_dir/test-docx_comments.docx" + doc <- read_docx(path = example_docx) + + comments <- docx_comments(doc) + + expect_true( + all( + lengths(comments[["commented_text"]][-c(5, 6, 7, 15)]) == 1 + ) + ) + ## Comment 5 spans no run + expect_identical( + comments[["commented_text"]][[5]], character(0) + ) + expect_equal( + paste(comments[["commented_text"]][[5]], collapse = " "), + "" + ) + ## Comment 6 spans 2 runs as it spans 2 paragraphs + expect_equal( + length(comments[["commented_text"]][[6]]), 2 + ) + expect_equal( + paste(comments[["commented_text"]][[6]], collapse = " "), + "a comment … … which spans multiple paragraphs." + ) + ## Comment 7 spans 3 runs. + expect_equal( + length(comments[["commented_text"]][[7]]), 3 + ) + expect_equal( + paste(comments[["commented_text"]][[7]], collapse = ""), + "spanning multiple runs." + ) + ## Comment 15 spans 3 runs because of an inner comment + expect_equal( + length(comments[["commented_text"]][[15]]), 3 + ) + expect_equal( + paste(comments[["commented_text"]][[15]], collapse = ""), + "This paragraph contains two nested comments." + ) +}) + +test_that("docx_comments accounts for replies", { + example_docx <- "docs_dir/test-docx_comments.docx" + doc <- read_docx(path = example_docx) + + comments <- docx_comments(doc) + + # Make "unique" id based on commented text and paragraph id + comments$unique_id <- paste( + comments$para_id, comments$commented_text, + sep = "." + ) + comments$unique_id <- factor( + comments$unique_id, + levels = unique(comments$unique_id) + ) + comments$unique_id <- as.integer(comments$unique_id) + + comments_split <- split( + comments, + comments$unique_id + ) + + # Accouting for replies we have only 12 comments + expect_equal( + length(comments_split), + 12 + ) + expect_equal( + vapply( + comments_split[-c(8, 9, 10)], nrow, + FUN.VALUE = integer(1), USE.NAMES = FALSE + ), + rep(1, 9) + ) + ## 8 has two replies + expect_equal( + nrow(comments_split[[8]]), + 3 + ) + expect_equal( + unique(unlist(comments[8:10, "commented_text"])), + comments[["commented_text"]][[8]] + ) + expect_equal( + unique(unlist(comments[8:10, "para_id"])), + comments[["para_id"]][[8]] + ) + ## 9 and 10 have one reply each + expect_equal( + nrow(comments_split[[9]]), + 2 + ) + expect_equal( + unique(unlist(comments[11:12, "commented_text"])), + comments[["commented_text"]][[11]] + ) + expect_equal( + unique(unlist(comments[11:12, "para_id"])), + comments[["para_id"]][[11]] + ) + expect_equal( + nrow(comments_split[[10]]), + 2 + ) + expect_equal( + unique(unlist(comments[13:14, "commented_text"])), + comments[["commented_text"]][[13]] + ) + expect_equal( + unique(unlist(comments[13:14, "para_id"])), + comments[["para_id"]][[13]] + ) +}) + +test_that("docx_comments accounts for nested comments", { + example_docx <- "docs_dir/test-docx_comments.docx" + doc <- read_docx(path = example_docx) + + comments <- docx_comments(doc) + + ## Outer Comment 15 spans 3 runs + expect_equal( + length(comments[["commented_text"]][[15]]), 3 + ) + expect_equal( + paste(comments[["commented_text"]][[15]], collapse = ""), + "This paragraph contains two nested comments." + ) + expect_equal( + paste(comments[["text"]][[15]], collapse = ""), + "Outer Comment." + ) + + ## Inner Comment 16 spans 1 run + expect_equal( + length(comments[["commented_text"]][[16]]), 1 + ) + expect_equal( + paste(comments[["commented_text"]][[16]], collapse = ""), + "contains two " + ) + expect_equal( + paste(comments[["text"]][[16]], collapse = ""), + "Inner Comment." + ) +})