From 9a44f5295fa106448226a7a418c8042db3f9879e Mon Sep 17 00:00:00 2001 From: trekonom Date: Tue, 2 Jan 2024 12:11:46 +0100 Subject: [PATCH] Fix. Refactor docx_comments to (properly) account for comments spanning multiple paragraphs, multiple comments in the same paragraph and replies. Closes #541. Update docs to include a detailed description of the columns in the returned dataframe. --- NEWS.md | 14 + R/docx_comments.R | 79 +++--- man/docx_comments.Rd | 21 +- .../testthat/docs_dir/test-docx_comments.docx | Bin 0 -> 17942 bytes tests/testthat/test-docx-comments.R | 259 ++++++++++++++++++ 5 files changed, 335 insertions(+), 38 deletions(-) create mode 100644 tests/testthat/docs_dir/test-docx_comments.docx diff --git a/NEWS.md b/NEWS.md index 431b65b4..cbcd4db9 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,17 @@ +# officer 0.6.3.9000 + +## Changes + +- the dataframe returned by `docx_comments()` gains a list column `para_id` containing +the id(s) of the parent paragraph(s). A possible breaking change is that the `commented_text` +is now returned as a list column to account for comments spanning multiple runs. +The docs gain a description of the columns of the returned dataframe. + +## Issues + +- Fix. Refactor `docx_comments()` to (properly) account for comments spanning +multiple paragraphs, multiple comments in the same paragraph and replies. Closes #541. + # officer 0.6.3 ## Features diff --git a/R/docx_comments.R b/R/docx_comments.R index b107058f..7add121d 100644 --- a/R/docx_comments.R +++ b/R/docx_comments.R @@ -1,6 +1,22 @@ #' @title Get comments in a Word document as a data.frame #' @description return a data.frame representing the comments in a Word document. #' @param x an rdocx object +#' @details +#' Each row of the returned data frame contains data for one comment. The +#' columns contain the following information: +#' * "comment_id" - unique comment id +#' * "author" - name of the comment author +#' * "initials" - initials of the comment author +#' * "date" - timestamp of the comment +#' * "text" - a list column of characters containing the comment text. Elements can +#' be vectors of length > 1 if a comment contains multiple paragraphs, +#' blocks or runs or of length 0 if the comment is empty. +#' * "para_id" - a list column of characters containing the parent paragraph IDs. +#' Elememts can be vectors of length > 1 if a comment spans multiple paragraphs +#' or of length 0 if the comment has no parent paragraph. +#' * "commented_text" - a list column of characters containing the +#' commented text. Elememts can be vectors of length > 1 if a comment +#' spans multiple paragraphs or runs or of length 0 if the commented text is empty. #' @examples #' bl <- block_list( #' fpar("Comment multiple words."), @@ -10,7 +26,7 @@ #' a_par <- fpar( #' "This paragraph contains", #' run_comment( -#' cmt = bl, +#' cmt = bl, #' run = ftext("a comment."), #' author = "Author Me", #' date = "2023-06-01" @@ -27,19 +43,33 @@ docx_comments <- function(x) { stopifnot(inherits(x, "rdocx")) - comment_nodes <- xml_find_all( - x$doc_obj$get(), "//*[self::w:p/w:commentRangeStart]" + comment_ids <- xml_attr( + xml_find_all( + x$doc_obj$get(), "/w:document/w:body//*[self::w:commentRangeStart]" + ), "id" ) - if (length(comment_nodes) > 0) { - data <- lapply(comment_nodes, comment_as_tibble) - data <- rbind_match_columns(data) - } else { - data <- data.frame( - comment_id = integer(0), - commented_text = character(0) + comment_text_runs <- lapply(comment_ids, \(id) { + xml_find_all( + x$doc_obj$get(), + paste0( + "/w:document/w:body//*[self::w:r[w:t and", + "preceding::w:commentRangeStart[@w:id=\'", id, "\']", + " and ", + "following::w:commentRangeEnd[@w:id=\'", id, "\']]]" + ) ) - } + }) + + data <- data.frame( + comment_id = comment_ids + ) + # Add parent paragraph id + data$para_id <- lapply( + comment_text_runs, + function(x) xml_attr(xml_parent(x), "paraId") + ) + data$commented_text <- lapply(comment_text_runs, xml_text) comments <- xml_find_all(x$comments$get(), "//w:comments/w:comment") @@ -57,30 +87,5 @@ docx_comments <- function(x) { ) data <- merge(out, data, by = "comment_id", all.x = TRUE) - - data -} - -comment_as_tibble <- function(node) { - node_name <- xml_name(node) - name_children <- xml_name(xml_children(node)) - - comment_range <- grep("commentRange", name_children) - - comment_data <- data.frame( - comment_id = xml_attr(xml_child(node, comment_range[[1]]), "id"), - stringsAsFactors = FALSE - ) - comment_range <- seq(comment_range[[1]] + 1, comment_range[[2]] - 1) - comment_data$commented_text <- - paste0( - vapply( - comment_range, - function(x) xml_text(xml_child(node, x)), - character(1) - ), - collapse = "" - ) - - comment_data + data[order(as.integer(data$comment_id)), ] } diff --git a/man/docx_comments.Rd b/man/docx_comments.Rd index e49bd1a6..60662e62 100644 --- a/man/docx_comments.Rd +++ b/man/docx_comments.Rd @@ -12,6 +12,25 @@ docx_comments(x) \description{ return a data.frame representing the comments in a Word document. } +\details{ +Each row of the returned data frame contains data for one comment. The +columns contain the following information: +\itemize{ +\item "comment_id" - unique comment id +\item "author" - name of the comment author +\item "initials" - initials of the comment author +\item "date" - timestamp of the comment +\item "text" - a list column of characters containing the comment text. Elements can +be vectors of length > 1 if a comment contains multiple paragraphs, +blocks or runs or of length 0 if the comment is empty. +\item "para_id" - a list column of characters containing the parent paragraph IDs. +Elememts can be vectors of length > 1 if a comment spans multiple paragraphs +or of length 0 if the comment has no parent paragraph. +\item "commented_text" - a list column of characters containing the +commented text. Elememts can be vectors of length > 1 if a comment +spans multiple paragraphs or runs or of length 0 if the commented text is empty. +} +} \examples{ bl <- block_list( fpar("Comment multiple words."), @@ -21,7 +40,7 @@ bl <- block_list( a_par <- fpar( "This paragraph contains", run_comment( - cmt = bl, + cmt = bl, run = ftext("a comment."), author = "Author Me", date = "2023-06-01" diff --git a/tests/testthat/docs_dir/test-docx_comments.docx b/tests/testthat/docs_dir/test-docx_comments.docx new file mode 100644 index 0000000000000000000000000000000000000000..f4bbb3f2f11e7759d91e67449fe7303ad37bdea4 GIT binary patch literal 17942 zcmaL91z24>voMUiySo%9?(SaP-QC^Y-CLk&ad&rjcXxLvQ2eKT&$*X#{`Ek&FYPY{;AW%7`S z%4e8$itQ(X^-ImH>OVsJIg2ZKln|bs!4{rlJRrFqJ`G(8(%D3}J=|e{ zOem*78EZD{Wew=iJcN~Z zpew}76K>~ycqsQNRaq1Ym<4RpB_$3ceZ1O&uX=Tl$qx7Y7FcOt;X36^R z@-r3{h!N;+4}DAcE+hKahmhjI&o9YU6f-g-scxo21Bk*c-9?B5O^YRH{7O$`l%=D79RY;0!^P_;|SsTB*C$NJiGA&IUd9 zD(Mh~hr3Px)AN4M_H|Ls9G*VII`xtvlQAmW8TL-Sq_fIfLob(&ulX8;OP>?5}d=eJkC&I{nYnX7EGFMFN=PHJh(9jI2j0< zz+=Ln{7Y3dPp{g3L@g}oG*`g;{md=QhE^sV)OnEMH0N^m|5jMYO{wu6Za29*D70=_ zARf8Pg8;0ggbkr;4ywGxUeweS`G&re$4`yDH&A>$ zM8zJkp9V~>gDk;{&rH$hZzQhpgQ|)vh27^%T6+39NkOPev>Hap5U;!h<@|d?a`RxsX6W95JqCi7#!H6q*W#=cN#6bVSPHdJ0#Y{)oxblXA4XIfuUG7*9g5S+YQ&>Arb7p}V1q z_WX|plP13LDDHH+^wZ77M`{*K(Rk};Y^XX#i2h+0;SNS}?LuvZ!Fq+kPV;#$7LOM9 zXFNJ}v(D8LKxWva?B=K0R)_3g)Vo+~#vIgSgJmESR#Y zRppLNHM=p&3NwUG^k-7?F6h|YyX0zB&E*?94Y7DQk1a$p-g&5>|9FF88UTHbQC;x7 zjayi#2qouR*CBH~=w|zVOV;wM<7|%iHwQ7g>|z4mIS2#%-#Cc#&OsL&dqdj)te5X& z-O1{GP58~n|E!%!Qr6vc2toI0MAz+h0vb@Leo`7?R+8|#r{bYlZ-aR1jb5-2Vkr8M zaeI+FtQ%aMJ4c>WC5PyxRCU8*SmY!?eRZFd;^Hp$26~c(;8KuEr=b|CfkpRgx^y2Q zQ3X4O$|QGw432#I7RUUxfvpbSt%!I zFjNadrg9b2G;AbZ^Agg{)4g3ro_N3;18E&EmP?ff}WA6=txtBBU?j4%@wFjn0frWW59FD zLQdPzfZG|KlaeS+zuOEQgz+MRzQcY6F~M*0zK4zR-lUcy71miB8c*zSTl6Vl6d6x} zY_}WEB&S052VoeK`8-(Bc9r`InO<*Q$0f{&=0gNSxiayZRGQWOXDJii3#;M<9?GL96Y^;8>$l*7S&QdgNO4twvHl8RDE~s_S3(w>P;+5*u zt>&C{?#4jLvw&Hu)ANrCbOgl&XjTLOjxUMP(sArE+`_y4 zA@X9^a_#sai5O1r)Wx~Lig!qvhLk>{*d!5DbsUG3B1He@n4KHztz*0(z!t#e9hHz=5tMIkWuzO~p|cr> zQKukwTUq5_mP;ZOhG(rbnBJPZNHw5y(SK0x@%?H~j}?gYb`;8?+<)Dz`c5#lWv&xpaR^%r<)Fb?UUiR$^1lzJg_0s|OkMf{kOrX&=x&q0m53!#>C^0x&Ssf! zaOSvO`NJWF5fu>an$A2IsNPQLx+=B@qPnm+G?m@xe0ScARAlMLTEHbZ>4;?oT7!be z;LblI28hd{%X{Pt2qoQ@;*D9-nhp#OE>g%$AtQkY z+}~x)^bL07tSeX+b}FZ3ne2$TuGX`Xr7KmH_M1<0T9v+b0VEv&ieII4LuEy5^<0AJf{guPu999=O*Bg2RkHU zLsDYUV|%+~Nbs;woVTJ9qclnksSlZlsUu0M^eOTBOG8EF{(>eJ(6EJ)&lHmkI4ou9 z8AmzybW8GNiDq#p&{wKAHs-9a@TUeA_-WUDJf6<&Pd5$n{CNHPoJMc%so*~RdT3R5 zrm9X;blQpoGuVlrSLmsw7G|+b7n{vmj@g;uJn*QbfUyoHvm{pmQJigVfqcWPn(n5; zN0|x?Tyh@;+e`ub#Gr~3AcfVCa^Q4r-2=Mh2XtZb$LZW0;d&AVUz?qd*Nlr6uyfaw zIe0Dy1dpG@+j_<20OxT}xr^K6bR$$!i{OLj416R(%76iC*8wJBU5qCbodB{r7$!*} zvsjUQVI5aa_hV}O0&DUBY9~6@Kfj3t6m{YOq61hA`n)9pILbnfK0bl@gG_Fgb)tQb z-*;3G0hwbm&bcAM`N^f6>TQCAM<-I-!fVZx!wLa2lM1v^2~RlbRRwQ-bPm7|4e_dd z9YI*AS@~4*_P)OXhmaxS)_8qaQmAM@PM-hPV=@0dc?!F}Z+#4n41Z6eW#2Zeztds# zU67yQBHWjC`N9^OB_S}}T2I%5+jc<~E{d=`%rs<*17+ug>sM1DjXpg5`gKF@>dsy{ zyewBF(PdKqbhwDNmog~Xp0ecQU4bXCA`iR8UIbFW_!L0-wD<7TP|$|mo1zGD}RwNRg<@8uMsJFn_|3qf2WDm$kz&pi_r}sU$Ixg3wMp z%(GM7V^yme--gz}hi?cP2g|LAQBF?+fCw=_zy_GB0JnH5{Wb^W(i zmr@S)4r#}9Z08!`pWzyxJu_+?x*b7v$TCpI2sBygfIATZjDG6&LrlQLYDv3{P&hEc z9kUzpY<^*61So7Bf+mJKwFfQx%9Kk`jDb{?Qj|5|Rmm@MyWbvgDHv|$m7R1yrA$-%fnUBXjPXIr1Ohzbe3Ol$Ve~r0W@$JxFEU2ODEs< zXHVX1WipE^TNDx3b4)Q?41xs*TwusR$w(s0YP8+Yn?U*5+>K8OVlSIaX_>*0U(ntl zL!3sGlf!Tjnz^f!?u5G%*m@Ftoy|FJ!Kx33AS54 z1J#n5QAZ~}%mzz9|LayfBLgFEC>=h&_iN*Zy0wI}-Y^B}m+(GmB(m^^cnk?cqE+Ri z(b2q80RW*?giPI_Cyy-n9IFtG04Sh4qym%+qbe}H;F*m8PQ~`O1c=$X6$?; z=Jn!DFh{=B9rgINp-K^A1q6T&ctTjQG6XfRVLYj(tf69XcJ)bf4IzPx35yN4yMlkf=tKKg6OC~X89 zPUUylDPb7#tqt~-FNMdOGrj70>{R8vu-8AWMR z82JntMnowJ%cxBCt9sed9-74ZNtw?Y;oX&U%$?Twe<|HKIZcP3)E>+vH5%=pL99(E zRrERyJ&)VvSxck_Pf+z+l&(h_=X2l%AKMyq0QYD2rNZw_)`_$%h&T)+!)Iz&kFQrn z124J5^Oo>9A)&a*DC;{LsK%=lWylp{r$N?|O?p@9n&u#M^O9kN$N-~=o^J!4xAM`} zW`muN9n37sowNz>Ywr?Z2+gc)4kdgOqI*SaV7A5PDBo*2Bx7?VxGn>2g~b~+FzqQ= zNhA$Bi0FLu7Q}_c>GSrF|J1yA042(?=8wkDq!QxD2G{jFk+oXhXS_gmDdiepZUw5mhL!LA4ajp`UwCo))xpvkBrgnDRs0C zP&l}h43Rw(z9S}mcVI$Li@n0FKyt2B%Mo6S+(terY3yeC<%zuMMoaX*ddM&@)$^=< zZ~5in@?|B4Ti-0DOkQchZRW%(A6Jmsz0PK)PUELn3lLv{u(qU^&Ns7yTKD6Nu?gVY zpNjeHizPI`7(~Bbl~#(F7M4q?3~wmwJhrC;(!V;{+k}J1`qsv3W#X5ErAEmj0QkUc zfyTNCM)oJNe%IGMGzWc!56nCmS z9cx%)EHt{Wwzz;<9d7shTvF}BNM>jv@^x&)Kl;m@aABKeeJ6ieb?bF?&5!x>atq~x z#}>(#$;V?=pT|bU-K4c@<0gRJkoM`9ruv$pb5*<|M^;Umiy}XoiZ9W{Q^HHy)*M<4|8Z)Wn#|_ z4GcRPMu!-Xt-5Uh9BpeJDu$(9@EDH*N@SRfh}wf7GXo4WdZYcMHL1xQ|Q8|7GX-v-kXHADB~LvCm;c z@XD;na?KF=(PxXdRXaFT)=ZZ4==6Lv zVvd5k#X!uiz$Ty_rEKI{(jS>bELGB-Y}nX56d#MGL~p{X7iZSznGq6O^0% zSxVIj#1IB(TN#%yr6*RJD~%Zj5|4PPCw7uPo)P9KRCemcHj0(wSXkkm2s_|w*CD_d zwc`3DDb{i-s~IT_XJ4<;jeZ(Q)~`Auv-~lpP@&2{Vj!P5uq7GBH`BNI%a7PpSoZ)n z5q(kP$=P8{BC>;8PSM!JNkRIdg!qcmiEqOm#xG z09LB=A0_*q9ZBFq=|TY>Nr58sn(87dY4G)43(0NRp7W&F-|X{df13n>Uy99%8CFXP#c;D{t>gA z6H_h~ctCkWN(3lPJmGYz0(HG45FUG`(dp%2on9cfBVv?Psajc_&&Mc9B&_Pj7;4^+ zM7ZQ`sFATJb^aWndYNi1<$KYwh?nHSr`9KqLB&P1x@Ki!Ehq=O#G(xa zknYs~J;kG5TG*UR-<#=r01RTj1W5%EElW^p% z$C%Ppb_o#Q#*XH8j~MV1h322g<4Dtm1|d&dD5~8_rfNWVHL%I0b$`uw-JubgY_A00 z@E$ANEv*UP(-bUdSx;oBT*Q5%mI2-80tiJ+(%UUFvyL98<>TjV!8k#o}orjOy+e_qJbIJB&zEL!ggwq2A=n zG@=3F&YzH>u~p@uYHp<@$Rm=^g5g08gzK>j4A<47Afq9C)avL%;w3Nspdgp8dpMH|EXOL7KloFjkcapM5Q}GF#OO>hGE`G3SxGMP13IQo6 zNW23I0KiE8afSHz1{>*b#mo5Jsi>&;*B+xSY0Y|t0LAw|JL@PPtO7H|0`qFuRHifi zV?1A7oE*$bHWKS~?}5dkRwJboiS)d|eVlP{PhWeNw_Hkl1hm{KiYPKdQUQkq$u`NO zn1NGsX-Kww2W3oZd3O}Pi*NY6tIQGIrr;na%%4$Y%#JkMAb>XWWMRni(XbMNs>0bV zm$kBzUz#c^8s(TB?5U`ZqWN)x8J7)CEk5OcB0elEsicoo;CTx)CNE?R0gQf2uYKl87HdehVn?O^a4h*18@bZ+ut|C*57IZ zqRPHI=!s+Eyg%sK>9%4B1~90l9sQ%1yn(#-)|H0`9r8o!5feg+umO45nBM{MD-_ z(jt~PZHAqaL2SpVeE&1`{#AFu(vT-47Z$Mjn!2LrAdwDdlZN>{NC%OjIeoJ)?{=x`?$7OMDblwQ*8!&sAjDD0b)z1*Qi}Rvf^N6xXvAQO5c+ zIHjYOB(dZM$qW$R*d@p!FkF)@5mN1CiRco*(&UyH#g|Uvl z%%G`lctb-}`+)8Bq8PV(r%%%?WHQ`KYhwE$B-o-M!~pjQbrO={UsTM1UPb1`PU83t z?2f{!RgjG^NFY*Y?e#lA(8(34eqpds+IajjH2}ojX+WfeJ2wbdmp=)9+@$f@!nI^5dJRMs|o83n{i?(5|ipx6aKfBysYW zLW>Q&yrvj6=%%L4b$v-xo~V_dQTcfuntp;m6Y|COqaLjEF<=(spr({y)tmp|n77i5X=dnPQpbTCH$fCFW9_7Ecv(P@BvwFDB^Z&~ zvVUDZ=7HfRVFGoZ(mg`1Scv;vv?Ov!8&_&x~GB7N*5zqE% zd%Vx2J@KVJLf)#*Cc`X+n0I5FGa$SM@Fj|b?CT?y639SZ($oyb1YLSpEkQ~d@c>QL zjUP0|YhqD?c31W5yMx+$g1*m~A`VzAsR1HF7>t50onk86gaVUwUifbU#6}{hYbCB> z?*Zi%0}BN3$t#8I!VtG1rEx}rw2;_f44ub6>*bC+|5$^~*b5vQcI&4=yH277=zDfc z(Gjza7QRl*4O^7RKbpr#t2_V?QeJ1b=E0%nR3GZ$qh{eCPt?yvP0k>3%Ir}*L$ zlMgN~gRKn53TQ|?Eimi#K;fX`)G^XcO8OO7kIT3;G@*3}&he9!azC@-?K()Q zX)$Zcm;k<{mCGE%sb9h1Mj5UY)!qd%^-bWMVrc#V1=6y?yhPz@*OMQ()L7T9x->7q zDZpy1>3K zPu=M2z5%#yA1;l35A|#M-5YQ1eZ;3o2H!h~8 zo;rKbTJwma5al!VGHnZ~J&YF#?UG9cLN>U*d(t^|8+d!0;gghH(6m8LOqHNH*i&~J zh>k4~-V(C#ZYXBy%5Da|@9Qrh95tM{g;>pBgs-DY#U2~S#MkmaODUPnNlbE;op|~L zr==XJFds=MkzC8gD47>wYba5Y5|gqtsO5nzK@oQa~`M|5QnYt7rp1L1EwB(Z&fv!m;*ubdyJWTBW653ChWr`PTcf$7@m%nX z23w%B2XGOrlVYQT_g3(VPK!wk^%?Smw_?i^#WHM#50QUidf4X~UP4s2%)E`ur;yF; zJ3VTdiz$R>8JMo*pV0#Qqv+6nlO@oA>d_vv(Pf0rr>nnsTC38tolnOtZ`K7HOsaP} zhHTy#1aWa497m_{C8oilw1WlZUr6J8@>vh=cFDRck>Cay3p;T= zv?_Fi00n;&x4p&wW7YI+ggbxVipCOrTn+v`TfqOVXxJLr*#5_oS@v}#`a2y;@O{!F z+}8zCFr;J5_jT$nqNw_j*J6c)o3q5z?VWprFf7lD5hM>nLAraA*2|l2a`92MR$TW2dMN zY^tN=AB;`2wdUr4HSzu{7Nq{C4P(&)Vmb!)H*pL*`s zZZD~Auvm0m-oaNym|!7OzkMxPHlST)3^1)(M>*R%GH4{=Fw#yvmU)w-rS!i{7~yH2cV2;qmKrd!5FVNy5kg> zL3pIVkuwm!QBdcbNySOJP7*a8wTbAc%TT_YO@A_-qAi7P;QMI_ZD3J$p3oxE{&lyr zsB)o{Pzw5+D8-j?MMm6Ts;N)O3$Z&IkZjjLED6-f+$_uUN|@@p_)iVXS3m4uS?Di4 z6l;W-!gY%kKUWTi*~n?gd(AD-o# zwY_(O{TpEEEr=!cSY_ec>}=x<)>`Nn|KlL8>cr!ylN}igS1%X3xf>R7%B$R5FUJm! z@A;Z)@t^Zs)G%g((1ihSjb9Z`scq3brYxh;m(swJ9}&tjOvwVC3ldzRdm6V~k|E6?*7k)}E;BGqR;U zJQF8tv9T3Ky7Sp2b~snAm&^r!-G&C^tX3_#%UlSqyo9*)fhyFanAfR)E$-AUtlLs7 zY1kP#6G2TOtlj*iTP8AU-l1U`LP=axX-^16s3KVAzOQPQE)|1!9Zv+-#UX*G+J2M^ zE@Ab>kj_NyR=Q&3n%|?6|4isQVGvyql^zpDv!oztdgBI-lZ}%{z(F^>yV18~&Y&q} z(>szIOtFa=sxg4aU^qmf|jnt1XxU@#X z3JYoh3)1Fynp)JiIumG7x0`S|HiKiW-zidR)TV{RIunggA4K(eF0HRXe|O|1L7_7!sgRZrs!<)0csjIkRluZ z6ax8_b-5Sc^b^-5ff;tY_LVKSoo|=FR{wh;pT}vy-JqZivi&!axWk{z6PNRWprnyz zb-weXux`EIu3UaFx<$x;l3Pp5^(lLy_T`2Fr~|iz1`Oc-2`_ zL!eJACm@cJDIBQ?O?z!o?MZ5LT6!d6w|C8cvuOGmAB#Jp8+stAZup-i**zMwEmrHFGF@r`2Z1IEHnv;IzE|5~2udYn<_oP~zyC+vTE&HAcAW z$dqB2#=E^85)(csh7C78Z5RE(ajZP~U@FVrpi&?2#Ys3|0`azf!KdkqQ-+SM8&YV; z)vP$h4e`Zxi6rZ^Md-Be;MCJ6s$bXVfmZk#%5N@!#`^~INoF)a>|JX*6yt8c~hEjROC zBed+ECuXm9bW_scU5%8@C3M7=4gf!n2=wLWlGy|V3TBe-1{lv5Y|hOV8<7?32?oAZ zZ5Ipa!G~EP**YShcMv(|06K{Ku?l4JC> zsU>BC7$St##03x$(D(I(Y$64Vm4w8FaH$j~MPMD<4Ay&`{XQd7Bh*Db3}WRr13qms)T7k27AXWMtkUAa$?0us(1;K+S48 zpag!_&%U5OiCd?rvV{#+q(Hsho7#}IZtNs{R9;mOU^~GT87xdFGu0kexsvBk&vEXy zp*L*1DP>BeF|sy8I;E`oTi=AKLtiT%w_AYw!K{>+0rra^6q)q)(%2 z1I+~z3BINS4HfgmrAfxK3V`MdfrZcmNOnTte`*&V;(L|#WC6Id#OKNzJPJ%q3@R9_ zi-9!i4+2cu=?C&%z#kXb4=@M-l@0_ln6T@!7#%j!=%45OOPJm!lY8ER~#7WvKf}S}^&%I@3P7fUdo0slfigKrS;?i;S($E05@U zs9ckp4t_SW*_-#O@C05i4$CMsZT3@}Q(o{QX{F3hKjC!tWA2lUy#{ji4Y4|Rk8vF& zyoRqua&P!fS`nQTp@man0Z+7=bFGo?I36w7%tAIDIl+f#zOx8zTyfQp1mAK+j?TWn z-=!yF=~p%gq^?7mxl@5`mm!2U2t2Ms;o`v+5`saa9{mR)QT-tPzJ^A%fclLA5Wn#R z`kx5-@!{MwN^u^ABD7E;TBt0wSGM}2K;6$E^CRfFY4qYe^4|qt+y5?L4l=iZngc^D zh9y$e&?&5MmwgYISbnFt4KhE0{9RB+`&R+3R5nMdfD>Qfu`k5j6nb$Et)i$-VRXAJ zu2(jP`=5eC=-&m5VucS#dB+xa`(w=Q_WBF81fxsK}R$eEp)Km*zBG@xw}tz8=kt z{KcGe?;cVEf*YGhi8#v}B0sLpOtDFGY}96;E_LsyeL4+-6&Sy zHwbSrJV7P#J&ktKXyQxBxtl6vN0#1hxj=p7*gSGQIaVIL8p2dvX26eHOI3A@!Iw)8 z$y_58(xify5iQUqSdAKLYfiJf<0~nQA>zjB?I94(!ymuAd&oHTPco**M5jXSuJ~<* zb4sm-I9f!X?X-Surv0dPbU4p@qp7AIzh2pGuY6#UnCLdi^|1_8yVIDou~(Mf7|?~MM-&Z5Ol`D68YS|vq+~_$i-5s?BHRU+}g^` zVj1`1gT@Sp*i+~gs_I;3kGAMnACg!fqTP8X3%{~L$<L=>a=R0Z`cHDtkb~?1+B(z|i&hE(j{iXHKPXy|*w4yU^G+i@_S58jEOW84u?KF&G&vKb93&@tx>G+Lq-#$oKs6=S^?y*Vl&O85&c^yTgywgvPkecP{zN8c-%0`pe4UyyxpC{W^f zNw9@jJGPx;j5?oF!OL`3neF#qv#DCmack+bqE2+w7Yy*JyYJ=@sGs}i?8-8Ce%$XI zf4Y-bEh?ZY@}CvNPhaDO_FhhiEywT2FE8D`hlDj6v-EIhH*e#a;enK7LoYWS&oJvL zKywKdihYE`jT$4n>vjLuPGc0xJOWFA3pQI%MK>vkvIbmlT)|h^!qtgqW}|rJvas3>l){s+Fmkr$w=GdgL<(M;+?ngMLkNIXZQ!;IC%xPb%?7*40%ME6uVE6gs9RJcbmFCkX>!aLc20J zNF4&)^0UBAlfF;FiARVk!5Z1GV5xZ2 z5<(CApyShxOshOyHZ4fd-u!M`G*H2Hox!M~~(^)M*bYW^o%=R_r_B=06aJ9;#R4=UIhZJ_zg3ey&w-bJ3 z*Kt?E%P9VwxL<{UUT#Fde2fP20;)+^T*k+F8 zpmO8EQ2mx!NDgn91EsgEne36|>UO>vVK&xftw84XwKwg{BenD?F6=Bj)?o}Amb`Iw zz9X~RotQ!B{`c7ClAuA7L6zl&J9#^+gr}DrC0LWxMa~Xe+brISXbqE6VuszY5ib2_ zjBw{^hIlh=8=>TVJI?OW@-E@z^qm=;w|yxBKjVm7z0zm5PY@AO%Bmg@b)TQ2+_Egj zZntVqk4~pcVK)K>d(t0H%RS{2Eq7-L48OaAV5#Nb)3I{yH?sq{WydAmMSPxHucjBR zIvd(uGliK7vz%+Hfi|0phMOfi;m2i5N&XrD?x_1z2i^WP==8O5UP&4Ml@2twNfN{BV@cfh%pb!1=Y+ROUrP8nfyO zZ1}gGWWo>|_crh5IIoQVwpT~}+g|5rYGh?Z`{$efkAYuJ zGG;{xr-S$kSERimI>Gh37PE#`wK()yeTk*|>`rgS;!Gs>{HURegWwq=B%~YyIHesh zBw&(ym?Zm^99npm*bCJ2OQ->Pf)_1H9Xmcjkt>B)(#>!MlsEnOVOtPWCx-0iSFL23gm~cgck+&50Z~+yvL$2L= z`UB!Z`6Ojk1g>eiYjLP=gJG78e$Mn3SOkN69$wWIFuj$Emp{xD<74#7b(kH9^1n`^ z9X1b<=ZrwC$}wjnuM?2hHs8)seP@3&ANE09IC5OZr8@WqP`TrqHxE?5ZlzjAF#+_+ zOLKRvd@;!#Q)g30qqCx3o!X-=uwv};7SuDRT7IMO>*X!xy7p{G-IR4j;^iGoN44*T zwwhMaw=2j&6DAz$tR2_=xFJHpack2pSYOtBNIJokpi_w>!Me{2K+nDrGyw6N`oe;_ zWQdsh;I>wyhkTQ&0NTWjmt^VoyWNVPp~Xcf#h9buiD5|T16sYWdQQ+MCcAnLz~Oy( zTz?*JZpmO3W3&AXSFC7%IlUmq&*{kUx;ef=cW?c??elh(u;Kln39l%;IYOKHdi5UI zQ^E6c^E80%Ng@JnRA_RAsAkK{GB<|j~$p-Rd zJ=k*LVZ2uIEYL&Wg#ma8sEc3vrG;RwGw>3f)Z=Oq&}CAr25Q2zk%BU`nvoHWc>OIL zOJ_~Locj@X8Rn`ipK^2eSps4WVgiM-f8pXI{vd3TF&nDTm_^jQ&K9_*FxM+~63>K7 z%s2#tC7?krjd2-b)wq|9V8nfrs@4z@P}0pe5LX?FTDVf(;wzztQ$=)xrqU_c9T#FF z4Uq(myMx#%+(DvnVsh!t%{`Yg#eJ-_sS=kJff5$w*rgtILu?unwZaFFQ#QeR6g~yF z2cZa`Vo^~ib!`&Zrtbv|k?iq`L2{~IZkOcN$t1Y?UNxBnvbE=)R!K;vH@_v>#*ww9 z7sct@9yQToCs^Y7aGUTZH?j^uUTMH|Y}EL0x4Pe*XXEX%-n(P_osYlOV*DcYW@Rz$ z+uY4V!>$@SO9b{=pj4&f0(WlYSs^5GgW+@sY4U-y9s?$c&;uFo^%gy{O1B#j`GlD5 zEuzwEbPO?q0Vxt!O4S`ncw=BrP*tLfJx`vl1A~bSe3Ea!c|^;RW0~vX=n@IB3>xU1 zKG*=Na*I8So-!0yKGVp&WDR_hFcm=U!h^*lqh-! z2WY0gpr6*5qi;(+2}z{IBv3(1v!IuR1??d&2Y3Lsm}3ui$<1JNTSMXnK>QkNoYCFy zGK$NQ2aYN4V~9!_!i$> zZ#?8HOh~Vsq_J_&DK0)0k+Y`@JnW+CPr5@bnVOM2Bn~ep4qqyUmJCwuCfl0j{Mqp-l&3Rh(s9kj%=y*bNsyAp&QIf0`CUQR z?2g%k1=1>=&6$HhZJVh|eUyY~8+-%J@oQ=u5^lLt<2On%?qafsj>G7ImL;R(#K~%N z+t+MHiMJ`5T9|nqz}1qfyyfdji{8KwGuH9!L|tRxW3x&NohR*G*xw$D&@xi89>vi* zn#(kvl~pG4yY0|6{Tqwdw~MXtU5cVM6aBVnO;;7!B6nA2crN&V%C(OZi}gxIl|Ek& z0UoY?!a%QiY;fj}8t%Kmg&r@bwsX$2rys9pZm)-TscTaGVZ4PCABpxFx*Y&c;DH>? zYVdV@5JKdrr(O&O&(a29dq>v%UFCeiV^Ob^Gw)M;jhC;depkp7B8_wrOA zrgwelZtq-9m_zsxCM~(Nv>gYSB^i5Pf!w%;;W-6bMxW8#OmnF{+BY@yZH79Z#t{aY zl;+-m4pvZWnYU*DXJ!+Mme)VosG0aF44>Ye$g8;jZK?U6;9nObeT5Gu_Vk%CX<)+l zNcWbo3%iK!XCxPp5t3xzaiJp>^;uuf2jY?XZ|<& z$8^o#>9c=Bt{>HV9Ls-8ru}zKA9I)f*^oJi|0CJ+Bl+)M_?QOrJHhF1$n`^dkLLYH zuIR^fr+-)TF|z)5&dA@8>j(DUsQ-U8|C2%T?}|Rg68|2He?zX{<2#rCQPE#v`5$Ze z&6$s}X8-IEIrP73_!xowk^Faue2gdh9X0efc8obWIDf5f=_oBpF4@wcD*Z^-pqCH;&3f4$xxiGMf#qwDOS zefRCtU&J5%jvvW?xBa6w|Ic@CQv6N+Ki;-~6Mvkae=GcdL#|&W`(MQWR{#H9#m9-} ipTcv``&SkJn$~3{LEo1I005--znJ$7Ks)~5cmE%1&IXMD literal 0 HcmV?d00001 diff --git a/tests/testthat/test-docx-comments.R b/tests/testthat/test-docx-comments.R index 6ff56f64..82ce04ac 100644 --- a/tests/testthat/test-docx-comments.R +++ b/tests/testthat/test-docx-comments.R @@ -50,3 +50,262 @@ test_that("add comments", { expect_length(xml_children(comment1), 2) expect_length(xml_children(comment2), 1) }) + +test_that("docx_comments accounts for multiple comments in a paragraph", { + multi_comment_par <- fpar( + "This paragraph", + run_comment( + cmt = block_list( + fpar( + ftext("First Comment.") + ) + ), + run = ftext("contains"), + author = "Author Me", + date = "2023-06-01" + ), + "multiple", + run_comment( + cmt = block_list( + fpar( + ftext("Second Comment.") + ) + ), + run = ftext("comments"), + author = "Author Me", + date = "2023-06-01" + ) + ) + + doc <- read_docx() + doc <- body_add_fpar(doc, value = multi_comment_par, style = "Normal") + docx_file <- print(doc, target = tempfile(fileext = ".docx")) + + comments <- docx_comments(read_docx(docx_file)) + + expect_equal(nrow(comments), 2) + + expect_equal( + comments$text, + list("First Comment.", "Second Comment.") + ) + expect_equal( + comments$commented_text, + list("contains", "comments") + ) +}) + +test_that("docx_comments", { + example_docx <- "docs_dir/test-docx_comments.docx" + doc <- read_docx(path = example_docx) + + comments <- docx_comments(doc) + + # Doc includes 16 comments + expect_equal(nrow(comments), 16) + # No NAs in "commented_text" + expect_true( + all( + vapply( + comments[["commented_text"]], + function(x) all(!is.na(x)), + FUN.VALUE = logical(1) + ) + ) + ) + # Accounts for empty comments or multi line comments + expect_true( + all( + lengths(comments[["text"]][-c(1, 4)]) == 1 + ) + ) + ## Comment 1 has 2 lines + expect_equal( + length(comments[["text"]][[1]]), 2 + ) + ## Comment 4 is empty + expect_identical( + comments[["text"]][[4]], character(0) + ) +}) + +test_that("docx_comments accounts for comments spanning no or multiple paragraphs", { + example_docx <- "docs_dir/test-docx_comments.docx" + doc <- read_docx(path = example_docx) + + comments <- docx_comments(doc) + + expect_true( + all( + lengths(comments[["para_id"]][-c(5, 6)]) == 1 + ) + ) + ## Comment 5 spans no paragraph + expect_identical( + comments[["para_id"]][[5]], character(0) + ) + expect_equal( + paste(comments[["commented_text"]][[5]], collapse = " "), + "" + ) + ## Comment 6 spans 2 paragraphs + expect_equal( + length(comments[["para_id"]][[6]]), 2 + ) + expect_equal( + paste(comments[["commented_text"]][[6]], collapse = " "), + "a comment … … which spans multiple paragraphs." + ) +}) + +test_that("docx_comments accounts for comments spanning no or multiple runs", { + example_docx <- "docs_dir/test-docx_comments.docx" + doc <- read_docx(path = example_docx) + + comments <- docx_comments(doc) + + expect_true( + all( + lengths(comments[["commented_text"]][-c(5, 6, 7, 15)]) == 1 + ) + ) + ## Comment 5 spans no run + expect_identical( + comments[["commented_text"]][[5]], character(0) + ) + expect_equal( + paste(comments[["commented_text"]][[5]], collapse = " "), + "" + ) + ## Comment 6 spans 2 runs as it spans 2 paragraphs + expect_equal( + length(comments[["commented_text"]][[6]]), 2 + ) + expect_equal( + paste(comments[["commented_text"]][[6]], collapse = " "), + "a comment … … which spans multiple paragraphs." + ) + ## Comment 7 spans 3 runs. + expect_equal( + length(comments[["commented_text"]][[7]]), 3 + ) + expect_equal( + paste(comments[["commented_text"]][[7]], collapse = ""), + "spanning multiple runs." + ) + ## Comment 15 spans 3 runs because of an inner comment + expect_equal( + length(comments[["commented_text"]][[15]]), 3 + ) + expect_equal( + paste(comments[["commented_text"]][[15]], collapse = ""), + "This paragraph contains two nested comments." + ) +}) + +test_that("docx_comments accounts for replies", { + example_docx <- "docs_dir/test-docx_comments.docx" + doc <- read_docx(path = example_docx) + + comments <- docx_comments(doc) + + # Make "unique" id based on commented text and paragraph id + comments$unique_id <- paste( + comments$para_id, comments$commented_text, + sep = "." + ) + comments$unique_id <- factor( + comments$unique_id, + levels = unique(comments$unique_id) + ) + comments$unique_id <- as.integer(comments$unique_id) + + comments_split <- split( + comments, + comments$unique_id + ) + + # Accouting for replies we have only 12 comments + expect_equal( + length(comments_split), + 12 + ) + expect_equal( + vapply( + comments_split[-c(8, 9, 10)], nrow, + FUN.VALUE = integer(1), USE.NAMES = FALSE + ), + rep(1, 9) + ) + ## 8 has two replies + expect_equal( + nrow(comments_split[[8]]), + 3 + ) + expect_equal( + unique(unlist(comments[8:10, "commented_text"])), + comments[["commented_text"]][[8]] + ) + expect_equal( + unique(unlist(comments[8:10, "para_id"])), + comments[["para_id"]][[8]] + ) + ## 9 and 10 have one reply each + expect_equal( + nrow(comments_split[[9]]), + 2 + ) + expect_equal( + unique(unlist(comments[11:12, "commented_text"])), + comments[["commented_text"]][[11]] + ) + expect_equal( + unique(unlist(comments[11:12, "para_id"])), + comments[["para_id"]][[11]] + ) + expect_equal( + nrow(comments_split[[10]]), + 2 + ) + expect_equal( + unique(unlist(comments[13:14, "commented_text"])), + comments[["commented_text"]][[13]] + ) + expect_equal( + unique(unlist(comments[13:14, "para_id"])), + comments[["para_id"]][[13]] + ) +}) + +test_that("docx_comments accounts for nested comments", { + example_docx <- "docs_dir/test-docx_comments.docx" + doc <- read_docx(path = example_docx) + + comments <- docx_comments(doc) + + ## Outer Comment 15 spans 3 runs + expect_equal( + length(comments[["commented_text"]][[15]]), 3 + ) + expect_equal( + paste(comments[["commented_text"]][[15]], collapse = ""), + "This paragraph contains two nested comments." + ) + expect_equal( + paste(comments[["text"]][[15]], collapse = ""), + "Outer Comment." + ) + + ## Inner Comment 16 spans 1 run + expect_equal( + length(comments[["commented_text"]][[16]]), 1 + ) + expect_equal( + paste(comments[["commented_text"]][[16]], collapse = ""), + "contains two " + ) + expect_equal( + paste(comments[["text"]][[16]], collapse = ""), + "Inner Comment." + ) +})