From a2fcea314db2033c5475f4593d2d07141270bcd0 Mon Sep 17 00:00:00 2001 From: massimoaria Date: Tue, 12 Mar 2024 15:22:54 +0100 Subject: [PATCH 1/3] biblioshiny: improved author name merging algorithm --- inst/biblioshiny/server.R | 33 +++++--------------- inst/biblioshiny/utils.R | 64 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 72 insertions(+), 25 deletions(-) diff --git a/inst/biblioshiny/server.R b/inst/biblioshiny/server.R index 9acf7d8..037d306 100644 --- a/inst/biblioshiny/server.R +++ b/inst/biblioshiny/server.R @@ -239,11 +239,7 @@ To ensure the functionality of Biblioshiny, M <- convert2df(D, dbsource = input$dbsource, format = format(D)) - if (input$authorName=="AF"){ - M <- M %>% - rename(AU_IN = .data$AU, - AU = .data$AF) - } + M <- authorNameFormat(M, input$authorName) }) }, ### WoS Txt/Bib Files @@ -253,11 +249,7 @@ To ensure the functionality of Biblioshiny, M <- convert2df(inFile$datapath, dbsource = input$dbsource, format = format(inFile$datapath)) - if (input$authorName=="AF"){ - M <- M %>% - rename(AU_IN = .data$AU, - AU = .data$AF) - } + M <- authorNameFormat(M, input$authorName) }) }) }, @@ -271,11 +263,8 @@ To ensure the functionality of Biblioshiny, M <- convert2df(D, dbsource = input$dbsource, format = format(D)) - if (input$authorName=="AF"){ - M <- M %>% - rename(AU_IN = .data$AU, - AU = .data$AF) - } + M <- authorNameFormat(M, input$authorName) + M <- AuthorNameMerge(M) }) }, ### Scopus CSV/Bib Files @@ -285,11 +274,8 @@ To ensure the functionality of Biblioshiny, M <- convert2df(inFile$datapath, dbsource = input$dbsource, format = "csv") - if (input$authorName=="AF"){ - M <- M %>% - rename(AU_IN = .data$AU, - AU = .data$AF) - } + M <- authorNameFormat(M, input$authorName) + M <- AuthorNameMerge(M) }) }, bib = { @@ -298,11 +284,8 @@ To ensure the functionality of Biblioshiny, M <- convert2df(inFile$datapath, dbsource = input$dbsource, format = "bibtex") - if (input$authorName=="AF"){ - M <- M %>% - rename(AU_IN = .data$AU, - 
AU = .data$AF) - } + M <- authorNameFormat(M, input$authorName) + M <- AuthorNameMerge(M) }) }) }, diff --git a/inst/biblioshiny/utils.R b/inst/biblioshiny/utils.R index d729e29..42baa5f 100644 --- a/inst/biblioshiny/utils.R +++ b/inst/biblioshiny/utils.R @@ -1,5 +1,105 @@ ### COMMON FUNCTIONS ####
+# Use full author names (AF) as the working author field (AU) on request.
+#
+# M:      bibliographic data frame, as returned by convert2df().
+# format: author-name format chosen in the UI; only "AF" triggers a change.
+#
+# Returns M. When format is "AF" and M has an AF column, the original AU
+# column is kept as AU_IN and AF is renamed to AU; otherwise M is unchanged.
+authorNameFormat <- function(M, format) {
+  # isTRUE() + && keep the condition a single TRUE/FALSE even when `format`
+  # is NULL or has length != 1, where `==` with `&` would make if() fail.
+  if (isTRUE(format == "AF") && "AF" %in% names(M)) {
+    # Columns are selected by name; `.data$` inside rename() is deprecated
+    # in tidyselect.
+    M <- M %>%
+      rename(AU_IN = "AU", AU = "AF")
+  }
+  M
+}
+
+# Split one author string of the form "Name (ID); Name (ID); ..." into a
+# data frame with one row per author.
+#
+# input_str: a single string with authors separated by "; ".
+# UT:        identifier of the record, repeated on every output row.
+#
+# Returns a data frame with columns Texts (the text before " ("), Numbers
+# (the digits inside the last "(...)" group, NA when there are none) and UT.
+split_text_numbers <- function(input_str, UT) {
+  # Split the string into components based on "; "
+  components <- unlist(strsplit(input_str, "; ", fixed = TRUE))
+
+  # An empty string has no components: return an empty frame instead of
+  # letting data.frame() fail on columns of length 0 and 1.
+  if (length(components) == 0) {
+    return(data.frame(Texts = character(0), Numbers = numeric(0), UT = UT[0]))
+  }
+
+  # Text part: drop everything from the first " (" onwards
+  texts <- sub("\\s\\(.*$", "", components)
+
+  # Numeric part: digits inside parentheses. Components without such a group
+  # stay NA rather than being passed to as.numeric() (NA plus a warning).
+  id_pattern <- ".*\\((\\d+)\\).*"
+  has_id <- grepl(id_pattern, components)
+  numbers <- rep(NA_real_, length(components))
+  numbers[has_id] <- as.numeric(sub(id_pattern, "\\1", components[has_id]))
+
+  data.frame(Texts = texts, Numbers = numbers, UT = UT)
+}
+
+# Harmonise author names across records using the numeric author ID.
+#
+# M: bibliographic data frame with an AU column holding "Name (ID); ..."
+#    strings and a UT column identifying each record.
+#
+# Returns M with the original AU renamed to AU_original and two new columns:
+# AU (";"-separated names, each ID mapped to its most frequent spelling) and
+# AU_ID (";"-separated IDs). Records without authors get NA in both.
+AuthorNameMerge <- function(M) {
+  stopifnot(all(c("AU", "UT") %in% names(M)))
+
+  # Parse only records with a non-missing, non-empty author string;
+  # which() yields no indices for a zero-row M, unlike 1:nrow(M).
+  rows <- which(!is.na(M$AU) & nzchar(M$AU))
+  df_list <- lapply(rows, function(i) split_text_numbers(M$AU[i], M$UT[i]))
+  df <- if (length(df_list) > 0) {
+    do.call(rbind, df_list)
+  } else {
+    data.frame(Texts = character(0), Numbers = numeric(0), UT = M$UT[0])
+  }
+
+  # For each ID, pick the spelling that occurs most often
+  AU <- df %>%
+    group_by(.data$Numbers, .data$Texts) %>%
+    count() %>%
+    group_by(.data$Numbers) %>%
+    arrange(desc(.data$n)) %>%
+    mutate(AU = .data$Texts[1]) %>%
+    ungroup() %>%
+    select(-"n", -"Texts") %>%
+    distinct()
+
+  df <- df %>%
+    left_join(AU, by = "Numbers") %>%
+    # Authors without an ID all share Numbers == NA; keep their own name
+    # instead of merging them into one spelling.
+    mutate(AU = ifelse(is.na(.data$Numbers), .data$Texts, .data$AU)) %>%
+    group_by(.data$UT) %>%
+    summarize(
+      AU = paste0(.data$AU, collapse = ";"),
+      # "%.0f" avoids scientific notation (e.g. "1e+10") in the ID string
+      AU_ID = paste0(sprintf("%.0f", .data$Numbers), collapse = ";")
+    )
+
+  M %>%
+    rename(AU_original = "AU") %>%
+    left_join(df, by = "UT")
+}
+
 getFileNameExtension <- function (fn) { # remove a path splitted <-
strsplit(x=fn, split='/')[[1]] From 85a965eeab8e40f327cc48a778971c08f6d72bff Mon Sep 17 00:00:00 2001 From: Massimo Aria Date: Tue, 12 Mar 2024 16:24:49 +0100 Subject: [PATCH 2/3] Update server.R --- inst/biblioshiny/server.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/inst/biblioshiny/server.R b/inst/biblioshiny/server.R index 037d306..81d6a2d 100644 --- a/inst/biblioshiny/server.R +++ b/inst/biblioshiny/server.R @@ -264,7 +264,7 @@ To ensure the functionality of Biblioshiny, dbsource = input$dbsource, format = format(D)) M <- authorNameFormat(M, input$authorName) - M <- AuthorNameMerge(M) + if (format(D)=="csv") M <- AuthorNameMerge(M) }) }, ### Scopus CSV/Bib Files @@ -285,7 +285,7 @@ To ensure the functionality of Biblioshiny, dbsource = input$dbsource, format = "bibtex") M <- authorNameFormat(M, input$authorName) - M <- AuthorNameMerge(M) + #M <- AuthorNameMerge(M) }) }) }, From 8178a090d19929a6f8e5d67386df5f427cd6a038 Mon Sep 17 00:00:00 2001 From: massimoaria Date: Tue, 12 Mar 2024 15:22:54 +0100 Subject: [PATCH 3/3] biblioshiny: improved author name merging algorithm --- R/convert2df.R | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/R/convert2df.R b/R/convert2df.R index f5b5f23..33f764e 100644 --- a/R/convert2df.R +++ b/R/convert2df.R @@ -229,7 +229,12 @@ convert2df<-function(file,dbsource="wos",format="plaintext", remove.duplicates=T id_field <- "UT" }, scopus={ - id_field <- "UT" + if (format=="csv"){ + id_field <- "UT" + } else { + id_field <- "TI" + } + }, openalex={ id_field <- "id_oa"