From a2fcea314db2033c5475f4593d2d07141270bcd0 Mon Sep 17 00:00:00 2001
From: massimoaria <aria@unina.it>
Date: Tue, 12 Mar 2024 15:22:54 +0100
Subject: [PATCH 1/3] biblioshiny: improved author name merging algorithm

---
 inst/biblioshiny/server.R | 33 +++++---------------
 inst/biblioshiny/utils.R  | 64 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 72 insertions(+), 25 deletions(-)

diff --git a/inst/biblioshiny/server.R b/inst/biblioshiny/server.R
index 9acf7d8..037d306 100644
--- a/inst/biblioshiny/server.R
+++ b/inst/biblioshiny/server.R
@@ -239,11 +239,7 @@ To ensure the functionality of Biblioshiny,
                                   M <- convert2df(D,
                                                   dbsource = input$dbsource,
                                                   format = format(D))
-                                  if (input$authorName=="AF"){
-                                    M <- M %>% 
-                                      rename(AU_IN = .data$AU,
-                                             AU = .data$AF)
-                                  }
+                                  M <- authorNameFormat(M, input$authorName)
                                 })
                  },
                  ### WoS Txt/Bib Files
@@ -253,11 +249,7 @@ To ensure the functionality of Biblioshiny,
                                   M <- convert2df(inFile$datapath,
                                                   dbsource = input$dbsource,
                                                   format = format(inFile$datapath))
-                                  if (input$authorName=="AF"){
-                                    M <- M %>% 
-                                      rename(AU_IN = .data$AU,
-                                             AU = .data$AF)
-                                  }
+                                  M <- authorNameFormat(M, input$authorName)
                                 })
                  })
         },
@@ -271,11 +263,8 @@ To ensure the functionality of Biblioshiny,
                                   M <- convert2df(D,
                                                   dbsource = input$dbsource,
                                                   format = format(D))
-                                  if (input$authorName=="AF"){
-                                    M <- M %>% 
-                                      rename(AU_IN = .data$AU,
-                                             AU = .data$AF)
-                                  }
+                                  M <- authorNameFormat(M, input$authorName)
+                                  M <- AuthorNameMerge(M)
                                 })
                  },
                  ### Scopus CSV/Bib Files
@@ -285,11 +274,8 @@ To ensure the functionality of Biblioshiny,
                                   M <- convert2df(inFile$datapath,
                                                   dbsource = input$dbsource,
                                                   format = "csv")
-                                  if (input$authorName=="AF"){
-                                    M <- M %>% 
-                                      rename(AU_IN = .data$AU,
-                                             AU = .data$AF)
-                                  }
+                                  M <- authorNameFormat(M, input$authorName)
+                                  M <- AuthorNameMerge(M)
                                 })
                  },
                  bib = {
@@ -298,11 +284,8 @@ To ensure the functionality of Biblioshiny,
                                   M <- convert2df(inFile$datapath,
                                                   dbsource = input$dbsource,
                                                   format = "bibtex")
-                                  if (input$authorName=="AF"){
-                                    M <- M %>% 
-                                      rename(AU_IN = .data$AU,
-                                             AU = .data$AF)
-                                  }
+                                  M <- authorNameFormat(M, input$authorName)
+                                  M <- AuthorNameMerge(M)
                                 })
                  })
         },
diff --git a/inst/biblioshiny/utils.R b/inst/biblioshiny/utils.R
index d729e29..42baa5f 100644
--- a/inst/biblioshiny/utils.R
+++ b/inst/biblioshiny/utils.R
@@ -1,5 +1,69 @@
 ### COMMON FUNCTIONS ####
 
+# If `format` is "AF" and M has an AF column, rename AU -> AU_IN and AF -> AU;
+# otherwise return M unchanged. Columns are named by string (not `.data$`).
+authorNameFormat <- function(M, format){
+  if (isTRUE(format == "AF") && "AF" %in% names(M)){
+    M <- M %>% rename(AU_IN = "AU", AU = "AF")
+  }
+  M
+}
+
+# Split one "; "-separated author string into a data frame with one row per
+# author: Texts (name before " ("), Numbers (digits in parentheses), UT (id).
+split_text_numbers <- function(input_str, UT) {
+  # Split the string into components based on "; "
+  components <- unlist(strsplit(input_str, "; ", fixed = TRUE))
+
+  # Extract the text (gsub is vectorised): drop everything from the first " ("
+  texts <- gsub("\\s\\(.*$", "", components)
+
+  # Extract the digits inside parentheses; components without a "(digits)"
+  # group get NA directly, avoiding "NAs introduced by coercion" warnings
+  id_pattern <- ".*\\((\\d+)\\).*"
+  has_id <- grepl(id_pattern, components)
+  numbers <- rep(NA_real_, length(components))
+  numbers[has_id] <- as.numeric(gsub(id_pattern, "\\1", components[has_id]))
+
+  # rep() keeps the result well-formed (zero rows) when input_str is empty
+  data.frame(Texts = texts, Numbers = numbers,
+             UT = rep(UT, length.out = length(components)))
+}
+
+
+# Harmonise author names via the numeric id in "Name (id)" entries of M$AU:
+# each id gets the name most often attached to it. Returns M with AU (merged),
+# AU_ID (";"-joined ids) and AU_original (the input AU), joined on UT.
+AuthorNameMerge <- function(M){
+  # which() copes with zero rows and skips NA/empty author strings
+  rows <- which(!is.na(M$AU) & nzchar(M$AU))
+  df_list <- lapply(rows, function(i) split_text_numbers(M$AU[i], M$UT[i]))
+  df <- do.call(rbind, df_list)
+  if (is.null(df)) return(M)  # no author strings to merge: M is left as is
+
+  # Most frequent name per author id
+  AU <- df %>% 
+    group_by(.data$Numbers, .data$Texts) %>% 
+    count() %>% 
+    group_by(.data$Numbers) %>%
+    arrange(desc(.data$n)) %>% 
+    mutate(AU = .data$Texts[1]) %>% 
+    select(-"n", - "Texts") %>% 
+    ungroup() %>% 
+    distinct()
+
+  df <- df %>% 
+    left_join(AU, by = "Numbers") %>% 
+    # authors without an id keep their own name instead of sharing one
+    mutate(AU = ifelse(is.na(.data$Numbers), .data$Texts, .data$AU)) %>%
+    group_by(.data$UT) %>% 
+    # "%.0f" keeps large ids out of scientific notation (e.g. "5.72e+10")
+    summarize(AU = paste0(.data$AU, collapse=";"),
+              AU_ID = paste0(sprintf("%.0f", .data$Numbers), collapse=";"))
+
+  M %>% rename(AU_original = "AU") %>% left_join(df, by="UT")
+}
+
 getFileNameExtension <- function (fn) {
   # remove a path
   splitted    <- strsplit(x=fn, split='/')[[1]]   

From 85a965eeab8e40f327cc48a778971c08f6d72bff Mon Sep 17 00:00:00 2001
From: Massimo Aria <aria@unina.it>
Date: Tue, 12 Mar 2024 16:24:49 +0100
Subject: [PATCH 2/3] Update server.R

---
 inst/biblioshiny/server.R | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/inst/biblioshiny/server.R b/inst/biblioshiny/server.R
index 037d306..81d6a2d 100644
--- a/inst/biblioshiny/server.R
+++ b/inst/biblioshiny/server.R
@@ -264,7 +264,7 @@ To ensure the functionality of Biblioshiny,
                                                   dbsource = input$dbsource,
                                                   format = format(D))
                                   M <- authorNameFormat(M, input$authorName)
-                                  M <- AuthorNameMerge(M)
+                                  if (format(D)=="csv") M <- AuthorNameMerge(M)
                                 })
                  },
                  ### Scopus CSV/Bib Files
@@ -285,7 +285,7 @@ To ensure the functionality of Biblioshiny,
                                                   dbsource = input$dbsource,
                                                   format = "bibtex")
                                   M <- authorNameFormat(M, input$authorName)
-                                  M <- AuthorNameMerge(M)
+                                  #M <- AuthorNameMerge(M)
                                 })
                  })
         },

From 8178a090d19929a6f8e5d67386df5f427cd6a038 Mon Sep 17 00:00:00 2001
From: massimoaria <aria@unina.it>
Date: Tue, 12 Mar 2024 15:22:54 +0100
Subject: [PATCH 3/3] biblioshiny: improved author name merging algorithm

---
 R/convert2df.R | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/R/convert2df.R b/R/convert2df.R
index f5b5f23..33f764e 100644
--- a/R/convert2df.R
+++ b/R/convert2df.R
@@ -229,7 +229,12 @@ convert2df<-function(file,dbsource="wos",format="plaintext", remove.duplicates=T
              id_field <- "UT"
            },
            scopus={
-             id_field <- "UT"
+             if (format=="csv"){
+               id_field <- "UT"
+             } else {
+               id_field <- "TI"
+             }
+             
            },
            openalex={
              id_field <- "id_oa"