issue #404: OpenAlex integration #420

Merged · 2 commits · Feb 23, 2024
52 changes: 42 additions & 10 deletions R/convert2df.R
@@ -12,9 +12,11 @@
#' d)\tab 'lens' \tab Lens.org (in csv '.csv');\cr
#' e)\tab 'pubmed' \tab an object of the class \code{pubmedR (package pubmedR)} containing a collection obtained from a query performed with pubmedR package;\cr
#' f)\tab 'dimensions' \tab an object of the class \code{dimensionsR (package dimensionsR)} containing a collection obtained from a query performed with dimensionsR package;\cr
-#' g)\tab 'openalex' \tab a data frame object returned by openalexR package, containing a collection of works resulting from a query fetched from OpenAlex database.}
+#' g)\tab 'openalex' \tab OpenAlex (in csv '.csv');\cr
+#' h)\tab 'openalex_api' \tab a data frame object returned by openalexR package, containing a collection of works resulting from a query fetched from OpenAlex database.}
#' @param dbsource is a character indicating the bibliographic database. \code{dbsource} can be \code{dbsource = c('cochrane','dimensions','generic','isi','openalex','openalex_api','pubmed','scopus','wos','lens')}. Default is \code{dbsource = "isi"}.
-#' @param format is a character indicating the format of the SCOPUS and Clarivate Analytics WoS export file. \code{format} can be \code{c('api', 'bibtex', 'csv', 'endnote','excel','plaintext', 'pubmed')}. Default is \code{format = "plaintext"}.
+#' @param format is a character indicating the export file format of SCOPUS, Clarivate Analytics WoS, and the other supported databases. \code{format} can be \code{c('api', 'bibtex', 'csv', 'endnote','excel','plaintext', 'pubmed')}. Default is \code{format = "plaintext"}.
+#' @param remove.duplicates is logical. If TRUE, the function removes duplicated documents, checking them by DOI and database ID.
#' @return a data frame with cases corresponding to articles and variables to Field Tags in the original export file.
#'
#' E.g., if we have downloaded three files from Web of Science in plaintext format, the file argument will be:
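A hedged usage sketch of these calls (file names and the query are hypothetical; the openalexR route assumes its `oa_fetch()` interface):

```r
library(bibliometrix)

# Three WoS plaintext exports merged into one collection
file <- c("wos_export1.txt", "wos_export2.txt", "wos_export3.txt")
M <- convert2df(file, dbsource = "wos", format = "plaintext")

# New in this PR: OpenAlex csv export vs. openalexR data frame
M1 <- convert2df("openalex_works.csv", dbsource = "openalex", format = "csv")
works <- openalexR::oa_fetch(entity = "works", search = "bibliometrics")
M2 <- convert2df(works, dbsource = "openalex_api", format = "api")
```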
@@ -57,10 +59,10 @@
#'
#' @export

-convert2df<-function(file,dbsource="wos",format="plaintext"){
+convert2df<-function(file,dbsource="wos",format="plaintext", remove.duplicates=TRUE){

allowed_formats <- c('api', 'bibtex', 'csv', 'endnote','excel','plaintext', 'pubmed')
-allowed_db <- c('cochrane','dimensions','generic','isi','openalex', 'pubmed','scopus','wos', 'lens')
+allowed_db <- c('cochrane','dimensions','generic','isi','openalex', 'openalex_api','pubmed','scopus','wos', 'lens')

cat("\nConverting your",dbsource,"collection into a bibliographic dataframe\n\n")
if (length(setdiff(dbsource,allowed_db))>0){
@@ -147,7 +149,10 @@ convert2df<-function(file,dbsource="wos",format="plaintext"){
})

},
-openalex = {
+openalex={
+M <- csvOA2df(file)
+},
+openalex_api = {
if (!"bibliometrixDB" %in% class(file)){
M <- openalexR::oa2bibliometrix(file)
} else {
@@ -168,11 +173,11 @@ convert2df<-function(file,dbsource="wos",format="plaintext"){
M$CR <- trim.leading(trimES(gsub("\\[,||\\[||\\]|| \\.\\. || \\. ","",M$CR))) # remove foreign characters from CR (i.e. Chinese, Russian characters)
}

if (dbsource!="cochrane"){M$AU=gsub(intToUtf8(8217),intToUtf8(39),M$AU)}
if (dbsource!="cochrane"){M$AU <- gsub(intToUtf8(8217),intToUtf8(39),M$AU)}

cat("Done!\n\n")

-if (!(dbsource %in% c("pubmed", "lens", "openalex"))) {
+if (!(dbsource %in% c("pubmed", "lens", "openalex_api"))) {
## AU_UN field creation
if ("C1" %in% names(M)) {
cat("\nGenerating affiliation field tag AU_UN from C1: ")
@@ -204,10 +209,37 @@ convert2df<-function(file,dbsource="wos",format="plaintext"){
}

### SR field creation
if (isTRUE(remove.duplicates)){
switch(dbsource,
isi={
id_field <- "UT"
},
scopus={
id_field <- "UT"
},
openalex={
id_field <- "id_oa"
},
openalex_api={
id_field <- "id_oa"
},
dimensions={
id_field <- "UT"
},
pubmed={
id_field <- "PMID"
},
lens={
id_field <- "UT"
},
{
id_field <- "TI"
})
d <- duplicated(M[id_field])
if (sum(d)>0) cat("\nRemoved ",sum(d),"duplicated documents\n")
M <- M[!d,]
}
suppressWarnings(M <- metaTagExtraction(M, Field="SR"))
d <- duplicated(M$SR)
if (sum(d)>0) cat("\nRemoved ",sum(d),"duplicated documents\n")
M <- M[!d,]
row.names(M) <- M$SR

### bibliometrix>DB class
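The new `remove.duplicates` step above keys on a database-specific identifier and falls back to the title tag `TI` when no ID is available. A standalone sketch of the same pattern on toy data:

```r
# Toy collection with one duplicated OpenAlex ID
M <- data.frame(id_oa = c("W1", "W2", "W2", "W3"),
                TI    = c("A",  "B",  "B",  "C"))

id_field <- "id_oa"            # chosen per dbsource, as in the switch above
d <- duplicated(M[id_field])   # TRUE for every repeat after the first
if (sum(d) > 0) cat("Removed", sum(d), "duplicated documents\n")
M <- M[!d, ]                   # keeps W1, W2, W3
```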
7 changes: 4 additions & 3 deletions R/csvLens2df.R
@@ -54,12 +54,13 @@ csvLens2df <- function(file){

# Iso Source Titles
DATA$SO[DATA$SO==""] <- DATA$Publisher[DATA$SO==""]
-DATA$JI <- sapply(DATA$SO, AbbrevTitle, USE.NAMES = FALSE)
-DATA$J9 <- gsub("\\.","",DATA$JI)
+# DATA$JI <- sapply(DATA$SO, AbbrevTitle, USE.NAMES = FALSE)
+# DATA$J9 <- gsub("\\.","",DATA$JI)
+DATA$JI <- DATA$J9 <- DATA$SO
DATA$ID <- DATA$DE
DI <- DATA$DI
URL <- DATA$URL
-DATA <- data.frame(lapply(DATA,toUpper))
+DATA <- data.frame(lapply(DATA,toupper))
DATA$DI <- DI
DATA$URL <- URL
DATA$AU_CO <- "NA"
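The Lens converter uppercases every character column but keeps identifier and link fields in their original case; a sketch of that save-transform-restore pattern, using the tags from the diff above:

```r
DI  <- DATA$DI                              # preserve DOIs as exported
URL <- DATA$URL                             # and landing-page URLs
DATA <- data.frame(lapply(DATA, toupper))   # uppercase all text columns
DATA$DI  <- DI                              # restore the saved fields
DATA$URL <- URL
```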
123 changes: 123 additions & 0 deletions R/csvOA2df.R
@@ -0,0 +1,123 @@
utils::globalVariables(c("all_of", "corr", "DI", "id_oa","RP","UN","AU_ID"))

csvOA2df <- function(file){
options(readr.num_columns = 0)

## import all files in a single data frame
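# When several csv exports are supplied, a running intersect of the column
# names keeps only the fields present in every file, so the rbind() below
# cannot fail on a column that one export has and another lacks.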
for (i in 1:length(file)){
#D <- read.csv(file[i], quote='"', check.names = F, stringsAsFactors = F) #fileEncoding = "UTF-8-BOM")
D <- read_csv(file[i], na=character(), quote='"', trim_ws = FALSE, progress = show_progress(), show_col_types = FALSE) %>%
mutate(across(where(is.numeric), as.character)) %>%
mutate(across(where(is.character), \(x) tidyr::replace_na(x,""))) %>%
as.data.frame()

if (i>1){
l <- intersect(l,names(D))
DATA <- rbind(DATA[l],D[l])
}else{
l <- names(D)
DATA <- D}
}
rm(D)

## Post-Processing

# column re-labelling
DATA <- relabelling(DATA)

# recode as numeric
DATA$TC <- as.numeric(DATA$TC)
DATA$PY <- as.numeric(DATA$PY)
DATA$relevance_score <- as.numeric(DATA$relevance_score)

# replace | with ;
DATA <- DATA %>%
mutate(across(where(is.character), ~ stringi::stri_replace_all_regex(.,"\\|",";")))

DATA$AF <- DATA$AU
DATA$ID <- DATA$DE
DATA$AB=""
DATA$CR <- gsub("https://openalex.org/","",DATA$CR)
DATA$AU_ID <- gsub("https://openalex.org/","",DATA$AU_ID)
DATA$id_oa <- gsub("https://openalex.org/","",DATA$id_oa)
DATA$JI <- DATA$J9 <- gsub("https://openalex.org/","",DATA$SO_ID)
DATA$corresponding_author_ids <- gsub("https://openalex.org/","",DATA$corresponding_author_ids)
DATA$C1 <- gsub("https://", "", DATA$C1)
DATA$DB <- "OPENALEX"

## corresponding author
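# The OpenAlex csv stores per-author fields as ";"-separated strings that are
# positionally aligned: the i-th entry of C1 (raw affiliation), of
# authorships_is_corresponding, and of AU all describe the same author.
# Each string is split and unnested to one row per author (row_number()
# records the position n), the pieces are re-joined on (id_oa, n), and the
# rows flagged "true" yield the reprint-author field RP.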
UN <- strsplit(DATA$C1,";")
corresp <- strsplit(DATA$authorships_is_corresponding,";")
df_UN <- data.frame(UN=unlist(UN), id_oa=rep(DATA$id_oa,lengths(UN))) %>%
group_by(id_oa) %>%
mutate(n=row_number())
df_COR <- data.frame(corr=unlist(corresp), id_oa=rep(DATA$id_oa,lengths(corresp))) %>%
group_by(id_oa) %>%
mutate(n=row_number())
df_UN <- df_UN %>%
left_join(df_COR, by=(c("id_oa","n")))
AU <- strsplit(DATA$AU,";")
AU_df <- data.frame(RP = unlist(AU), AU_ID=unlist(strsplit(DATA$AU_ID,";")), id_oa=rep(DATA$id_oa,lengths(AU))) %>%
group_by(id_oa) %>%
mutate(n=row_number()) %>%
left_join(df_UN %>% select("UN","id_oa", "corr", "n"),
by = c("id_oa","n")) %>%
dplyr::filter(corr == "true") %>%
mutate(RP = paste(RP,UN, sep=", ")) %>%
ungroup() %>%
select("RP", "AU_ID") %>%
distinct(AU_ID, .keep_all = TRUE)
DATA <- DATA %>%
left_join(AU_df, by = c("corresponding_author_ids" = "AU_ID"))


# move all char strings to Upper
ind <- apply(DATA,2,function(x){
sum(regexpr("https://",x)>-1, na.rm = TRUE)>0
})
label <- names(ind)[ind==FALSE & !is.na(ind)]

DATA <- DATA %>%
mutate(across(all_of(label), toupper),
DI = gsub("https://doi.org/","",DI),
DI = ifelse(DI == "null",NA,DI))

return(DATA)
}

relabelling <- function(DATA){
## column re-labelling
label <- names(DATA)
label[label %in% "id"] <- "id_oa"
label[label %in% "display_name"] <- "TI"
label[label %in% "primary_location_display_name"] <- "SO"
label[label %in% "primary_location_id"] <- "SO_ID"
label[label %in% "primary_location_host_organization"] <- "PU"
label[label %in% "primary_location_issns"] <- "ISSN"
label[label %in% "primary_location_issn_l"] <- "ISSN_I"
label[label %in% "primary_location_landing_page_url"] <- "URL"
label[label %in% "primary_location_pdf_url"] <- "URL_PDF"
label[label %in% "author_ids"] <- "AU_ID"
label[label %in% "author_names"] <- "AU"
label[label %in% "author_orcids"] <- "OI"
label[label %in% "author_institution_names"] <- "C3"
label[label %in% "cited_by_count"] <- "TC"
label[label %in% "publication_year"] <- "PY"
label[label %in% "type"] <- "DT"
label[label %in% "biblio_issue"] <- "IS"
label[label %in% "biblio_volume"] <- "VL"
label[label %in% "referenced_works" ] <- "CR"
label[label %in% "keywords_keyword"] <- "DE"
label[label %in% "concepts_display_name"] <- "CONCEPTS"
label[label %in% "topics_display_name"] <- "TOPICS"
label[label %in% "sustainable_development_goals_display_name"] <- "SDG"
label[label %in% "primary_topic_field_display_name"] <- "SC"
label[label %in% "mesh_descriptor_name"] <- "MESH"
label[label %in% "referenced_works_count"] <- "NR"
label[label %in% "language"] <- "LA"
label[label %in% "authorships_author_position"] <- "AU_POSITION"
label[label %in% "authorships_raw_affiliation_string"] <- "C1"
label[label %in% "doi"] <- "DI"
names(DATA) <- label
return(DATA)
}
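`relabelling()` maps the OpenAlex csv headers onto the WoS-style field tags bibliometrix uses elsewhere (TI, SO, TC, PY, ...). The same mapping could be written with a named lookup vector; a sketch, not the package's implementation:

```r
# Hypothetical excerpt of the mapping implemented above
relabel_map <- c(id               = "id_oa",
                 display_name     = "TI",
                 cited_by_count   = "TC",
                 publication_year = "PY",
                 doi              = "DI")

relabel <- function(DATA) {
  hit <- names(DATA) %in% names(relabel_map)
  names(DATA)[hit] <- relabel_map[names(DATA)[hit]]
  DATA
}
```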
93 changes: 2 additions & 91 deletions R/histNetwork.R
@@ -195,95 +195,6 @@ wos <- function(M, min.citations, sep, network, verbose){
return(results)
}

# scopus <- function(M, min.citations, sep, network, verbose){
#
# if (isTRUE(verbose)) {
# cat("\nSCOPUS DB: Searching local citations (LCS) by document titles (TI) and DOIs...\n")
# }
#
# if (!("SR_FULL" %in% names(M))) {
# M = metaTagExtraction(M, Field = "SR")
# }
#
# M$nCITING <- 1:nrow(M)
# papers <- M$nCITING[M$TC >= min.citations]
#
# TIpost <-
# paste(gsub("[[:punct:]]", "", M$TI[papers]), " ", M$PY[papers], " ", sep = "")
#
# CR <- gsub("[[:punct:]]", "", M$CR)
# n <- nchar(CR)
# n[is.na(n)] <- 2
# n <- n + 1
# nCum <- c(1, cumsum(n[-length(n)]))
# CR <- paste(CR, collapse = " ")
#
# #L <- str_locate_all(CR, TIpost)
# L <- stringi::stri_locate_all_regex(CR,TIpost, omit_no_match = TRUE)
#
# LCS <- lengths(L) / 2
#
# M$LCS <- 0
# M$LCS[papers] <- LCS
#
#
# ### HistData
# histData <- M %>%
# select(.data$SR_FULL, .data$TI,.data$DE,.data$ID,.data$DI, .data$PY, .data$LCS, .data$TC) %>%
# rename(
# Paper = .data$SR_FULL,
# Title = .data$TI,
# Author_Keywords = .data$DE,
# KeywordsPlus = .data$ID,
# DOI = .data$DI,
# Year = .data$PY,
# GCS = .data$TC
# ) %>%
# arrange(.data$Year) %>%
# dplyr::filter(.data$GCS>=min.citations) %>%
# as.data.frame()
#
#
# if (isTRUE(network)) {
# ## Network matrix
# df <- lapply(seq_along(L), function(i) {
# l <-
# data.frame(
# ref = L[[i]],
# paper = rep(papers[i], length(L[[i]][, 1]))
# )
# })
# df <- (do.call(rbind, df))
#
# A <- outer(df$ref.start, nCum, "-")
# A[A < 0] <- NA
# df$CITINGn <- unlist(apply(A, 1, which.min))
# df$CITING <- M$SR[df$CITINGn]
# df$CITED <- M$SR[df$paper]
# df <- df %>%
# dplyr::filter(.data$CITING %in% histData$Paper)
#
# NetMatrix <-
# (as_adjacency_matrix(graph_from_data_frame(df[, c(6, 5)], directed = T)))
# } else{
# NetMatrix = NULL
# }
#
# if (isTRUE(verbose)) {
# cat("\nFound",
# length(M$LCS[M$LCS > 0]),
# "documents with no empty Local Citations (LCS)\n")
# }
#
# results <-
# list(
# NetMatrix = NetMatrix,
# histData = histData,
# M = M,
# LCS = M$LCS
# )
# }

# New algorithm for Scopus
# Local citation matching is based on First Author, Year and PP
scopus <- function(M, min.citations, sep, network, verbose){
@@ -387,7 +298,7 @@ scopus <- function(M, min.citations, sep, network, verbose){

openalex <- function(M, min.citations=min.citations, sep=sep, network=network, verbose=verbose){

-M$CR[is.na(M$CR)] <- "none"
+M$CR[is.na(M$CR) | M$CR==""] <- "none"
ids <- M$id_oa
CR <- strsplit(M$CR, ";")
CR <- data.frame(id_oa = rep(M$id_oa,lengths(CR)), ref = unlist(CR)) %>%
@@ -420,7 +331,7 @@ openalex <- function(M, min.citations=min.citations, sep=sep, network=network, verbose=verbose){
SRrow <- WLCR %>% select(.data$id_oa) %>%
left_join(M %>%
select(.data$id_oa, .data$SR),
by="id_oa")
by="id_oa")

SR_col <- data.frame(id_oa = colnames(WLCR)[-1]) %>%
left_join(M %>%
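Because OpenAlex reference lists are themselves OpenAlex work IDs, the `openalex()` branch can count local citations by exact ID matching against the collection, with no title-based fuzzy matching as in the WoS/Scopus branches. A minimal sketch of that idea on toy IDs:

```r
# Toy collection: W2 cites W1; W3 cites W1 and W2
M <- data.frame(id_oa = c("W1", "W2", "W3"),
                CR    = c("none", "W1", "W1;W2"))

CR <- strsplit(M$CR, ";")
edges <- data.frame(citing = rep(M$id_oa, lengths(CR)),
                    cited  = unlist(CR))
edges <- edges[edges$cited %in% M$id_oa, ]  # drop out-of-collection refs

# Local citation score: in-collection citations received by each work
LCS <- table(factor(edges$cited, levels = M$id_oa))  # W1: 2, W2: 1, W3: 0
```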
2 changes: 1 addition & 1 deletion R/metaTagExtraction.R
@@ -405,7 +405,7 @@ AU_UN<-function(M,sep){
})
AFFL=unlist(AFFL)
M$AU_UN=AFFL
if (M$DB[1]=="ISI" & "C3" %in% names(M)){
if (M$DB[1] %in% c("ISI", "OPENALEX") & "C3" %in% names(M)){
M$AU_UN[!is.na(M$C3) & M$C3!=""] <- M$C3[!is.na(M$C3) & M$C3!=""]
}
M$AU_UN=gsub("\\\\&","AND",M$AU_UN)
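The change above extends the `C3` shortcut to OpenAlex collections: when the database already ships curated affiliation names in `C3`, they override the affiliations parsed from the raw `C1` strings. A toy illustration:

```r
M <- data.frame(DB    = "OPENALEX",
                AU_UN = c("PARSED FROM C1", "PARSED FROM C1"),
                C3    = c("UNIV A;UNIV B", ""))
use_c3 <- !is.na(M$C3) & M$C3 != ""   # rows with curated names available
M$AU_UN[use_c3] <- M$C3[use_c3]       # row 1 overridden, row 2 unchanged
```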
10 changes: 9 additions & 1 deletion inst/biblioshiny/server.R
@@ -281,7 +281,15 @@ To ensure the functionality of Biblioshiny,
})
})
},
-openalex = {
+openalex={
+withProgress(message = 'Conversion in progress',
+value = 0, {
+M <- convert2df(inFile$datapath,
+dbsource = input$dbsource,
+format = "csv")
+})
+},
+openalex_api = {
M <- smart_load(inFile$datapath)
},
lens = {
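On the biblioshiny side, the uploaded file's path arrives through Shiny's `fileInput` and is routed into `convert2df()`. A reduced sketch of the pattern; the widget IDs are invented for illustration:

```r
library(shiny)

# server-side fragment (hypothetical input IDs)
observeEvent(input$load, {
  inFile <- input$fileUpload
  req(inFile)
  withProgress(message = "Conversion in progress", value = 0, {
    M <- bibliometrix::convert2df(inFile$datapath,
                                  dbsource = "openalex",
                                  format   = "csv")
  })
})
```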
3 changes: 2 additions & 1 deletion inst/biblioshiny/ui.R
@@ -254,7 +254,8 @@ body <- dashboardBody(
"Web of Science (WoS/WoK)" = "isi",
"Scopus" = "scopus",
"Dimensions" = "dimensions",
"OpenAlex (via openalexR)" = "openalex",
"Openalex" ="openalex",
"OpenAlex API (via openalexR)" = "openalex_api",
"Lens.org" = "lens",
"PubMed" = "pubmed",
"Cochrane Library" = "cochrane"