-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path03_time_to_retraction.R
131 lines (105 loc) · 4.77 KB
/
03_time_to_retraction.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
"
1. Plot time to retraction
2. Plot cor (time to retraction, journal impact factor)
"
# Global options
options(StringsAsFactors = FALSE)
# Set dir
setwd(githubdir)
setwd("propagation_of_error/")
# Read in data
retracted <- read.csv("data/05_retracted_articles/new_retracted_articles.csv")
notices <- read.csv("data/03_retraction_notices/new_retraction_notices.csv")
codebook <- read.csv("data/citation_codebook.csv")
# Merge files
re_no <- merge(retracted, notices, by = "index", suffixes = c("_re","_no"))
# Rename cols.
# cbind(names(re_no), codebook$r_label[match(names(re_no), paste0(codebook$label, "_re"))])
new_names_re <- codebook$r_label[match(names(re_no), paste0(codebook$label, "_re"))]
new_names_no <- codebook$r_label[match(names(re_no), paste0(codebook$label, "_no"))]
names(re_no) <- ifelse(!is.na(new_names_re), paste0(new_names_re, "_re"), names(re_no))
names(re_no) <- ifelse(!is.na(new_names_no), paste0(new_names_no, "_no"), names(re_no))
# Time to retraction
re_no$ttr <- re_no$pub_year_no - re_no$year
# Average/median/range time to retraction
mean(re_no$ttr)
median(re_no$ttr)
range(re_no$ttr)
quantile(re_no$ttr)
# 1. Plot time to retraction
# -----------------------------
library(ggplot2)
library(grid)
# Base
theme_base <- theme_minimal() +
theme(panel.grid.major.y = element_line(colour = "#e3e3e3", linetype = "dotted"),
panel.grid.minor.x = element_blank(),
panel.grid.major.x = element_line(colour = "#f7f7f7", linetype = "solid"),
panel.border = element_blank(),
legend.position = "bottom",
legend.text = element_text(size = 9),
legend.background = element_rect(color = "#ffffff"),
legend.key = element_rect(color = "#ffffff", fill = "#ffffff"),
legend.key.size = unit(.1, "cm"),
legend.spacing = unit(.2, "cm"),
title = element_text(size = 8, colour = "#333333"),
axis.title = element_text(size = 8, colour = "#333333"),
axis.text = element_text(size = 8, colour = "#333333"),
axis.ticks.y = element_blank(),
axis.ticks.x = element_blank(),
strip.text.x = element_text(size = 9),
plot.margin = unit(c(0, .5, .5, .5), "cm"))
# Plot
re_no$constant <- 1
ggplot(re_no, aes(y = pub_year_no - year, x = constant)) +
geom_boxplot(col = "#2b8cbe", fatten = .75) +
xlab("") +
geom_segment(aes(x = .7,
y = median(re_no$pub_year_no - re_no$year),
xend = .6, yend = median(re_no$pub_year_no - re_no$year)),
size = .25,
col="#2b8cbe") +
annotate("text", size = 3, x = .59, y = median(re_no$pub_year_no - re_no$year) - .5,
label = paste("Median: ", round(median(re_no$pub_year_no - re_no$year), 2))) +
ylab("Number of years since publication") +
coord_flip() +
theme_base +
theme(axis.text.y = element_blank())
ggplot(re_no, aes(pub_year_no - year)) +
geom_density() +
geom_vline(xintercept = median(re_no$pub_year_no - re_no$year),
col="#2b8cbe") +
annotate("text", size = 3, y = .3, x = median(re_no$pub_year_no - re_no$year) + 2,
label = paste("Median: ", round(median(re_no$pub_year_no - re_no$year), 2))) +
scale_x_continuous("Years from Publication", breaks = round(seq(0, max(re_no$pub_year_no - re_no$year), 1), 0)) +
theme_base
ggsave("figs/time_to_retraction.pdf")
#
# Correlation between journal impact factor and time to retraction
# Read JIF data
jif <- read.csv("data/journal_impact_factor.csv")
# Average by journal
library(dplyr)
ttr_by_journal <- re_no[, c("source_title_re", "ttr")] %>%
group_by(source_title_re) %>%
summarise(avg_time = mean(ttr))
# Merge ttr w/ JIF
ttr_jif <- merge(ttr_by_journal, jif, by.x = "source_title_re", by.y = "journal_title")
ttr_jif$jif <- as.numeric(ttr_jif$impact_factor)
ggplot(ttr_jif[ttr_jif$jif < 10, ], aes(y = avg_time, x = jif)) +
scale_y_continuous("Average Time to Retraction", expand = c(0, 0)) +
geom_point(alpha = 1/10, pch = 16) +
scale_x_continuous("Journal Impact Factor", breaks = round(seq(min(ttr_jif$jif), max(ttr_jif$jif), 1), 0)) +
geom_smooth(method = "loess", span = .3, size = .2, col = "#2b8cbe") +
theme_minimal() +
theme_base
ggsave("figs/jif_time_to_retraction.pdf")
# Log JIF to see if anything is going on more clearly
ggplot(ttr_jif[ttr_jif$jif < 10, ], aes(y = avg_time, x = log(jif))) +
scale_y_continuous("Average Time to Retraction", expand = c(0, 0)) +
geom_point(alpha = 1/10, pch = 16) +
scale_x_continuous("Journal Impact Factor", breaks = round(seq(min(ttr_jif$jif), max(ttr_jif$jif), 1), 0)) +
geom_smooth(method = "loess", span = .3, size = .2, col = "#2b8cbe") +
theme_minimal() +
theme_base
ggsave("figs/log_jif_time_to_retraction.pdf")