subugoe · njahn82 · May 5, 2019
diff --git a/_posts/unpaywall_evidence/data/results_evidence_overlap_08_12.csv b/_posts/unpaywall_evidence/data/results_evidence_overlap_08_12.csv
diff --git a/_posts/unpaywall_evidence/data/results_evidence_overlap_13_18.csv b/_posts/unpaywall_evidence/data/results_evidence_overlap_13_18.csv
diff --git a/_posts/unpaywall_evidence/unpaywall_evidence.Rmd b/_posts/unpaywall_evidence/unpaywall_evidence.Rmd
@@ -469,36 +469,31 @@ evidence_categories_df %>%
 We first illustrate for each evidence type - collating again the least frequent types in the category `Other` - the amount of articles which corresponds exclusively to this type and no others.
 
 ```{r, layout = "l-page", fig.cap="Proportion of articles per evidence type. In blue, the amount of articles uniquely associated with the corresponding evidence type is shown."}
-#determine number of articles corresponding only to one evidence type
-evidence_single_cat_df <- evidence_df %>%
-  group_by(evidence) %>%
-  summarize(number_of_articles = sum(number_of_articles)) %>%
-  left_join(evidence_categories_df, by = c("evidence" = "ev_cat")) %>%
-  rename(number_of_articles = number_of_articles.x, number_of_single_cat = number_of_articles.y) %>%
-  mutate(number_of_articles = replace_na(number_of_articles, 0),
-         number_of_single_cat = replace_na(number_of_single_cat, 0))
-#aggregate least frequent types as category `Other`
-evidence_single_cat_grouped_df <- evidence_single_cat_df %>%
-  ungroup() %>%
-  mutate(evidence = as_factor(evidence)) %>%
+# determine number of articles corresponding only to one evidence type
+evidence_unique <-
+  evidence_categories_df %>%
+  filter(!grepl("&", ev_cat, ignore.case = TRUE, fixed = FALSE)) %>%
+  group_by(ev_cat) %>%
+  summarize(single = sum(number_of_articles)) %>%
+  arrange(desc(single)) %>%
+  mutate(ev_cat = as_factor(ev_cat)) %>%
   mutate(
     evidence_grouped = factor(
-      fct_other(evidence, keep = articles_per_type_grouped_df$evidence_grouped),
+      fct_other(ev_cat, keep = articles_per_type_grouped_df$evidence_grouped),
       levels = articles_per_type_grouped_df$evidence_grouped
     )
   ) %>%
-  mutate(evidence_grouped = fct_relevel(fct_rev(evidence_grouped), "Other")) %>%
-  group_by(evidence_grouped) %>%
-  summarize(
-    number_of_articles = sum(number_of_articles),
-    number_of_single_cat = sum(number_of_single_cat)
-  ) %>%
-  #arrange data in order to enable stacked barplots
-  mutate(multiple = number_of_articles-number_of_single_cat, single = number_of_single_cat) %>%
+  mutate(evidence_grouped = fct_relevel(fct_rev(evidence_grouped), "Other"))
+# join with data.frame that contains all distinct occurrences per DOI to obtain
+# number of articles where more than one evidence type was found
+evidence_unique %>%
+  inner_join(articles_per_type_df, by = c("ev_cat" = "evidence")) %>%
+  mutate(multiple = N_records - single) %>%
   select(evidence_grouped, single, multiple) %>%
-  gather(is_single, number_of_articles, -evidence_grouped)
-#create aggregated proportions barplot
-evidence_single_cat_grouped_df %>%
+  gather(single, multiple, key = "is_single", value = "number_of_articles") %>%
+  group_by(evidence_grouped, is_single) %>%
+  summarise(number_of_articles = sum(number_of_articles)) %>%
+  # prepare plot
   ggplot(aes(x = evidence_grouped, y = number_of_articles, fill = is_single)) +
   geom_bar(stat = "identity", position = "fill") +
   scale_fill_manual(values = c("#b3b3b3a0", "#56B4E9"), name = "Is unique?") +
@@ -510,8 +505,6 @@ evidence_single_cat_grouped_df %>%
   theme(panel.grid.major.y = element_blank()) +
   theme(panel.border = element_blank()) +
   coord_flip() +
-  theme(legend.position = "top",
-        legend.justification = "right") +
   labs(y = "Proportion of Articles", x = "Evidence Type",
        title = "Proportion of Articles per Evidence Type") -> plot_ev_types_is_single_prop
 #create interactive plot

diff --git a/_posts/unpaywall_evidence/unpaywall_evidence.html b/_posts/unpaywall_evidence/unpaywall_evidence.html
diff --git a/docs/index.html b/docs/index.html
@@ -1647,7 +1647,7 @@ <h1 class="posts-list-caption">Blog | Scholarly Communication Analytics with R</
 <a href="posts/unpaywall_evidence/" class="post-preview">
 <script class="post-metadata" type="text/json">{"categories":[]}</script>
 <div class="metadata">
-<div class="publishedDate">May 3, 2019</div>
+<div class="publishedDate">May 5, 2019</div>
 </div>
 <div class="thumbnail">
 <img data-src="posts/unpaywall_evidence/distill-preview.png"/>

diff --git a/docs/index.xml b/docs/index.xml
@@ -11,15 +11,15 @@ to publish case-studies rapidely showing how to support data-driven workflows an
 decision-making around scholarly communication in libraries using R.
 </description>
     <generator>Distill</generator>
-    <lastBuildDate>Fri, 03 May 2019 00:00:00 +0000</lastBuildDate>
+    <lastBuildDate>Sun, 05 May 2019 00:00:00 +0000</lastBuildDate>
     <item>
       <title>Open Access Evidence in Unpaywall</title>
       <dc:creator>Najko Jahn</dc:creator>
       <dc:creator>Anne Hobert</dc:creator>
       <link>https://subugoe.github.io/scholcomm_analytics/posts/unpaywall_evidence</link>
       <description>We investigated more than 31 million scholarly journal articles published between 2008 and 2018 that are indexed in Unpaywall, a widely used open access discovery tool. Using Google BigQuery and R, we determined over 11,6 million journal articles with open access full-text links in Unpaywall, corresponding to an open access share of 37 %. Our data analysis revealed various open access location and evidence types, as well as large overlaps between them, raising important questions about how to responsibly re-use Unpaywall data in bibliometric research and open access monitoring.</description>
       <guid>https://subugoe.github.io/scholcomm_analytics/posts/unpaywall_evidence</guid>
-      <pubDate>Fri, 03 May 2019 00:00:00 +0000</pubDate>
+      <pubDate>Sun, 05 May 2019 00:00:00 +0000</pubDate>
       <media:content url="https://subugoe.github.io/scholcomm_analytics/posts/unpaywall_evidence/distill-preview.png" medium="image" type="image/png" width="1248" height="768"/>
     </item>
     <item>

diff --git a/docs/posts/posts.json b/docs/posts/posts.json
@@ -13,10 +13,10 @@
         "url": {}
       }
     ],
-    "date": "2019-05-03",
+    "date": "2019-05-05",
     "categories": [],
     "preview": "posts/unpaywall_evidence/distill-preview.png",
-    "last_modified": "2019-05-03T12:27:22+02:00",
+    "last_modified": "2019-05-05T11:49:46+02:00",
     "preview_width": 1248,
     "preview_height": 768
   },