diff --git a/docs/no_toc/About.md b/docs/no_toc/About.md index 36e3f8d..3e5c265 100644 --- a/docs/no_toc/About.md +++ b/docs/no_toc/About.md @@ -46,7 +46,7 @@ These credits are based on our [course contributors table guidelines](https://gi ## collate en_US.UTF-8 ## ctype en_US.UTF-8 ## tz Etc/UTC -## date 2024-03-25 +## date 2024-06-28 ## ## ─ Packages ─────────────────────────────────────────────────────────────────── ## package * version date lib source diff --git a/docs/no_toc/about-the-authors.html b/docs/no_toc/about-the-authors.html index e7cb5f4..de2bc0c 100644 --- a/docs/no_toc/about-the-authors.html +++ b/docs/no_toc/about-the-authors.html @@ -388,7 +388,7 @@

About the Authors

## collate en_US.UTF-8 ## ctype en_US.UTF-8 ## tz Etc/UTC -## date 2024-03-25 +## date 2024-06-28 ## ## ─ Packages ─────────────────────────────────────────────────────────────────── ## package * version date lib source diff --git a/docs/no_toc/index.html b/docs/no_toc/index.html index 5a060b5..695bfa1 100644 --- a/docs/no_toc/index.html +++ b/docs/no_toc/index.html @@ -264,7 +264,7 @@

About this Course

diff --git a/docs/no_toc/index.md b/docs/no_toc/index.md index fd11185..962291e 100644 --- a/docs/no_toc/index.md +++ b/docs/no_toc/index.md @@ -1,6 +1,6 @@ --- title: "Advanced Reproducibility in Cancer Informatics" -date: "March, 2024" +date: "June, 2024" site: bookdown::bookdown_site documentclass: book bibliography: [book.bib, packages.bib] diff --git a/docs/no_toc/resources/images/03-version-control-with-github_files/figure-html/1IJ_uFxJud7OdIAr6p8ZOzvYs-SGDqa7g4cUHtUld03I_g1014c75158f_0_675.png b/docs/no_toc/resources/images/03-version-control-with-github_files/figure-html/1IJ_uFxJud7OdIAr6p8ZOzvYs-SGDqa7g4cUHtUld03I_g1014c75158f_0_675.png index 42c60cb..cc79d61 100644 Binary files a/docs/no_toc/resources/images/03-version-control-with-github_files/figure-html/1IJ_uFxJud7OdIAr6p8ZOzvYs-SGDqa7g4cUHtUld03I_g1014c75158f_0_675.png and b/docs/no_toc/resources/images/03-version-control-with-github_files/figure-html/1IJ_uFxJud7OdIAr6p8ZOzvYs-SGDqa7g4cUHtUld03I_g1014c75158f_0_675.png differ diff --git a/docs/no_toc/resources/images/03-version-control-with-github_files/figure-html/1IJ_uFxJud7OdIAr6p8ZOzvYs-SGDqa7g4cUHtUld03I_g101867ebdaa_18_0.png b/docs/no_toc/resources/images/03-version-control-with-github_files/figure-html/1IJ_uFxJud7OdIAr6p8ZOzvYs-SGDqa7g4cUHtUld03I_g101867ebdaa_18_0.png index 648f2ac..90e93c9 100644 Binary files a/docs/no_toc/resources/images/03-version-control-with-github_files/figure-html/1IJ_uFxJud7OdIAr6p8ZOzvYs-SGDqa7g4cUHtUld03I_g101867ebdaa_18_0.png and b/docs/no_toc/resources/images/03-version-control-with-github_files/figure-html/1IJ_uFxJud7OdIAr6p8ZOzvYs-SGDqa7g4cUHtUld03I_g101867ebdaa_18_0.png differ diff --git a/docs/no_toc/resources/images/05-code-review-author_files/figure-html/1IJ_uFxJud7OdIAr6p8ZOzvYs-SGDqa7g4cUHtUld03I_gfc3e8b194d_0_118.png b/docs/no_toc/resources/images/05-code-review-author_files/figure-html/1IJ_uFxJud7OdIAr6p8ZOzvYs-SGDqa7g4cUHtUld03I_gfc3e8b194d_0_118.png index 66a9924..39441b2 100644 Binary files 
a/docs/no_toc/resources/images/05-code-review-author_files/figure-html/1IJ_uFxJud7OdIAr6p8ZOzvYs-SGDqa7g4cUHtUld03I_gfc3e8b194d_0_118.png and b/docs/no_toc/resources/images/05-code-review-author_files/figure-html/1IJ_uFxJud7OdIAr6p8ZOzvYs-SGDqa7g4cUHtUld03I_gfc3e8b194d_0_118.png differ diff --git a/docs/no_toc/resources/images/05-code-review-author_files/figure-html/1IJ_uFxJud7OdIAr6p8ZOzvYs-SGDqa7g4cUHtUld03I_gfc3e8b194d_0_46.png b/docs/no_toc/resources/images/05-code-review-author_files/figure-html/1IJ_uFxJud7OdIAr6p8ZOzvYs-SGDqa7g4cUHtUld03I_gfc3e8b194d_0_46.png index 796a19f..7f467f2 100644 Binary files a/docs/no_toc/resources/images/05-code-review-author_files/figure-html/1IJ_uFxJud7OdIAr6p8ZOzvYs-SGDqa7g4cUHtUld03I_gfc3e8b194d_0_46.png and b/docs/no_toc/resources/images/05-code-review-author_files/figure-html/1IJ_uFxJud7OdIAr6p8ZOzvYs-SGDqa7g4cUHtUld03I_gfc3e8b194d_0_46.png differ diff --git a/docs/no_toc/resources/images/06-code-review-reviewer_files/figure-html/1IJ_uFxJud7OdIAr6p8ZOzvYs-SGDqa7g4cUHtUld03I_gfa97af8537_0_55.png b/docs/no_toc/resources/images/06-code-review-reviewer_files/figure-html/1IJ_uFxJud7OdIAr6p8ZOzvYs-SGDqa7g4cUHtUld03I_gfa97af8537_0_55.png index db18812..a3b7b6e 100644 Binary files a/docs/no_toc/resources/images/06-code-review-reviewer_files/figure-html/1IJ_uFxJud7OdIAr6p8ZOzvYs-SGDqa7g4cUHtUld03I_gfa97af8537_0_55.png and b/docs/no_toc/resources/images/06-code-review-reviewer_files/figure-html/1IJ_uFxJud7OdIAr6p8ZOzvYs-SGDqa7g4cUHtUld03I_gfa97af8537_0_55.png differ diff --git a/docs/no_toc/resources/images/06-code-review-reviewer_files/figure-html/1IJ_uFxJud7OdIAr6p8ZOzvYs-SGDqa7g4cUHtUld03I_gfc3e8b194d_0_177.png b/docs/no_toc/resources/images/06-code-review-reviewer_files/figure-html/1IJ_uFxJud7OdIAr6p8ZOzvYs-SGDqa7g4cUHtUld03I_gfc3e8b194d_0_177.png index 4efd7b4..3ce78c3 100644 Binary files a/docs/no_toc/resources/images/06-code-review-reviewer_files/figure-html/1IJ_uFxJud7OdIAr6p8ZOzvYs-SGDqa7g4cUHtUld03I_gfc3e8b194d_0_177.png and 
b/docs/no_toc/resources/images/06-code-review-reviewer_files/figure-html/1IJ_uFxJud7OdIAr6p8ZOzvYs-SGDqa7g4cUHtUld03I_gfc3e8b194d_0_177.png differ diff --git a/docs/no_toc/resources/images/07-launching-docker_files/figure-html/1IJ_uFxJud7OdIAr6p8ZOzvYs-SGDqa7g4cUHtUld03I_gfbb8cb91d5_0_116.png b/docs/no_toc/resources/images/07-launching-docker_files/figure-html/1IJ_uFxJud7OdIAr6p8ZOzvYs-SGDqa7g4cUHtUld03I_gfbb8cb91d5_0_116.png index 85f19d3..891083d 100644 Binary files a/docs/no_toc/resources/images/07-launching-docker_files/figure-html/1IJ_uFxJud7OdIAr6p8ZOzvYs-SGDqa7g4cUHtUld03I_gfbb8cb91d5_0_116.png and b/docs/no_toc/resources/images/07-launching-docker_files/figure-html/1IJ_uFxJud7OdIAr6p8ZOzvYs-SGDqa7g4cUHtUld03I_gfbb8cb91d5_0_116.png differ diff --git a/docs/no_toc/resources/images/07-launching-docker_files/figure-html/1IJ_uFxJud7OdIAr6p8ZOzvYs-SGDqa7g4cUHtUld03I_gfd7f4e514e_0_256.png b/docs/no_toc/resources/images/07-launching-docker_files/figure-html/1IJ_uFxJud7OdIAr6p8ZOzvYs-SGDqa7g4cUHtUld03I_gfd7f4e514e_0_256.png index 8e97ff1..1a12320 100644 Binary files a/docs/no_toc/resources/images/07-launching-docker_files/figure-html/1IJ_uFxJud7OdIAr6p8ZOzvYs-SGDqa7g4cUHtUld03I_gfd7f4e514e_0_256.png and b/docs/no_toc/resources/images/07-launching-docker_files/figure-html/1IJ_uFxJud7OdIAr6p8ZOzvYs-SGDqa7g4cUHtUld03I_gfd7f4e514e_0_256.png differ diff --git a/docs/no_toc/resources/images/07-launching-docker_files/figure-html/1IJ_uFxJud7OdIAr6p8ZOzvYs-SGDqa7g4cUHtUld03I_gfd7f4e514e_0_5.png b/docs/no_toc/resources/images/07-launching-docker_files/figure-html/1IJ_uFxJud7OdIAr6p8ZOzvYs-SGDqa7g4cUHtUld03I_gfd7f4e514e_0_5.png index 5c00acb..4b426c0 100644 Binary files a/docs/no_toc/resources/images/07-launching-docker_files/figure-html/1IJ_uFxJud7OdIAr6p8ZOzvYs-SGDqa7g4cUHtUld03I_gfd7f4e514e_0_5.png and b/docs/no_toc/resources/images/07-launching-docker_files/figure-html/1IJ_uFxJud7OdIAr6p8ZOzvYs-SGDqa7g4cUHtUld03I_gfd7f4e514e_0_5.png differ diff --git 
a/docs/no_toc/resources/images/07-launching-docker_files/figure-html/1IJ_uFxJud7OdIAr6p8ZOzvYs-SGDqa7g4cUHtUld03I_gfd7f4e514e_0_55.png b/docs/no_toc/resources/images/07-launching-docker_files/figure-html/1IJ_uFxJud7OdIAr6p8ZOzvYs-SGDqa7g4cUHtUld03I_gfd7f4e514e_0_55.png index 3b109a6..959a17c 100644 Binary files a/docs/no_toc/resources/images/07-launching-docker_files/figure-html/1IJ_uFxJud7OdIAr6p8ZOzvYs-SGDqa7g4cUHtUld03I_gfd7f4e514e_0_55.png and b/docs/no_toc/resources/images/07-launching-docker_files/figure-html/1IJ_uFxJud7OdIAr6p8ZOzvYs-SGDqa7g4cUHtUld03I_gfd7f4e514e_0_55.png differ diff --git a/docs/no_toc/search_index.json b/docs/no_toc/search_index.json index a6a07e6..eef514a 100644 --- a/docs/no_toc/search_index.json +++ b/docs/no_toc/search_index.json @@ -1 +1 @@ -[["index.html", "Advanced Reproducibility in Cancer Informatics About this Course 0.1 Available course formats", " Advanced Reproducibility in Cancer Informatics March, 2024 About this Course This course is part of a series of courses for the Informatics Technology for Cancer Research (ITCR) called the Informatics Technology for Cancer Research Education Resource. This material was created by the ITCR Training Network (ITN) which is a collaborative effort of researchers around the United States to support cancer informatics and data science training through resources, technology, and events. This initiative is funded by the following grant: National Cancer Institute (NCI) UE5 CA254170. Our courses feature tools developed by ITCR Investigators and make it easier for principal investigators, scientists, and analysts to integrate cancer informatics into their workflows. Please see our website at www.itcrtraining.org for more information. 0.1 Available course formats This course is available in multiple formats which allows you to take it in the way that best suites your needs. You can take it for certificate which can be for free or fee. 
The material for this course can be viewed without login requirement on this Bookdown website. This format might be most appropriate for you if you rely on screen-reader technology. This course can be taken for free certification through Leanpub. This course can be taken on Coursera for certification here (but it is not available for free on Coursera). Our courses are open source, you can find the source material for this course on GitHub. "],["introduction.html", "Chapter 1 Introduction 1.1 Topics covered: 1.2 Motivation 1.3 Target Audience 1.4 Curriculum 1.5 How to use the course", " Chapter 1 Introduction 1.1 Topics covered: This is the second course in a two part series: 1.2 Motivation Cancer datasets are plentiful, complicated, and hold untold amounts of information regarding cancer biology. Cancer researchers are working to apply their expertise to the analysis of these vast amounts of data but training opportunities to properly equip them in these efforts can be sparse. This includes training in reproducible data analysis methods. Data analyses are generally not reproducible without direct contact with the original researchers and a substantial amount of time and effort (Beaulieu-Jones and Greene 2017). Reproducibility in cancer informatics (as with other fields) is still not monitored or incentivized despite that it is fundamental to the scientific method. Despite the lack of incentive, many researchers strive for reproducibility in their own work but often lack the skills or training to do so effectively. Equipping researchers with the skills to create reproducible data analyses increases the efficiency of everyone involved. Reproducible analyses are more likely to be understood, applied, and replicated by others. This helps expedite the scientific process by helping researchers avoid false positive dead ends. 
Open source clarity in reproducible methods also saves researchers’ time so they don’t have to reinvent the proverbial wheel for methods that everyone in the field is already performing. This course introduces tools that help enhance reproducibility and replicability in the context of cancer informatics. It uses hands-on exercises to demonstrate in practical terms how to get acquainted with these tools but is by no means meant to be a comprehensive dive into these tools. The course introduces tools and their concepts such as git and GitHub, code review, Docker, and GitHub actions. 1.3 Target Audience The course is intended for students in the biomedical sciences and researchers who use informatics tools in their research. It is the follow up course to the Introduction to Reproducibility in Cancer Informatics course. 1.4 Curriculum Goal of this course: To equip learners with a deeper knowledge of the capabilities of reproducibility tools and how they can apply to their existing analysis scripts and projects. What is NOT the goal of this course To be a comprehensive tutorial to each of the tools shown. 1.5 How to use the course Each chapter has associated exercises that you are encouraged to complete in order to get the full benefit of the course. This course is designed with busy professional learners in mind – who may have to pick up and put down the course when their schedule allows. In general, you are able to skip to chapters you find most useful (one instance where a prior chapter is required is noted). References "],["defining-reproducibility.html", "Chapter 2 Defining Reproducibility 2.1 Learning Objectives 2.2 What is reproducibility 2.3 Reproducibility in daily life 2.4 Reproducibility is worth the effort! 
2.5 Reproducibility exists on a continuum!", " Chapter 2 Defining Reproducibility If you’ve not previously read through the defining reproducibility chapter in our introductory course, we recommend you read through it here; otherwise feel free to skip the next chapter. 2.1 Learning Objectives 2.2 What is reproducibility There’s been a lot of discussion about what is included in the term reproducibility and there is some discrepancy between fields. For the purposes of informatics and data analysis, a reproducible analysis is one that can be re-run by a different researcher and the same result and conclusion is found. Reproducibility is related to repeatability and replicability but it is worth taking time to differentiate these terms Perhaps you are like Ruby and have just found an interesting pattern through your data analysis! This has probably been the result of many months or years on your project and it’s worth celebrating! But before she considers these results a done deal, Ruby should test whether she is able to re-run her own analysis and get the same results again. This is known as repeatability. Given that Ruby’s analysis is repeatable; she may feel confident now to share her preliminary results with her colleague, Avi the Associate. Whether or not someone else will be able to take Ruby’s code and data, re-run the analysis and obtain the same results is known as reproducibility. If Ruby’s results are able to be reproduced by Avi, now Avi may collect new data and use Ruby’s same analysis methods to analyze his data. Whether or not Avi’s new data and results concur with Ruby’s study’s original inferences is known as replicability. You may realize that these levels of research build on each other (like science is supposed to do). In this way, we can think of these in a hierarchy. Skipping any of these levels of research applicability can lead to unreliable results and conclusions. 
Science progresses when data and hypotheses are put through these levels thoroughly and sequentially. If results are not repeatable, they won’t be reproducible or replicable. Ideally all analyses and results would be reproducible without too much time and effort spent; this would aid in the efficiency of research getting to the next stages and questions. But unfortunately, in practice, reproducibility is not as commonplace as we would hope. Institutions and reward systems generally do not prioritize or even measure reproducibility standards in research and training opportunities for reproducible techniques can be scarce. Reproducible research can often feel like an uphill battle that is made steeper by lack of training opportunities. In this course, we hope to equip your research with the tools you need to enhance the reproducibility of your analyses so this uphill battle is less steep. 2.3 Reproducibility in daily life What does reproducibility mean in the daily life of a researcher? Let’s say Ruby’s results are repeatable in her own hands and she excitedly tells her associate, Avi, about her preliminary findings. Avi is very excited about these results as well as Ruby’s methods! Avi is also interested in Ruby’s analysis methods and results. So Ruby sends Avi the code and data she used to obtain the results. Now, whether or not Avi is able to obtain the same exact results with this same data and same analysis code will indicate if Ruby’s analysis is reproducible. Ruby may have spent a lot of time on her code and getting it to work on her computer, but whether it will successfully run on Avi’s computer is another story. Often when researchers share their analysis code it leads to a substantial amount of effort on the part of the researcher who has received the code to get it working and this often cannot be done successfully without help from the original code author (Beaulieu-Jones and Greene 2017). 
Avi is encountering errors because Ruby’s code was written with Ruby’s computer and local setup in mind and she didn’t know how to make it more generally applicable. Avi is spending a lot of time just trying to re-run Ruby’s same analysis on her same data; he has yet to be able to try the code on any additional data (which will likely bring up even more errors). Avi is still struggling to work with Ruby’s code and is confused about the goals and approaches the code is taking. After struggling with Ruby’s code for an untold amount of time, Avi may decide it’s time to email Ruby to get some clarity. Now both Avi and Ruby are confused about why this analysis isn’t nicely re-running for Avi. Their attempts to communicate about the code through email haven’t helped them clarify anything. Multiple versions of the code may have been sent back and forth between them and now things are taking a lot more time than either of them expected. Perhaps at some point Avi is able to successfully run Ruby’s code on Ruby’s same data. Just because Avi didn’t get any errors doesn’t mean that the code ran exactly the same as it did for Ruby. Lack of errors also doesn’t mean that either Ruby’s or Avi’s runs of the code ran with high accuracy or that the results can be trusted. Even a small difference in decimal point may indicate a more fundamental difference in how the analysis was performed and this could be due to differences in software versions, settings, or any number of items in their computing environments. 2.4 Reproducibility is worth the effort! Perhaps you’ve found yourself in a situation like Ruby and Avi; struggling to re-run code that you thought for sure was working a minute ago. In the upcoming chapters, we will discuss how to bolster your projects’ reproducibility. 
As you apply these reproducible techniques to your own projects, you may feel like it is taking more time to reach endpoints, but keep in mind that reproducible analyses and projects have higher upfront costs but these will absolutely pay off in the long term. Reproducibility in your analyses is not only a time saver for yourself, but also your colleagues, your field, and your future self! You might not change a single character in your code but then return to it in a few days/months/years and find that it no longer runs! Reproducible code stands the test of time longer, making ‘future you’ glad you spent the time to work on it. It’s said that your closest collaborator is you from 6 months ago but you don’t reply to email (Broman 2016). Many a data scientist has referred to their frustration with their past selves: Dear past-Hadley: PLEASE COMMENT YOUR CODE BETTER. Love present-Hadley — Hadley Wickham (@hadleywickham) April 7, 2016 The more you comment your code and make it clear and readable, the more your future self will thank you. Reproducible code also saves your colleagues time! The more reproducible your code is, the less time all of your collaborators will need to spend troubleshooting it. The more people who use your code and need to try to fix it, the more time is wasted. This can add up to a lot of wasted researcher time and effort. But, reproducible code saves everyone exponential amounts of time and effort! It will also motivate individuals to use and cite your code and analyses in the future! 2.5 Reproducibility exists on a continuum! Incremental work on your analyses is good! You do not need to make your analyses perfect on the first try or even within a particular time frame. The first step in creating an analysis is to get it to work once! But the work does not end there. Furthermore, no analysis is or will ever be perfect in that it will not be reproducible in every single context throughout time. 
incrementally pushing our analyses toward the right of this continuum is the goal. References "],["using-version-control-with-github.html", "Chapter 3 Using version control with GitHub 3.1 Learning Objectives 3.2 Prerequisites for this chapter 3.3 Set up a Git Client (GitKraken) 3.4 Get the exercise project files 3.5 Start a GitHub repository 3.6 More resources for learning GitHub", " Chapter 3 Using version control with GitHub 3.1 Learning Objectives In the introductory part of this course, we discussed some of the reasons for using GitHub but we didn’t get into version control (i.e. creating versions for managing changes over time) or GitHub’s capabilities much beyond its capacity to store code in a place where others can find it. In this advanced course, we will dig deeper into Git and GitHub’s capabilities so you can use this to your daily work’s advantage. However, to gain the benefit of these deeper GitHub skills, you will have to form some new habits. Fully embracing the GitHub workflow will make your work more efficient and help you create more transparent and reproducible analyses! In this chapter we’re going to introduce you to the basic git commands you’ll need, and guide you as we do them together one by one! 3.2 Prerequisites for this chapter In order to complete this chapter you will need a GitHub account (it’s free). If you do not currently have a GitHub account, we recommend you go through our Intro to Github chapter from the Introduction to Reproducibility course first, then return to this chapter. 3.3 Set up a Git Client (GitKraken) Interaction with git and GitHub can be done completely from the command line, but sometimes this can be harder to keep track of. To help us navigate this, we recommend using a git client. There are a lot of different clients out there, and they are generally free for most situations you will need. In this course, we will take you through how to use GitKraken, one such git client. 
GitKraken is nice because they have lots of nice tutorials, it works pretty well, and it’s free for most use cases. But if you find GitKraken doesn’t work for you, you can explore other git clients. For this course, we’ll be using GitKraken. 3.3.1 Install GitKraken Go here to install GitKraken. Follow their instructions to sign in with your GitHub account. It will ask you to authorize your GitHub account to connect to GitKraken. Click Authorize. You may find it helpful to watch GitKraken’s own tutorial (linked below) about how to “git” started, but we will also guide you through each step! GitHub has a host of terms that can feel like a whole language at first, but we’ll introduce them one at a time. To start, a lot of the GitHub workflow centers around handling copies of your code that are either stored on the internet (are remote) or are stored on your computer (are local). Remote = GitHub on the internet Local = What’s on your own computer A repository, in the case of a data science project, is mostly synonymous with the word “project”. Using GitHub, a project will exist both as a remote repository and a local repository (in other words, it will be on the internet on GitHub and on your computer). Repository = a set of project files that have a location on GitHub 3.4 Get the exercise project files In this course, you can work on the exercises from your own GitHub repository, but first we will need to set that up. Below are the files you will want to upload to that repository. Depending on whether you prefer to use R or Python, you can choose to follow this course using one or the other. Get the Python project example files Click this link to download. Now double click your chapter zip file to unzip. For Windows you may have to follow these instructions. Get the R project example files Click this link to download. Now double click your chapter zip file to unzip. For Windows you may have to follow these instructions. 
3.5 Start a GitHub repository Go to Github’s main page and sign in with your GitHub account. Follow these instructions to create a repository. As a general, but not absolute rule, you will want to keep one GitHub repository for one analysis project. Name the repository something that reminds you what it’s related to. For these examples, we’re using repository-name as our placeholder. Choose Public. Choose add a README. Follow these instructions to add all the files that are inside the reproducible-R-example.zip or reproducible-python-example.zip file you downloaded to this new repository. Your new repository should more or less look like this when you are done (with obviously some minor differences). {r, fig.alt=\"An image showing what a repository looks like with our example files loaded in. \", out.width = \"100%\", echo = FALSE} ottrpal::include_slide(\"https://docs.google.com/presentation/d/1IJ_uFxJud7OdIAr6p8ZOzvYs-SGDqa7g4cUHtUld03I/edit#slide=id.g2c22eaae560_0_0\") 3.5.1 git clone Now you have a repository on GitHub online! In our daily grind, we will work on this code from our own computer. To set this up, we’ll need to clone it to our own computer. Cloning is making a remote copy of the project local. clone = To make a remote repository local. In other words, to make an online repository downloaded and linked on your computer. To get started, you will need to clone the GitHub repository you created. We will be using this repository for the duration of this course. It is simple to clone a GitHub repository using GitKraken. First, sign in to GitKraken; under Repository Management > Clone tab, click Clone a repo. Then, choose where you’d like the repository to be on your computer using the Browse button. You will need to Copy + Paste your new repository’s URL (web address) to where it says URL. Navigate to your repository on GitHub to copy the URL. Copying and pasting is advisable because any little typo will inhibit cloning. 
Now you are ready to click Clone the repository! It will ask you if you’d like to Open Now, click that. 3.5.2 Create a branch Handling branches is where you unleash the real benefit of GitHub, but it’s also the confusing part to get the hang of. branch = a unique working copy of file changes of a GitHub repository. A branch can be local and remote. The best way to get a grasp on what the branches represent is to create one and start using it. In GitKraken we can create a new branch; this will be your working copy. First, click the Branch button. Next, type in a branch name in the box that the cursor is blinking in. In our example, we are calling it a-new-branch. Now click Enter! Now you have a new branch! Now we can edit our files and code however we normally would. Go ahead and make an edit to any file in your new repository. If you’ve made a change to any file in your repository, it will appear in GitKraken and you can click on it to see the differences. If we want to add these file changes to our current branch, we need to commit them. add = to stage your files to be committed to your current branch. commit = include your set of file changes to your current branch. Now that we have changes committed to our branch we are ready to add them to the remote, internet copy! To do this, we will need to push our branch. To push means to add changes that are on your new branch to the remote branch (internet version). You can select your origin, which refers to where your branch is stored on the internet. Choose your origin in the dropdown menu and click Submit. origin = where your branch is stored on the internet (remotely) push = to add changes from your branch to its remote counterpart. In other words, put your changes online. After a variable number of commits, your branch, called a-new-branch, is a different version of the original code base that may have a nifty improvement to it. But our main goal is to add that nifty improvement to the main branch. 
To start this process of bringing in new changes to the main curated repository, we will create a pull request. pull request = A way to propose changes from a branch to be included into the main repository. From GitHub: > Pull requests let you tell others about changes you’ve pushed to a GitHub repository. Once a pull request is sent, interested parties can review the set of changes, discuss potential modifications, and even push follow-up commits if necessary. Pull requests are the meat of how code changes and improvements get reviewed and incorporated! A vast majority of the benefits of incorporating GitHub into your workflow centers around fully utilizing the power of pull requests! Now we can open up a pull request if we go to our GitHub repository on GitHub. After you click on Compare & pull request you’ll be taken to a screen where you can add information about your changes. After you are done writing your description, click Create Pull Request! (If you don’t have your pull request description perfect don’t worry about it, you can always edit it later). Congrats! You’ve just opened a pull request! In an upcoming chapter we will discuss what information you should put in this pull request description to make it pertinent for yourself and whoever reviews your pull request. To summarize, below is what this workflow looks like: One more note: if you do want to use the command line or if you want to know more about the specific git commands that GitKraken is doing for you (which might be handy for troubleshooting), the specific commands that can be used or Googled at each step are highlighted in red in the images - you just need to add git before them! For example, you would type git push in your command line in order to push your code. Or if you’d like to know more about pushing code, you can google git push. 3.6 More resources for learning GitHub Happy Git and GitHub for the useR by Bryan and Hester (2021). GitHub for data scientists by Vickery (2019). 
Intro to GitHub by “Introduction to GitHub” (2022). First Day on GitHub by “First Day on GitHub” (2022). First Week on GitHub by “First Week on GitHub” (2022). GitHub docs about creating a Pull Request by “Creating a Pull Request” (2021). Making a Pull Request by Radigan (2021). If you have any feedback on this chapter, please fill out this form, we’d love to hear your feedback! References "],["providing-data.html", "Chapter 4 Providing data 4.1 Learning Objectives", " Chapter 4 Providing data 4.1 Learning Objectives The first part of any analysis should be getting all the data needed to run it. Data come in all kinds of formats and sizes so while we can’t give specifics on how to share your data we can provide these guidelines: 4.1.1 Overview of data sharing The data to be shared does not contain PII (personal identifiable information) or PHI (protected health information) information. The data are accessible by a download script that is automatically downloaded when re-running the analysis. Every data file needed to run the analysis is available. The data are downloaded to files in an organized manner. For more about project organization, see this chapter from the Introduction to Reproducibility course. 4.1.2 A very general example of a data download bash script As far as how to have your data downloaded, this will be dependent on where and how it’s stored online. The most general form of a data download script might look like this: #!/bin/bash # This is a template script for downloading data using the wget command # See docs here: https://www.gnu.org/software/wget/manual/wget.html mkdir <FOLDER_TO_SAVE_TO> # To see wget options, use -h (the help flag) wget -h wget -O <FOLDER/FILE_TO_SAVE_TO> <URL> You can download this general template download file here (Shapiro et al. 2021). 
4.1.3 Examples of data download scripts Downloading data from GEO with GEOquery Data download script for multiple files of the same place Data download script - refine.bio example For more about data sharing techniques, see the Ethical Data Handling for Cancer Research course. References "],["engaging-in-code-review---as-an-author.html", "Chapter 5 Engaging in Code Review - as an author 5.1 Learning Objectives 5.2 Author responsibilities in code review 5.3 Characteristics of great pull requests 5.4 Exercise: Create your pull request description", " Chapter 5 Engaging in Code Review - as an author 5.1 Learning Objectives We’ve previously discussed that the only way to know if your analysis is truly reproducible is to send it to someone else to reproduce! That sentiment is at the heart of code review. Although most wouldn’t dare send out a manuscript for publishing without having our collaborators give it a line-by-line review, people don’t always feel the same way about code. Parker (2017) describes code review: Code review will not guarantee an accurate analysis, but it’s one of the most reliable ways of establishing one that is more accurate than before. Not only does code review help boost the accuracy and reproducibility of the analysis, it also helps everyone involved in the process learn something new! An effective code review atmosphere is something that individuals and their team have to commit to (pun intended). Effective code review brings so many benefits not only to your project quality but also your communication skills through fostering a learning atmosphere! In this chapter and the next we will discuss the two sides of code review. Code review ideally includes at least two people: the author of the pull request and the reviewer of the pull request. Depending on your job context, we realize that sometimes authors have to become their own reviewers if code review is not something that can be prioritized by your institution or team. 
5.2 Author responsibilities in code review The code review process begins with the creation of a pull request (which we practiced in the previous chapter). Successful and efficient code review is born out of quality communication, which is a skill set on its own. You can set your reviewers (and yourself) up for success by knowing what basic information can help get the code review conversation going. Even if you end up being the only person who will review your own code, writing these things out is still very helpful and highly recommended. It can help you spot problems you might not have otherwise seen and generally help you document better for future you! 5.3 Characteristics of great pull requests 5.3.1 There’s plenty of context! What’s the story behind the changes you are proposing? Sometimes when we are in the thick of a project we can make the mistake of assuming everyone knows what we know. This can unfortunately leave a huge burden on your reviewer to try to follow a paper trail to try to understand what you are doing. Before sending off a review request, re-read your PR description and think about the perspective of your reviewer. Err on the side that they have no idea what is happening on the project (because sometimes this is the case!) Tell a short story to explain what led to you making these changes including attempting to answer these questions: What is the problem that these changes will solve? Do you have any URLs, relevant issues, or files you can share? What inspired you to take this approach – are there other things you tried? Are there other pull requests related to this change? 5.3.2 Includes an explicit request for what kind of feedback is needed What would you like your reviewer to do with this pull request? Stating this explicitly can save both of you time in this code review. Are you still in the early stages and looking for a bigger picture review? Let them know that before they waste their time digging into the code line-by-line.
Are you in the later stages and looking for detailed nit-picky review? Are you looking for feedback on the results or methods? 5.3.3 Points out questionable areas that need extra attention Are there specific areas of the code you are having trouble with or are unsure about? Send a link to the specific lines in GitHub you are asking about. Are there results that are surprising, confusing, or smell wrong? Be sure to detail what you have dug into and tried at this point for any problematic points. 5.3.4 Are relatively small and focused Try to make sure your pull requests aren’t too long! Code reviewing fatigue is very real. If you send a reviewer thousands of lines of code to review it will be very overwhelming to review or understand. 10 lines of code = 10 issues. 500 lines of code = \"looks fine.\" Code reviews. — I Am Devloper (@iamdevloper) November 5, 2013 Alternatively, when you create a new branch try to set a very intentional (and relatively small) goal you would like to achieve with your upcoming pull request. Keeping your pull requests small and focused on one task at a time will not only help your reviewers but also will help yourself feel more accomplished and organized. Also recall that incremental changes are good! Perhaps you do have a very large restructuring of your repository you are trying to accomplish, but finding smaller reasonable sets of changes (which would each have their own pull requests) to reach that goal incrementally can help keep things more manageable. Using Stacked Pull Requests in GitHub 5.3.5 Don’t ask a reviewer to dig through dirty code Determining when a pull request is fully cooked and ready for review is a skill in itself. Pull requests that haven’t had enough time to be polished can put an unnecessarily larger burden on the reviewer. On the other hand, pull requests that have been hashed and rehashed in a silo might have benefitted from big picture feedback at an earlier stage of the code.
This is something that you and your team can figure out a balance for in time using lots of communication! This being said, the first reviewer of your code should always be yourself! Take time to review your own changes by clicking on the Files Changed tab and going over that section carefully. Are all the changes included that you were expecting? Are there any changes you didn’t expect that are showing up? These can be symptomatic of a deeper problem. Definitely dig into anything that is not what you expected. Set aside your changes and return to them in a few hours, or the next day. Looking at your changes with fresh eyes may also allow you to find things you didn’t notice before. Additional tip, if you don’t want others to look at your pull request yet because you are still working on reviewing it, you can change it to a draft pull request so no one reviews it before you are ready. This can also be a handy tactic to use if you just want to ask for big picture feedback from someone but want to make it clear that it is not anywhere near ready for merging to main. 5.3.5.1 In summary: Let’s revisit our scenario with Avi and Ruby and see how Ruby could better prepare her changes for review: In this scenario Ruby was able to save Avi time in getting into the code review by being more specific about what kind of feedback she is looking for as well as links that explain the context behind these changes. Additionally, by supplying Avi with a smaller PR, Avi is less likely to be overwhelmed by Ruby’s request and be able to give her suggestions in a more timely manner. 5.4 Exercise: Create your pull request description Add a pull request template to your repository! This will help initiate consistent and clear communication around the pull requests in your repository. Pull request templates are a way to give yourself and other contributors prompts when starting a new pull request. See below for an example.
The comments between <!-- and --> are html comments that will not show up so you don’t need to delete them if you don’t want to. On the right side, it shows how this template looks when it’s rendered. You can see this at any time by clicking Preview – this is true in other places in GitHub. 5.4.0.1 Set up a pull request template Create a new branch as we described in the previous chapter. In your local repository, create a folder called .github Copy and paste this pull request template file to a new text file and save it as a .md to get started. Feel free to edit this file to your own needs and add it to the .github folder of your repository. Use GitKraken to add and commit this new file. Push this commit. Open up a pull request. Craft your pull request description based on what we discussed in this chapter. Click on the Files Changed tab and make sure it includes the .github/PULL_REQUEST_TEMPLATE.md file. Walk away from your pull request and then return later and review it yourself. Make any necessary changes. When you are ready, request a reviewer by choosing someone underneath Reviewer on the right side! 5.4.1 Preparing for the return of your review As you wait for your reviewer to get back to you, it can be helpful to remind yourself of the purpose of code reviews and get yourself in a positive mindset. You’ve given your reviewer information to help them help you and now is the time to wait. First of all, you should pat yourself on the back for engaging in code review. It does require more time and sometimes that can feel scary with looming deadlines, but kudos for being able to prioritize your commitment to creating increasingly reproducible analyses! Remember that you are not your code and mistakes are all a part of the process! Putting your project out there can feel a tad vulnerable even. You may have felt the impulse to keep your code’s problems buried under a rug, but you pushed past that and are making your analyses transparent!
Remember that hidden problems don’t get solved, but known problems are opportunities for reaching an even better end result than you imagined! When you receive a review back remember that you and the reviewer are on the same team and both want the best end result feasible for this project! They may suggest ideas that you love and can’t wait to implement. They also might suggest ideas you don’t agree with. Do your best to take all their comments as positive learning opportunities and look for ways to compromise and determine solutions collaboratively. 5.4.1.1 Recommended reading about code review Why code reviews matter (and actually save time!) by Radigan (2021). Pull request descriptions by Bañuelos (2020). A zen manifesto for effective code reviews by Jean-Charles (2019). Best practices for Code Review by Smartbear Team (2021). If you have any feedback on this chapter, please fill out this form, we’d love to hear your feedback! References "],["engaging-in-code-review---as-a-reviewer.html", "Chapter 6 Engaging in Code Review - as a reviewer 6.1 Learning Objectives 6.2 Reviewer responsibilities in code review", " Chapter 6 Engaging in Code Review - as a reviewer 6.1 Learning Objectives 6.2 Reviewer responsibilities in code review When reviewing a pull request, you take on responsibility to ensure that the pull request is getting the project to a better state than before. There are three aspects to reviewing we will focus on: Identify areas in the code and documentation that are opportunities for improvement. Communicate your questions and concerns effectively and in a way that creates a positive atmosphere. Determine solutions collaboratively in a way that allows for learning as well as a long-term improved product. 6.2.1 What to look for! Depending on the goals of the project and pull request, there can be a lot to keep an eye out for. There are many articles out there about what to look for in a code review.
Here’s some general points: Does the analysis answer the question it’s asking? Are the methods it uses to do so appropriate? Is the code clear and readable? Does it contain a healthy amount of comments and documentation for individuals not familiar with the project to understand generally what is going on? Is the code efficient with computational resources? (Are there areas it’s a bit too greedy with memory usage?) Does the code stick to the style and conventions of this project? Are there alternate scenarios where the current strategy might fail? (depending on the likelihood of this use case, this may be an instance for a new issue and for it to be addressed in a different pull request). 6.2.2 How to communicate it The pull request may be the author’s precious bundle. Try to be empathetic to the learning process! You are both working on this project together – assume you both want the best out of this project. If something seems wrong, work together to find a solution, don’t ever waste time on placing blame. Remember that everything sounds harsher when you don’t have in-person cues! In this example, Avi may be stating factual things, but without his pleasant and reassuring disposition, it can feel super harsh. If Avi had reframed his comments, they might be more effective in this collaboration. Babatunde (2018) suggests framing review comments in three ways to help communication: questions, suggestions, and appreciations. 6.2.2.1 Questions For example: What happens if this doesn’t get saved? Does it throw an exception or fails silently? The key is to be specific with the questions. Mention exact file names. Put comments on the line you are referring to. Explain what you think is happening and ask them to explain if that is correct. 
6.2.2.2 Suggestions For example: I suggest you use an ArrayHelper getValue method here because of its error handling capability instead of accessing the value directly You could even go further by giving an example: $a = $b[‘key’]; would raise an error if key is not set but $a = ArrayHelper::getValue($b, ‘key’); would return a null value if key is not set. Giving suggestions and explaining not only how to implement them but why they might be preferred in this scenario is a great learning process both for the author and yourself. 6.2.2.3 Appreciations Start every review comment with an appreciation for the hard work completed! This goes a long way for creating a positive atmosphere. For example: Nice Job! Alice. I suggest we create an interface for this service so other substitute services can implement the interface as well, this would enable us change to a different service with very minimal efforts when the need arises. What do you think? Let’s see how Avi’s message could have been reworked to give a more effective review: This interaction reminds us that effective code review is steeped in empathy from both sides. Authors need to appreciate the time and effort the reviewer is spending to help them; while reviewers need to be sensitive to the amount of effort put in by the author already. 6.2.3 Exercise: Review Past you’s code Find the oldest code you wrote and currently have on your computer. Create a repository and pull request with this old code, following the general steps for creating a repository and pull request from the previous chapter. Request yourself as a reviewer. Review the code on GitHub using their docs as a guide for the mechanics of it. As you review, have empathy for past yourself, and give questions, appreciations, and suggestions in regards to this code. 6.2.3.1 Recommended reading about code review Comments during Code Reviews by Babatunde (2018) On Empathy and Pull Requests by Duretti Hirpa (2016).
Code Review Guidelines for Humans by Hauer (2018). Your Code Sucks! – Code Review Best Practices by Hildebr (2020). An even longer list of readings about code review If you have any feedback on this chapter, please fill out this form, we’d love to hear your feedback! References "],["launching-a-docker-image.html", "Chapter 7 Launching a Docker image 7.1 Learning Objectives 7.2 What’s Docker? 7.3 Install Docker 7.4 Getting started with Docker 7.5 More about Docker", " Chapter 7 Launching a Docker image 7.1 Learning Objectives In the introductory part of this course, we discussed package managers like renv or conda. Recall that even if you have the same packages installed between two computers, you can still get different results! This is because package versions do influence results as demonstrated by Beaulieu-Jones and Greene (2017). Package managers address part of this problem, however, their limitation is that they generally can only help with certain sets of packages. conda really only manages conda installed packages and renv doesn’t help with package management outside of R. Both of these have limited capabilities for cross platform shipping. This is where Docker can help fill in the gaps. I don’t even count anymore how many times did my code break when someone else run it. The strange part was — it worked on my machine. That’s where Docker saves the day. If it works on your machine, it will work on any. Radečić (2020) 7.2 What’s Docker? One way to ensure that her collaborators have the same computing environment is Ruby could ship her computer to each of her collaborators and have them run the analysis on her computer. But before you buy hundreds of laptops for your projects, we’ll show you how Docker will allow you to send your computing environment to your collaborators in a practical manner. Ruby can create a Docker image that Avi can use to run the analysis. This way Ruby and Avi know they are using the same computing environment.
Now if Ruby and Avi obtain different results, it won’t be because of version differences. 7.3 Install Docker Go here to install Docker, following the instructions for your particular operating system. If you don’t have a Docker account create an account when prompted, or go here. After you install Docker, start up Docker desktop by double clicking on the app. It may take some time to start up. 7.4 Getting started with Docker Open up your command line. First we need to get the Docker image. A Docker image is like a snapshot of your computing environment that you can move from place to place. We can download images from online and then use them to make a container. Containers are what we use to actually run analyses. From command line, run one of these commands depending on whether you’d like to use Python or R: To obtain the python docker image docker pull jhudsl/reproducible-python To obtain the R docker image docker pull jhudsl/reproducible-r Open up the Docker Desktop app. Click on ‘images’ on the left. This shows the images you currently have available on your computer. Return to your command line. Using cd and ls navigate to your project repository (or whatever files you’d like to be accessible in your development environment) and we can start up a docker container using docker run. To run the Python docker image docker run --rm -v $PWD:/home/jovyan/work -e JUPYTER_ENABLE_LAB=yes -p 8787:8787 jhudsl/reproducible-python Now in your internet browser, go to the address printed out. It should take you to Jupyter Lab. Now you are ready to develop inside a Docker container! To run the R docker image But you can change the password to whatever you’d like. docker run --rm -v $PWD:/home/rstudio -e PASSWORD=password -p 8787:8787 jhudsl/reproducible-r Now in your internet browser, go to localhost:8787. You should see an RStudio login page. Login to RStudio. Your username will be rstudio and your password, will be whatever you set your password to be. 
Now you are ready to develop inside a Docker container! To see what containers you have running or to clear out old containers, in Docker Desktop you can go to the Containers/Apps page. 7.4.1 A breakdown of what these Docker run options are Docker has super extensive documentation! But to get you started, here are the highlights for this docker run command: The remove option (--rm) Automatically removes the container when docker run exits. The volume option (-v) is how you specify what files you’d like available in the container and where to find them. In this instance we are using the output of the pwd command (print working directory) so that wherever you run this command, the files in that directory will be available in the container. The part after the colon specifies where these files will be found in the container. The environment option (-e) is how you can specify any environment variables you will need. In this instance for the rocker image we need to specify a password, but for the python image we needed to specify JUPYTER_ENABLE_LAB=yes so that we can use Jupyter Lab. The port option (-p) is how you specify what port you can connect to this on using your internet browser. The image to use is specified in the last part of the docker run command, so in these instances, we are running a container using the jhudsl/reproducible-r or jhudsl/reproducible-python images. 7.5 More about Docker Docker tutorial for beginners by Srivastav (2018). 7.5.0.1 Python specific: Jupyter Docker stacks by “Jupyter Docker Stacks — Docker-Stacks Latest Documentation” (2018). How to Run Jupyter Notebook on Docker by Okada (2021). 7.5.0.2 R specific: Launching RStudio in Docker by openscilabs (2021). Getting started with R and Docker by Neuzerling (2018). If you have any feedback on this chapter, please fill out this form, we’d love to hear your feedback!
References "],["modifying-a-docker-image.html", "Chapter 8 Modifying a Docker image 8.1 Learning Objectives 8.2 Managing images 8.3 Exercise: Build a Docker image", " Chapter 8 Modifying a Docker image 8.1 Learning Objectives The docker image you are using from the last chapter was pre-made for you, but you will find depending on the needs of your project, that you may need different packages installed. In this chapter we will introduce you to the basics of how to manage your own Docker image. 8.2 Managing images Images can be on your own computer or on dockerhub. To see your list of images on your computer, you can go to Docker desktop. From here you will want to delete images and containers periodically because they do take up room on your computer. To see what images you have on your internet repository, you can log on to dockerhub. Go here to login (or create a username if you have not yet). After you sign into dockerhub, click on the Repositories tab, so you can see the list of repositories you have stored online. At this point, you won’t have any if you just created your dockerhub account. To create a new repository, click the ‘Create Repository’ button. Upon adding the new repository to dockerhub, you will need to name it the same as whatever you are calling it locally. You can put a description and name and click create. On the right it shows how you can interact with this from your local command line. After you’ve created the image repository, you will be brought to the image repository page. It will tell you Last pushed: never. On the right it will tell you the command you will need in order to push the image to dockerhub. Go to your local command line and use the command specified on the right side of your repository page. You don’t have to specify a tagname if you don’t want to. If you don’t want to specify a tagname, leave off the :tagname.
Now you will be able to test pulling your image using docker pull <image name> like we did in the previous chapter. You can also click on the Public View button to copy the pull command for your Docker image. Docker images can be pulled from being stored online but these images are built originally from a Dockerfile. 8.3 Exercise: Build a Docker image A Dockerfile is a recipe for how to build a docker image. The best way to learn to write Dockerfiles is to start off with one that is already written and modify it for your needs. You can practice building a docker image by downloading the Dockerfiles we have started and changing them slightly. 8.3.1 Download an example Dockerfile Get the Python Dockerfile Download the example Dockerfile for Python analyses. wget https://raw.githubusercontent.com/jhudsl/Adv_Reproducibility_in_Cancer_Informatics/main/resources/python-docker/Dockerfile If you get a message like command not found that means you will need to install wget. Alternatively, you can navigate to the Dockerfile’s page on GitHub and use File > Save as but do not add any suffix to the end of the file (no .txt or anything). Just save it as Dockerfile. Get the R Dockerfile Download the example Dockerfile for R analyses. wget https://raw.githubusercontent.com/jhudsl/Adv_Reproducibility_in_Cancer_Informatics/main/resources/r-docker/Dockerfile If you get a message like command not found that means you will need to install wget. Alternatively, you can navigate to the Dockerfile’s page on GitHub and use File > Save as but do not add any suffix to the end of the file (no .txt or anything). Just save it as Dockerfile. 8.3.2 Build a Docker image from a Dockerfile Place this newly downloaded Dockerfile with the rest of your project files. Build a docker image from this Dockerfile using the command below, but replace image_name with what you would like your modified image to be called. docker build -f Dockerfile .
-t image_name Navigate back to your Docker desktop and the images window. If your image built successfully, you should see a new image in your list! 8.3.3 Modify a Docker image If you want to add or remove a package from a Docker image, you’ll need to modify the Dockerfile. Using your preferred text editor (or RStudio or Jupyter Lab), open up the Dockerfile. You will see the first line in the Dockerfile is a FROM command. This is a command that will take another docker image to start from. - For our R example, we are starting off with an image that already has R and the tidyverse. - For our Python example we are starting off with an image that already has Python and Jupyter Lab. There are so many Docker images out there, that it might be that someone has already created a docker image with most of the functionality you need for your project. FROM is one of the main commands that a Dockerfile can take as described by their documentation: FROM creates a layer from another Docker image. COPY adds files from your Docker client’s current directory. RUN builds your application with make. CMD specifies what command to run within the container. 8.3.4 Add to the Dockerfile To get a feel for how these work, let’s add a line to your example Dockerfile. Using your preferred text editor (or RStudio or Jupyter Lab), open up the Dockerfile and add this line at the very end of the file. Do not add this line to the start of the file as this will not work. The FROM command needs to come first. CMD ["echo","Yay! I added to this Docker image"] Now re-run docker build as you did in the previous section. (Use the command below but replace image_name with whatever your image is called). docker build -f Dockerfile .
-t image_name If all built successfully, you should see a message like: => exporting to image 0.0s => => exporting layers 0.0s => => writing image sha256:ayuahgfuiseohfauwheufhauwihefuahweufhawfbuibe 0.0s => => naming to docker.io/library/image_name Now to run the image we can use the docker run command we used in the previous chapter (see below) and we should have a message: Yay! I added to this Docker image pop up upon running. To run your new Python docker image But replace image_name with whatever you have called your image. docker run --rm -v $PWD:/home/jovyan/work -e JUPYTER_ENABLE_LAB=yes -p 8787:8787 image_name To run the R docker image But replace image_name with whatever you have called your image. docker run --rm -v $PWD:/home/rstudio -e PASSWORD=password -p 8787:8787 image_name Stop and remove these containers before moving on. You can do this by going to Docker desktop and clicking on the trash can button next to each container. For images click Clean up to check off the images you’d like to remove and then hit Remove. 8.3.5 Add another package! Starting off with your example Dockerfile, we will practice adding another package and re-build the docker image with a new package. Note that spacing is important as well as having a \\ at the end of each line if the command is continuing. 8.3.5.1 Adding an R package To add R packages from CRAN, you can use this kind of format: RUN Rscript -e "install.packages( \\ c('BiocManager', \\ 'R.utils', \\ 'newpackagename'))" To add an R package from Bioconductor, you can follow this kind of format: RUN Rscript -e "options(warn = 2); BiocManager::install( \\ c('limma', \\ 'newpackagename'))" To add a Python package using pip, you will need to add pip3 to install Python packages using this format: RUN pip3 install \\ "somepackage==0.1.0" There are so many things you can add to your Docker image. (Picture whatever software and packages you are using on your computer).
We can only get you started for the feel of how to build a Dockerfile, and what you put on your Docker image will be up to you. To figure out how to add something, a good strategy is to look for other Dockerfiles that might have the package you want installed and borrow their RUN command. Then try to re-build your Docker image with that added RUN command and see if it builds successfully. And lastly, make sure that whatever changes you make to your Dockerfile, that you add it to your GitHub repository by creating a pull request as we did in Chapter 3. 8.3.6 More about Docker next steps Dockerfile Tutorial by Example. Dockerfile examples 8.3.7 A list of handy Docker commands: Get info on current containers: docker ps How to stop an individual container: docker container ls docker stop <containerID> Get rid of all non-running containers: docker container prune Stop all containers: docker stop $(docker ps -a -q) Remove all containers: docker rm -f $(docker ps -a -q) If you have any feedback on this chapter, please fill out this form, we’d love to hear your feedback! "],["automation-as-a-reproducibility-tool.html", "Chapter 9 Automation as a reproducibility tool 9.1 Learning Objectives 9.2 Build an example GitHub Actions 9.3 Exercise: Set up a GitHub action", " Chapter 9 Automation as a reproducibility tool 9.1 Learning Objectives We’ve discussed that a reproducible analysis can be re-run by someone else to obtain the same result. But what if before you bug your colleague to use their time to re-run your analysis, you had a robot re-run your analysis? Robots don’t get tired or have other deadlines to respond to and can be set up to re-run your analysis at any time. This is the basis of why automation is a powerful tool for reproducibility.
There are a lot of applications for GitHub Actions (see links at the end of this chapter) but in the context of our R and Python examples or for scientific notebooks in general, it can be useful to build a GitHub Action that re-runs the notebook every time a pull request is opened. This is useful because if the notebook does not re-run successfully by GitHub actions, this can be informative to something being amiss in the changes being made. 9.2 Build an example GitHub Actions 9.2.1 Structure of GitHub actions file GitHub actions are written in a yaml file that you store in a folder called .github/workflows in your GitHub repository. They have two main parts: the trigger: on: the action: jobs: The trigger is specified by on: and the action that happens upon the trigger being activated is specified by jobs:. The job can be made up of multiple steps:. on: # Some stuff that specifies when the action should run jobs: # The action that should run 9.2.2 Setting up the trigger There’s a list of things that happen in GitHub that can be used to trigger a GitHub action. See the list here. In this case, we will set up a GitHub action that happens whenever a pull request is opened that is going to the main branch. on: pull_request: branches: - main jobs: # The action that should run 9.2.3 Setting up the action The action part of the GitHub action can be named something (here we are calling it name-of-job) and we can use the runs-on: to specify a docker image to run this on. For this we will use a base image of ubuntu-latest. This simple action will run a bash command echo to say \"GitHub action is run!\". on: pull_request: branches: - main jobs: name-of-job: runs-on: ubuntu-latest steps: - name: Run message run: echo "GitHub action is run!" 9.3 Exercise: Set up a GitHub action Use a GitHub Action Template by following these instructions. You will need to navigate to your own repository to do this.
Tips for developing a GitHub Action: As you are adding your GitHub actions, consult the GitHub actions log. GitHub actions has pretty great documentation so as you are setting up your GitHub actions template, you will want to reference them. Be careful with your spacing this will break your GitHub action. Take a look at other GitHub actions that are doing something similar to what you are trying to accomplish. For testing purposes, modify the trigger so you can test it. You may want to use a manual workflow trigger or pull request: and push:. Use | in your run: command to give a multi-line command. 9.3.1 Resources for setting up your GitHub Actions Python example GitHub Actions to re-run notebook R example GitHub Actions to re-run notebook Great course about GitHub actions Introduction to GitHub Actions for data scientists. If you have any feedback on this chapter, please fill out this form, we’d love to hear your feedback! "],["about-the-authors.html", "About the Authors", " About the Authors These credits are based on our course contributors table guidelines.     
Credits Names Pedagogy Lead Content Instructor(s) Candace Savonen Lecturer(s) Candace Savonen Content Directors Jeff Leek, Sarah Wheelan Content Reviewer Sarah Wheelan Content Editor Jimin Hwang Acknowledgments Production Content Publisher Ira Gooding Content Publishing Reviewers Ira Gooding Technical Course Publishing Engineer Candace Savonen Template Publishing Engineers Candace Savonen, Carrie Wright Publishing Maintenance Engineer Candace Savonen Technical Publishing Stylists Carrie Wright, Candace Savonen Package Developers (ottrpal)John Muschelli, Candace Savonen, Carrie Wright Art and Design Illustrator Candace Savonen Jimin Hwang Figure Artist Candace Savonen Videographer Candace Savonen Videography Editor Candace Savonen Funding Funder National Cancer Institute (NCI) UE5 CA254170 Funding Staff Emily Voeglein, Fallon Bachman   ## ─ Session info ─────────────────────────────────────────────────────────────── ## setting value ## version R version 4.0.2 (2020-06-22) ## os Ubuntu 20.04.3 LTS ## system x86_64, linux-gnu ## ui X11 ## language (EN) ## collate en_US.UTF-8 ## ctype en_US.UTF-8 ## tz Etc/UTC ## date 2024-03-25 ## ## ─ Packages ─────────────────────────────────────────────────────────────────── ## package * version date lib source ## assertthat 0.2.1 2019-03-21 [1] RSPM (R 4.0.3) ## bookdown 0.24 2022-02-15 [1] Github (rstudio/bookdown@88bc4ea) ## callr 3.4.4 2020-09-07 [1] RSPM (R 4.0.2) ## cli 2.0.2 2020-02-28 [1] RSPM (R 4.0.0) ## crayon 1.3.4 2017-09-16 [1] RSPM (R 4.0.0) ## desc 1.2.0 2018-05-01 [1] RSPM (R 4.0.3) ## devtools 2.3.2 2020-09-18 [1] RSPM (R 4.0.3) ## digest 0.6.25 2020-02-23 [1] RSPM (R 4.0.0) ## ellipsis 0.3.1 2020-05-15 [1] RSPM (R 4.0.3) ## evaluate 0.14 2019-05-28 [1] RSPM (R 4.0.3) ## fansi 0.4.1 2020-01-08 [1] RSPM (R 4.0.0) ## fs 1.5.0 2020-07-31 [1] RSPM (R 4.0.3) ## glue 1.6.1 2022-01-22 [1] CRAN (R 4.0.2) ## hms 0.5.3 2020-01-08 [1] RSPM (R 4.0.0) ## htmltools 0.5.0 2020-06-16 [1] RSPM (R 4.0.1) ## jquerylib 0.1.4 
2021-04-26 [1] CRAN (R 4.0.2) ## knitr 1.33 2022-02-15 [1] Github (yihui/knitr@a1052d1) ## lifecycle 1.0.0 2021-02-15 [1] CRAN (R 4.0.2) ## magrittr 2.0.2 2022-01-26 [1] CRAN (R 4.0.2) ## memoise 1.1.0 2017-04-21 [1] RSPM (R 4.0.0) ## ottrpal 0.1.2 2022-02-15 [1] Github (jhudsl/ottrpal@1018848) ## pillar 1.4.6 2020-07-10 [1] RSPM (R 4.0.2) ## pkgbuild 1.1.0 2020-07-13 [1] RSPM (R 4.0.2) ## pkgconfig 2.0.3 2019-09-22 [1] RSPM (R 4.0.3) ## pkgload 1.1.0 2020-05-29 [1] RSPM (R 4.0.3) ## prettyunits 1.1.1 2020-01-24 [1] RSPM (R 4.0.3) ## processx 3.4.4 2020-09-03 [1] RSPM (R 4.0.2) ## ps 1.3.4 2020-08-11 [1] RSPM (R 4.0.2) ## purrr 0.3.4 2020-04-17 [1] RSPM (R 4.0.3) ## R6 2.4.1 2019-11-12 [1] RSPM (R 4.0.0) ## readr 1.4.0 2020-10-05 [1] RSPM (R 4.0.2) ## remotes 2.2.0 2020-07-21 [1] RSPM (R 4.0.3) ## rlang 0.4.10 2022-02-15 [1] Github (r-lib/rlang@f0c9be5) ## rmarkdown 2.10 2022-02-15 [1] Github (rstudio/rmarkdown@02d3c25) ## rprojroot 2.0.2 2020-11-15 [1] CRAN (R 4.0.2) ## sessioninfo 1.1.1 2018-11-05 [1] RSPM (R 4.0.3) ## stringi 1.5.3 2020-09-09 [1] RSPM (R 4.0.3) ## stringr 1.4.0 2019-02-10 [1] RSPM (R 4.0.3) ## testthat 3.0.1 2022-02-15 [1] Github (R-lib/testthat@e99155a) ## tibble 3.0.3 2020-07-10 [1] RSPM (R 4.0.2) ## usethis 2.1.5.9000 2022-02-15 [1] Github (r-lib/usethis@57b109a) ## vctrs 0.3.4 2020-08-29 [1] RSPM (R 4.0.2) ## withr 2.3.0 2020-09-22 [1] RSPM (R 4.0.2) ## xfun 0.26 2022-02-15 [1] Github (yihui/xfun@74c2a66) ## yaml 2.2.1 2020-02-01 [1] RSPM (R 4.0.3) ## ## [1] /usr/local/lib/R/site-library ## [2] /usr/local/lib/R/library "],["references.html", "References", " References "],["404.html", "Page not found", " Page not found The page you requested cannot be found (perhaps it was moved or renamed). You may want to try searching to find the page's new location, or use the table of contents to find the page you are looking for. 
"]] +[["index.html", "Advanced Reproducibility in Cancer Informatics About this Course 0.1 Available course formats", " Advanced Reproducibility in Cancer Informatics June, 2024 About this Course This course is part of a series of courses for the Informatics Technology for Cancer Research (ITCR) called the Informatics Technology for Cancer Research Education Resource. This material was created by the ITCR Training Network (ITN) which is a collaborative effort of researchers around the United States to support cancer informatics and data science training through resources, technology, and events. This initiative is funded by the following grant: National Cancer Institute (NCI) UE5 CA254170. Our courses feature tools developed by ITCR Investigators and make it easier for principal investigators, scientists, and analysts to integrate cancer informatics into their workflows. Please see our website at www.itcrtraining.org for more information. 0.1 Available course formats This course is available in multiple formats which allows you to take it in the way that best suites your needs. You can take it for certificate which can be for free or fee. The material for this course can be viewed without login requirement on this Bookdown website. This format might be most appropriate for you if you rely on screen-reader technology. This course can be taken for free certification through Leanpub. This course can be taken on Coursera for certification here (but it is not available for free on Coursera). Our courses are open source, you can find the source material for this course on GitHub. "],["introduction.html", "Chapter 1 Introduction 1.1 Topics covered: 1.2 Motivation 1.3 Target Audience 1.4 Curriculum 1.5 How to use the course", " Chapter 1 Introduction 1.1 Topics covered: This is the second course in a two part series: 1.2 Motivation Cancer datasets are plentiful, complicated, and hold untold amounts of information regarding cancer biology. 
Cancer researchers are working to apply their expertise to the analysis of these vast amounts of data but training opportunities to properly equip them in these efforts can be sparse. This includes training in reproducible data analysis methods. Data analyses are generally not reproducible without direct contact with the original researchers and a substantial amount of time and effort (Beaulieu-Jones and Greene 2017). Reproducibility in cancer informatics (as with other fields) is still not monitored or incentivized despite that it is fundamental to the scientific method. Despite the lack of incentive, many researchers strive for reproducibility in their own work but often lack the skills or training to do so effectively. Equipping researchers with the skills to create reproducible data analyses increases the efficiency of everyone involved. Reproducible analyses are more likely to be understood, applied, and replicated by others. This helps expedite the scientific process by helping researchers avoid false positive dead ends. Open source clarity in reproducible methods also saves researchers’ time so they don’t have to reinvent the proverbial wheel for methods that everyone in the field is already performing. This course introduces tools that help enhance reproducibility and replicability in the context of cancer informatics. It uses hands-on exercises to demonstrate in practical terms how to get acquainted with these tools but is by no means meant to be a comprehensive dive into these tools. The course introduces tools and their concepts such as git and GitHub, code review, Docker, and GitHub actions. 1.3 Target Audience The course is intended for students in the biomedical sciences and researchers who use informatics tools in their research. 
It is the follow up course to the Introduction to Reproducibility in Cancer Informatics course 1.4 Curriculum Goal of this course: To equip learners with a deeper knowledge of the capabilities of reproducibility tools and how they can apply to their existing analysis scripts and projects. What is NOT the goal of this course To be a comprehensive tutorial to each of the tools shown. 1.5 How to use the course Each chapter has associated exercises that you are encouraged to complete in order to get the full benefit of the course. This course is designed with busy professional learners in mind – who may have to pick up and put down the course when their schedule allows. In general, you are able to skip to the chapters you find most useful (the one instance where a prior chapter is required is noted). References "],["defining-reproducibility.html", "Chapter 2 Defining Reproducibility 2.1 Learning Objectives 2.2 What is reproducibility 2.3 Reproducibility in daily life 2.4 Reproducibility is worth the effort! 2.5 Reproducibility exists on a continuum!", " Chapter 2 Defining Reproducibility If you’ve not previously read through the defining reproducibility chapter in our introductory course, we recommend you read through it here; otherwise feel free to skip to the next chapter. 2.1 Learning Objectives 2.2 What is reproducibility There’s been a lot of discussion about what is included in the term reproducibility and there is some discrepancy between fields. For the purposes of informatics and data analysis, a reproducible analysis is one that can be re-run by a different researcher to reach the same result and conclusion. Reproducibility is related to repeatability and replicability but it is worth taking time to differentiate these terms. Perhaps you are like Ruby and have just found an interesting pattern through your data analysis! This has probably been the result of many months or years on your project and it’s worth celebrating! 
But before she considers these results a done deal, Ruby should test whether she is able to re-run her own analysis and get the same results again. This is known as repeatability. Given that Ruby’s analysis is repeatable; she may feel confident now to share her preliminary results with her colleague, Avi the Associate. Whether or not someone else will be able to take Ruby’s code and data, re-run the analysis and obtain the same results is known as reproducibility. If Ruby’s results are able to be reproduced by Avi, now Avi may collect new data and use Ruby’s same analysis methods to analyze his data. Whether or not Avi’s new data and results concur with Ruby’s study’s original inferences is known as replicability. You may realize that these levels of research build on each other (like science is supposed to do). In this way, we can think of these in a hierarchy. Skipping any of these levels of research applicability can lead to unreliable results and conclusions. Science progresses when data and hypotheses are put through these levels thoroughly and sequentially. If results are not repeatable, they won’t be reproducible or replicable. Ideally all analyses and results would be reproducible without too much time and effort spent; this would aid in the efficiency of research getting to the next stages and questions. But unfortunately, in practice, reproducibility is not as commonplace as we would hope. Institutions and reward systems generally do not prioritize or even measure reproducibility standards in research and training opportunities for reproducible techniques can be scarce. Reproducible research can often feel like an uphill battle that is made steeper by lack of training opportunities. In this course, we hope to equip your research with the tools you need to enhance the reproducibility of your analyses so this uphill battle is less steep. 2.3 Reproducibility in daily life What does reproducibility mean in the daily life of a researcher? 
Let’s say Ruby’s results are repeatable in her own hands and she excitedly tells her associate, Avi, about her preliminary findings. Avi is very excited about these results as well as Ruby’s methods! Avi is also interested in Ruby’s analysis methods and results. So Ruby sends Avi the code and data she used to obtain the results. Now, whether or not Avi is able to obtain the same exact results with this same data and same analysis code will indicate if Ruby’s analysis is reproducible. Ruby may have spent a lot of time on her code and getting it to work on her computer, but whether it will successfully run on Avi’s computer is another story. Often when researchers share their analysis code it leads to a substantial amount of effort on the part of the researcher who has received the code to get it working and this often cannot be done successfully without help from the original code author (Beaulieu-Jones and Greene 2017). Avi is encountering errors because Ruby’s code was written with Ruby’s computer and local setup in mind and she didn’t know how to make it more generally applicable. Avi is spending a lot of time just trying to re-run Ruby’s same analysis on her same data; he has yet to be able to try the code on any additional data (which will likely bring up even more errors). Avi is still struggling to work with Ruby’s code and is confused about the goals and approaches the code is taking. After struggling with Ruby’s code for an untold amount of time, Avi may decide it’s time to email Ruby to get some clarity. Now both Avi and Ruby are confused about why this analysis isn’t nicely re-running for Avi. Their attempts to communicate about the code through email haven’t helped them clarify anything. Multiple versions of the code may have been sent back and forth between them and now things are taking a lot more time than either of them expected. Perhaps at some point Avi is able to successfully run Ruby’s code on Ruby’s same data. 
Just because Avi didn’t get any errors doesn’t mean that the code ran exactly the same as it did for Ruby. Lack of errors also doesn’t mean that either Ruby’s or Avi’s runs of the code ran with high accuracy or that the results can be trusted. Even a small difference in decimal point may indicate a more fundamental difference in how the analysis was performed and this could be due to differences in software versions, settings, or any number of items in their computing environments. 2.4 Reproducibility is worth the effort! Perhaps you’ve found yourself in a situation like Ruby and Avi; struggling to re-run code that you thought for sure was working a minute ago. In the upcoming chapters, we will discuss how to bolster your projects’ reproducibility. As you apply these reproducible techniques to your own projects, you may feel like it is taking more time to reach endpoints, but keep in mind that reproducible analyses and projects have higher upfront costs but these will absolutely pay off in the long term. Reproducibility in your analyses is not only a time saver for yourself, but also your colleagues, your field, and your future self! You might not change a single character in your code but then return to it in a few days/months/years and find that it no longer runs! Reproducible code stands the test of time longer, making ‘future you’ glad you spent the time to work on it. It’s said that your closest collaborator is you from 6 months ago but you don’t reply to email (Broman 2016). Many a data scientist has referred to their frustration with their past selves: Dear past-Hadley: PLEASE COMMENT YOUR CODE BETTER. Love present-Hadley — Hadley Wickham (@hadleywickham) April 7, 2016 The more you comment your code and make it clear and readable, the more your future self will thank you. Reproducible code also saves your colleagues time! The more reproducible your code is, the less time all of your collaborators will need to spend troubleshooting it. 
The more people who use your code and need to try to fix it, the more time is wasted. This can add up to a lot of wasted researcher time and effort. But, reproducible code saves everyone exponential amounts of time and effort! It will also motivate individuals to use and cite your code and analyses in the future! 2.5 Reproducibility exists on a continuum! Incremental work on your analyses is good! You do not need to make your analyses perfect on the first try or even within a particular time frame. The first step in creating an analysis is to get it to work once! But the work does not end there. Furthermore, no analysis is or will ever be perfect in that it will not be reproducible in every single context throughout time. Incrementally pushing our analyses toward the right of this continuum is the goal. References "],["using-version-control-with-github.html", "Chapter 3 Using version control with GitHub 3.1 Learning Objectives 3.2 Prerequisites for this chapter 3.3 Set up a Git Client (GitKraken) 3.4 Get the exercise project files 3.5 Start a GitHub repository 3.6 More resources for learning GitHub", " Chapter 3 Using version control with GitHub 3.1 Learning Objectives In the introductory part of this course, we discussed some of the reasons for using GitHub but we didn’t get into version control (i.e. creating versions for managing changes over time) or GitHub’s capabilities much beyond its capacity to store code in a place where others can find it. In this advanced course, we will dig deeper into Git and GitHub’s capabilities so you can use this to your daily work’s advantage. However, to gain the benefit of these deeper GitHub skills, you will have to form some new habits. Fully embracing the GitHub workflow will make your work more efficient and help you create more transparent and reproducible analyses! In this chapter we’re going to introduce you to the basic git commands you’ll need, and guide you as we do them together one by one! 
3.2 Prerequisites for this chapter In order to complete this chapter you will need a GitHub account (it’s free). If you do not currently have a GitHub account, we recommend you go through our Intro to Github chapter from the Introduction to Reproducibility course first, then return to this chapter. 3.3 Set up a Git Client (GitKraken) Interaction with git and GitHub can be done completely from the command line, but sometimes this can be harder to keep track of. To help us navigate this, we recommend using a git client. There are a lot of different clients out there, and they are generally free for most situations you will need. In this course, we will take you through how to use GitKraken, one such git client. GitKraken is nice because they have lots of nice tutorials, it works pretty well, and its free for most use cases. But if you find GitKraken doesn’t work for you, you can explore other git clients. For this course, we’ll be using GitKraken. 3.3.1 Install GitKraken Go here to install GitKraken. Follow their instructions to sign in with your GitHub account. It will ask you to authorize your GitHub account to connect to GitKraken. Click Authorize. You may find it helpful to watch GitKraken’s own tutorial (linked below) about how to “git” started, but we will also guide you through each step! GitHub has a host of terms that can feel like a whole language at first, but we’ll introduce them one at a time. To start, a lot of the GitHub workflow centers around handling copies of your code that are either stored on the internet (are remote) or are stored on your computer (are local). Remote = GitHub on the internet Local = What’s on your own computer A repository, in the case of a data science project, is mostly synonymous with the word “project”. Using GitHub, a project will exist both as a remote repository and a local repository (in other words, it will be on the internet on GitHub and on your computer). 
Repository = a set of project files that have a location on GitHub 3.4 Get the exercise project files In this course, you can work on the exercises from your own GitHub repository, but first we will need to set that up. Below are the files you will want to upload to that repository. Depending on whether you prefer to use R or Python, you can choose to follow this course using one or the other. Get the Python project example files Click this link to download. Now double click your chapter zip file to unzip. For Windows you may have to follow these instructions. Get the R project example files Click this link to download. Now double click your chapter zip file to unzip. For Windows you may have to follow these instructions. 3.5 Start a GitHub repository Go to GitHub’s main page and sign in with your GitHub account. Follow these instructions to create a repository. As a general, but not absolute rule, you will want to keep one GitHub repository for one analysis project. Name the repository something that reminds you what it’s related to. For these examples, we’re using repository-name as our placeholder. Choose Public. Choose add a README. Follow these instructions to add all the files that are inside the reproducible-R-example.zip or reproducible-python-example.zip file you downloaded to this new repository. Your new repository should more or less look like this when you are done (with obviously some minor differences). {r, fig.alt=\"An image showing what a repository looks like with our example files loaded in. \", out.width = \"100%\", echo = FALSE} ottrpal::include_slide(\"https://docs.google.com/presentation/d/1IJ_uFxJud7OdIAr6p8ZOzvYs-SGDqa7g4cUHtUld03I/edit#slide=id.g2c22eaae560_0_0\") 3.5.1 git clone Now you have a repository on GitHub online! In our daily grind, we will work on this code from our own computer. To set this up, we’ll need to clone it to our own computer. Cloning is making a remote copy of the project local. 
clone = To make a remote repository local. In other words, to make an online repository downloaded and linked on your computer. To get started, you will need to clone the GitHub repository you created. We will be using this repository for the duration of this course. It is simple to clone a GitHub repository using GitKraken. First, sign in to GitKraken; under Repository Management > Clone tab, click Clone a repo. Then, choose where you’d like the repository to be on your computer using the Browse button. You will need to Copy + Paste your new repository’s URL (web address) to where it says URL. Navigate to your repository on GitHub to copy the URL. Copying and pasting is advisable because any little typo will inhibit cloning. Now you are ready to click Clone the repository! It will ask you if you’d like to Open Now, click that. 3.5.2 Create a branch Handling branches is where you unleash the real benefit of GitHub, but it’s also the confusing part to get the hang of. branch = a unique working copy of file changes of a GitHub repository. A branch can be local and remote. The best way to get a grasp on what the branches represent is to create one and start using it. In GitKraken we can create a new branch; this will be your working copy. First, click the Branch button. Next, type in a branch name in the box that the cursor is blinking in. In our example, we are calling it a-new-branch. Now click Enter! Now you have a new branch! Now we can edit our files and code however we normally would. Go ahead and make an edit to any file in your new repository. If you’ve made a change to any file in your repository, it will appear in GitKraken and you can click on it to see the differences. If we want to add these file changes to our current branch, we need to commit them. add = to stage your files to be committed to your current branch. commit = include your set of file changes to your current branch. 
Now that we have changes committed to our branch we are ready to add them to the remote, internet copy! To do this, we will need to push our branch. To push means to add changes that are on your new branch to the remote branch (internet version). You can select your origin, which refers to where your branch is stored on the internet. Choose your origin in the dropdown menu and click Submit. origin = where your branch is stored on the internet (remotely) push = to add changes from your branch to its remote counterpart. In other words, put your changes online. After a variable number of commits, your branch, called a-new-branch, is a different version of the original code base that may have a nifty improvement to it. But our main goal is to add that nifty improvement to the main branch. To start this process of bringing in new changes to the main curated repository, we will create a pull request. pull request = A way to propose changes from a branch to be included into the main repository. From GitHub: > Pull requests let you tell others about changes you’ve pushed to a GitHub repository. Once a pull request is sent, interested parties can review the set of changes, discuss potential modifications, and even push follow-up commits if necessary. Pull requests are the meat of how code changes and improvements get reviewed and incorporated! A vast majority of the benefits of incorporating GitHub into your workflow centers around fully utilizing the power of pull requests! Now we can open up a pull request if we go to our GitHub repository on GitHub. After you click on Compare & pull request you’ll be taken to a screen where you can add information about your changes. After you are done writing your description, click Create Pull Request! (If you don’t have your pull request description perfect don’t worry about it, you can always edit it later). Congrats! You’ve just opened a pull request! 
In an upcoming chapter we will discuss what information you should put in this pull request description to make it pertinent for yourself and whoever reviews your pull request. To summarize, below is what this workflow looks like: One more note: if you do want to use the command line or if you want to know more about the specific git commands that GitKraken is doing for you (which might be handy for troubleshooting), the specific commands that can be used or Googled at each step are highlighted in red in the images - you just need to add git before them! For example, you would type git push in your command line in order to push your code. Or if you’d like to know more about pushing code, you can google git push. 3.6 More resources for learning GitHub Happy Git and GitHub for the useR by Bryan and Hester (2021). GitHub for data scientists by Vickery (2019). Intro to GitHub by “Introduction to GitHub” (2022). First Day on GitHub by “First Day on GitHub” (2022). First Week on GitHub by “First Week on GitHub” (2022). GitHub docs about creating a Pull Request by “Creating a Pull Request” (2021). Making a Pull Request by Radigan (2021). If you have any feedback on this chapter, please fill out this form, we’d love to hear your feedback! References "],["providing-data.html", "Chapter 4 Providing data 4.1 Learning Objectives", " Chapter 4 Providing data 4.1 Learning Objectives The first part of any analysis should be getting all the data needed to run it. Data come in all kinds of formats and sizes so while we can’t give specifics on how to share your data we can provide these guidelines: 4.1.1 Overview of data sharing The data to be shared does not contain PII (personal identifiable information) or PHI (protected health information) information. The data are accessible by a download script that is automatically downloaded when re-running the analysis. Every data file needed to run the analysis is available. The data are downloaded to files in an organized manner. 
For more about project organization, see this chapter from the Introduction to Reproducibility course. 4.1.2 A very general example of a data download bash script As far as how to have your data downloaded, this will be dependent on where and how it’s stored online. The most general form of a data download script might look like this: #!/bin/bash # This is a template script for downloading data using the wget command # See docs here: https://www.gnu.org/software/wget/manual/wget.html mkdir <FOLDER_TO_SAVE_TO> # To see wget options, use -h (the help flag) wget -h wget -O <FOLDER/FILE_TO_SAVE_TO> <URL> You can download this general template download file here (Shapiro et al. 2021). 4.1.3 Examples of data download scripts Downloading data from GEO with GEOquery Data download script for multiple files of the same place Data download script - refine.bio example For more about data sharing techniques, see the Ethical Data Handling for Cancer Research course. References "],["engaging-in-code-review---as-an-author.html", "Chapter 5 Engaging in Code Review - as an author 5.1 Learning Objectives 5.2 Author responsibilities in code review 5.3 Characteristics of great pull requests 5.4 Exercise: Create your pull request description", " Chapter 5 Engaging in Code Review - as an author 5.1 Learning Objectives We’ve previously discussed that the only way to know if your analysis is truly reproducible is to send it to someone else to reproduce! That sentiment is at the heart of code review. Although most wouldn’t dare send out a manuscript for publishing without having our collaborators giving it a line-by-line review, people don’t always feel the same way about code. Parker (2017) describes code review: Code review will not guarantee an accurate analysis, but it’s one of the most reliable ways of establishing one that is more accurate than before. 
Not only does code review help boost the accuracy and reproducibility of the analysis, it also helps everyone involved in the process learn something new! An effective code review atmosphere is something that individuals and their team have to commit to (pun intended). Effective code review brings so many benefits not only to your project quality but also your communication skills through fostering a learning atmosphere! In this chapter and the next we will discuss the two sides of code review. Code review ideally includes at least two people: the author of the pull request and the reviewer of the pull request. Depending on your job context, we realize that sometimes authors have to become their own reviewers if code review is not something that can be prioritized by your institution or team. 5.2 Author responsibilities in code review The code review process begins with the creation of a pull request (which we practiced in the previous chapter). Successful and efficient code review is born out of quality communication, which is a skill set on its own. You can set your reviewers (and yourself) up for success by knowing what basic information can help get the code review conversation going. Even if you end up being the only person who will review your own code, writing these things out is still very helpful and highly recommended. It can help you spot problems you might not have otherwise seen and generally help you document better for future you! 5.3 Characteristics of great pull requests 5.3.1 There’s plenty of context! What’s the story behind the changes you are proposing? Sometimes when we are in the thick of a project we can make the mistake of assuming everyone knows what we know. This can unfortunately leave a huge burden on your reviewer to try to follow a paper trail to try to understand what you are doing. Before sending off a review request, re-read your PR description and think about the perspective of your reviewer. 
Err on the side that they have no idea what is happening on the project (because sometimes this is the case!) Tell a short story to explain what led you to make these changes, including attempting to answer these questions: What is the problem that these changes will solve? Do you have any URLs to relevant issues or files you can share? What inspired you to take this approach – are there other things you tried? Are there other pull requests related to this change? 5.3.2 Includes an explicit request for what kind of feedback is needed What would you like your reviewer to do with this pull request? Stating this explicitly can save both of you time in this code review. Are you still in the early stages and looking for a bigger picture review? Let them know that before they waste their time digging into the code line-by-line. Are you in the later stages and looking for detailed nit-picky review? Are you looking for feedback on the results or methods? 5.3.3 Points out questionable areas that need extra attention Are there specific areas of the code you are having trouble with or are unsure about? Send a link to the specific lines in GitHub you are asking about. Are there results that are surprising, confusing, or smell wrong? Be sure to detail what you have dug into and tried at this point for any problematic points. 5.3.4 Are relatively small and focused Try to make sure your pull requests aren’t too long! Code reviewing fatigue is very real. If you send a reviewer thousands of lines of code to review it will be very overwhelming to review or understand. 10 lines of code = 10 issues. 500 lines of code = \"looks fine.\" Code reviews. — I Am Devloper (@iamdevloper) November 5, 2013 Alternatively, when you create a new branch try to set a very intentional (and relatively small) goal you would like to achieve with your upcoming pull request. 
Keeping your pull requests small and focused on one task at a time will not only help your reviewers but also will help yourself feel more accomplished and organized. Also recall that incremental changes are good! Perhaps you do have a very large restructuring of your repository you are trying to accomplish, but finding smaller reasonable sets of changes (which would each have their own pull requests) to reach that goal incrementally can help keep things more manageable. Using Stacked Pull Requests in GitHub 5.3.5 Don’t ask a reviewer to dig through dirty code Determining when a pull request is fully cooked and ready for review is a skill in itself. Pull requests that haven’t had enough time to be polished can put an unnecessarily larger burden on the reviewer. On the other hand, pull requests that have been hashed and rehashed in a silo might have benefitted from big picture feedback at an earlier stage of the code. This is something that you and your team can figure out a balance for in time using lots of communication! This being said, the first reviewer of your code should always be yourself! Take time to review your own changes by clicking on the Files Changed tab and going over that section carefully. Are all the changes included that you were expecting? Are there any changes you didn’t expect that are showing up? These can be symptomatic of a deeper problem. Definitely dig into anything that is not what you expected. Set aside your changes and return to them in a few hours, or the next day. Looking at your changes with fresh eyes may also allow you to find things you didn’t notice before. Additional tip, if you don’t want others to look at your pull request yet because you are still working on reviewing it, you can change it to a draft pull request so no one reviews it before you are ready. 
This can also be a handy tactic to use if you just want to ask for big picture feedback from someone but want to make it clear that it is not anywhere near ready for merging to main. 5.3.5.1 In summary: Let’s revisit our scenario with Avi and Ruby and see how Ruby could better prepare her changes for review: In this scenario Ruby was able to save Avi time in getting into the code review by being more specific about what kind of feedback she is looking for as well as links that explain the context behind these changes. Additionally, by supplying Avi with a smaller PR, Avi is less likely to be overwhelmed by Ruby’s request and be able to give her suggestions in a more timely manner. 5.4 Exercise: Create your pull request description Add a pull request template to your repository! This will help initiate consistent and clear communication around the pull requests in your repository. Pull request templates are a way to give yourself and other contributors prompts when starting a new pull request. See below for an example. The comments between <!-- and --> are html comments that will not show up so you don’t need to delete them if you don’t want to. On the right side, it shows how this template looks when it’s rendered. You can see this at any time by clicking Preview – this is true in other places in GitHub. 5.4.0.1 Set up a pull request template Create a new branch as we described in the previous chapter. In your local repository, create a folder called .github Copy and paste this pull request template file to a new text file and save it as a .md to get started. Feel free to edit this file to your own needs and add it to the .github folder of your repository. Use GitKraken to add and commit this new file. Push this commit. Open up a pull request. Craft your pull request description based on what we discussed in this chapter. Click on the Files Changed tab and make sure it includes the .github/PULL_REQUEST_TEMPLATE.md file.
Walk away from your pull request and then return later and review it yourself. Make any necessary changes. When you are ready, request a reviewer by choosing someone underneath Reviewer on the right side! 5.4.1 Preparing for the return of your review As you wait for your reviewer to get back to you, it can be helpful to remind yourself of the purpose of code reviews and get yourself in a positive mindset. You’ve given your reviewer information to help them help you and now is the time to wait. First of all, you should pat yourself on the back for engaging in code review. It does require more time and sometimes that can feel scary with looming deadlines, but kudos for being able to prioritize your commitment to creating increasingly reproducible analyses! Remember that you are not your code and mistakes are all a part of the process! Putting your project out there can feel a tad vulnerable even. You may have felt the impulse to keep your code’s problems buried under a rug, but you pushed past that and are making your analyses transparent! Remember that hidden problems don’t get solved, but known problems are opportunities for reaching an even better end result than you imagined! When you receive a review back remember that you and the reviewer are on the same team and both want the best end result feasible for this project! They may suggest ideas that you love and can’t wait to implement. They also might suggest ideas you don’t agree with. Do your best to take all their comments as positive learning opportunities and look for ways to compromise and determine solutions collaboratively. 5.4.1.1 Recommended reading about code review Why code reviews matter (and actually save time!) by Radigan (2021). Pull request descriptions by Bañuelos (2020). A zen manifesto for effective code reviews by Jean-Charles (2019). Best practices for Code Review by Smartbear Team (2021). If you have any feedback on this chapter, please fill out this form, we’d love to hear your feedback!
References "],["engaging-in-code-review---as-a-reviewer.html", "Chapter 6 Engaging in Code Review - as a reviewer 6.1 Learning Objectives 6.2 Reviewer responsibilities in code review", " Chapter 6 Engaging in Code Review - as a reviewer 6.1 Learning Objectives 6.2 Reviewer responsibilities in code review When reviewing a pull request, you take on responsibility to ensure that the pull request is getting the project to a better state than before. There are three aspects to reviewing we will focus on: Identify areas in the code and documentation that are opportunities for improvement. Communicate your questions and concerns effectively and in a way that creates a positive atmosphere. Determine solutions collaboratively in a way that allows for a learning as well as a long term improved product. 6.2.1 What to look for! Depending on the goals of the project, and pull request there can be a lot to keep an eye out for. There are many articles out there about what to look for in a code review. Here’s some general points: Does the analysis answer the question it’s asking? Are the methods it uses to do so appropriate? Is the code clear and readable? Does it contain a healthy amount of comments and documentation for individuals not familiar with the project to understand generally what is going on? Is the code efficient with computational resources? (Are there areas it’s a bit too greedy with memory usage?) Does the code stick to the style and conventions of this project? Are there alternate scenarios where the current strategy might fail? (depending on the likelihood of this use case, this may be an instance for a new issue and for it to be addressed in a different pull request). 6.2.2 How to communicate it The pull request may be the author’s precious bundle. Try to be empathetic to the learning process! You are both working on this project together – assume you both want the best out of this project. 
If something seems wrong, work together to find a solution, don’t ever waste time on placing blame. Remember that everything sounds harsher when you don’t have in-person cues! In this example, Avi may be stating factual things, but without his pleasant and reassuring disposition, it can feel super harsh. If Avi had reframed his comments, they might be more effective in this collaboration. Babatunde (2018) suggests framing review comments in three ways to help communication: questions, suggestions, and appreciations. 6.2.2.1 Questions For example: What happens if this doesn’t get saved? Does it throw an exception or fails silently? The key is to be specific with the questions. Mention exact file names. Put comments on the line you are referring to. Explain what you think is happening and ask them to explain if that is correct. 6.2.2.2 Suggestions For example: I suggest you use an ArrayHelper getValue method here because of its error handling capability instead of accessing the value directly You could even go further by giving an example: $a = $b[‘key’]; would raise an error if key is not set but $a = ArrayHelper::getValue($b, ‘key’); would return a null value if key is not set. Giving suggestions and explaining not only how to implement it but why it might be preferred in this scenario is a great learning process both for the author and yourself. 6.2.2.3 Appreciations Start every review comment with an appreciation for the hard work completed! This goes a long way for creating a positive atmosphere. For example: Nice Job! Alice. I suggest we create an interface for this service so other substitute services can implement the interface as well, this would enable us change to a different service with very minimal efforts when the need arises. What do you think? Let’s see how Avi’s message could have been reworked to give a more effective review: This interaction reminds us that effective code review is steeped in empathy from both sides.
Authors need to appreciate the time and effort the reviewer is spending to help them; while reviewers need to be sensitive to the amount of effort put in by the author already. 6.2.3 Exercise: Review Past you’s code Find the oldest code you wrote and currently have on your computer. Create a repository and pull request with this old code, following the general steps for creating a repository and pull request from the previous chapter. Request yourself as a reviewer. Review the code on Github using their docs as a guide for the mechanics of it. As you review, have empathy for past yourself, and give questions, appreciations, and suggestions in regards to this code. 6.2.3.1 Recommended reading about code review Comments during Code Reviews by Babatunde (2018) On Empathy and Pull Requests by Duretti Hirpa (2016). Code Review Guidelines for Humans by Hauer (2018). Your Code Sucks! – Code Review Best Practices by Hildebr (2020). An even longer list of readings about code review If you have any feedback on this chapter, please fill out this form, we’d love to hear your feedback! References "],["launching-a-docker-image.html", "Chapter 7 Launching a Docker image 7.1 Learning Objectives 7.2 What’s Docker? 7.3 Install Docker 7.4 Getting started with Docker 7.5 More about Docker", " Chapter 7 Launching a Docker image 7.1 Learning Objectives In the introductory part of this course, we discussed package managers like renv or conda. Recall that even if you have the same packages installed between two computers, you can still get different results! This is because package versions do influence results as demonstrated by Beaulieu-Jones and Greene (2017). Package managers address part of this problem, however their limitation is that generally only can help with certain sets of packages. conda really only manages conda installed packages and renv doesn’t help with package management outside of R. Both of these have limited capabilities for cross platform shipping. 
This is where Docker can help fill in the gaps. I don’t even count anymore how many times did my code break when someone else run it. The strange part was — it worked on my machine. That’s where Docker saves the day. If it works on your machine, it will work on any. Radečić (2020) 7.2 What’s Docker? One way to ensure that her collaborators have the same computing environment is Ruby could ship her computer to each of her collaborators and have them run the analysis on her computer. But before you buy hundreds of laptops for your projects, we’ll show you how Docker will allow you to send your computing environment to your collaborators in a practical manner. Ruby can create a Docker image that Avi can use to run the analysis. This way Ruby and Avi know they are using the same computing environment. Now if Ruby and Avi obtain different results, it won’t be because of version differences. 7.3 Install Docker Go here to install Docker, following the instructions for your particular operating system. If you don’t have a Docker account create an account when prompted, or go here. After you install Docker, start up Docker desktop by double clicking on the app. It may take some time to start up. 7.4 Getting started with Docker Open up your command line. First we need to get the Docker image. A Docker image is like a snapshot of your computing environment that you can move from place to place. We can download images from online and then use them to make a container. Containers are what we use to actually run analyses. From command line, run one of these commands depending on whether you’d like to use Python or R: To obtain the python docker image docker pull jhudsl/reproducible-python To obtain the R docker image docker pull jhudsl/reproducible-r Open up the Docker Desktop app. Click on ‘images’ on the left. This shows the images you currently have available on your computer. Return to your command line. 
Using cd and ls navigate to your project repository (or whatever files you’d like to be accessible in your development environment) and we can start up a docker container using docker run. To run the Python docker image docker run --rm -v $PWD:/home/jovyan/work -e JUPYTER_ENABLE_LAB=yes -p 8787:8787 jhudsl/reproducible-python Now in your internet browser, go to the address printed out. It should take you to Jupyter Lab. Now you are ready to develop inside a Docker container! To run the R docker image But you can change the password to whatever you’d like. docker run --rm -v $PWD:/home/rstudio -e PASSWORD=password -p 8787:8787 jhudsl/reproducible-r Now in your internet browser, go to localhost:8787. You should see an RStudio login page. Login to RStudio. Your username will be rstudio and your password, will be whatever you set your password to be. Now you are ready to develop inside a Docker container! To see what containers you have running or to clear out old containers, in Docker Desktop you can go to the Containers/Apps page. 7.4.1 A Breakdown what these Docker run options are Docker has super extensive documentation! But to get you started, here’s the highlights for this docker run command: The remove option (--rm) Automatically removes the container when docker run exits. The volume option (-v) is how you specify what files you’d like available in the container and where to find them. In this instance we are using the output of the pwd command (print working directory) so that wherever you run this command, the files in that directory will be available in the container. The part after the colon specifies where these files will be found in the container. The environment option (-e) is how you can specify any environment variables you will need. In this instance for the rocker image we need to specify a password. but for the python image we needed to specify JUPYTER_ENABLE_LAB=yes so that we can use Jupyter Lab. 
The port option (-p) is how you specify what port you can connect to this on using your internet browser. The image to use is specified in the last part of the docker run command says what image to run, so in these instances, we are running a container using the jhudsl/reproducible-r or jhudsl/reproducible-python images. 7.5 More about Docker Docker tutorial for beginners by Srivastav (2018). 7.5.0.1 Python specific: Jupyter Docker stacks by “Jupyter Docker Stacks — Docker-Stacks Latest Documentation” (2018). How to Run Jupyter Notebook on Docker by Okada (2021). 7.5.0.2 R specific: Launching RStudio in Docker by openscilabs (2021). Getting started with R and Docker by Neuzerling (2018). If you have any feedback on this chapter, please fill out this form, we’d love to hear your feedback! References "],["modifying-a-docker-image.html", "Chapter 8 Modifying a Docker image 8.1 Learning Objectives 8.2 Managing images 8.3 Exercise: Build a Docker image", " Chapter 8 Modifying a Docker image 8.1 Learning Objectives The docker image you are using from the last chapter was pre-made for you, but you will find depending on the needs of your project, that you may need different packages installed. In this chapter we will introduce you to the basics of how to manage your own Docker image. 8.2 Managing images Images can be on your own computer or on dockerhub. To see your list of images on your computer, you can go to Docker desktop. From here you will want to delete images and containers periodically because they do take up room on your computer. To see what images you have on your internet repository, you can log on to dockerhub. Go here to login (or create a username if you have not yet). After you sign into dockerhub, click on the Repositories tab, so you can see the list of repositories you have stored online. At this point, you won’t have any if you just created your dockerhub account. To create a new repository, click the ‘Create Repository’ button. 
Upon adding the new repository to dockerhub, you will need to name it the same as whatever you are calling it locally. You can put a description and name and click create. On the right it shows how you can interact with this from your local command line. After you’ve created the image repository, you will be brought to the image repository page. It will tell you Last pushed: never. On the right it will tell you the command you will need in order to push the image to dockerhub. Go to your local command line and use the command specified on the right side of your repository page. You don’t have to specify a tagname if you don’t want to. If you don’t want to specify a tagname, just leave off the :tagname. Now you will be able to test pulling your image using docker pull <image name> like we did in the previous chapter. You can also click on the Public View button to copy the pull command for your Docker image. Docker images can be pulled from being stored online but these images are built originally from a Dockerfile. 8.3 Exercise: Build a Docker image A Dockerfile is a recipe for how to build a docker image. The best way to learn to write Dockerfiles is to start off with one that is already written and modify it for your needs. You can practice building a docker image by downloading one of the Dockerfiles we have started and changing it slightly. 8.3.1 Download an example Dockerfile Get the Python Dockerfile Download the example Dockerfile for Python analyses. wget https://raw.githubusercontent.com/jhudsl/Adv_Reproducibility_in_Cancer_Informatics/main/resources/python-docker/Dockerfile If you get a message like command not found that means you will need to install wget. Alternatively, you can navigate to the Dockerfile’s page on GitHub and use File > Save as but do not add any suffix to the end of the file (no .txt or anything). Just save it as Dockerfile. Get the R Dockerfile Download the example Dockerfile for R analyses.
wget https://raw.githubusercontent.com/jhudsl/Adv_Reproducibility_in_Cancer_Informatics/main/resources/r-docker/Dockerfile If you get a message like command not found that means you will need to install wget. Alternatively, you can navigate to the Dockerfile’s page on GitHub and use File > Save as but do not add any suffix to the end of the file (no .txt or anything). Just save it as Dockerfile. 8.3.2 Build a Docker image from a Dockerfile Place this newly downloaded Dockerfile with the rest of your project files. Build a docker image from this Dockerfile using the command below, but replace image_name with what you would like your modified image to be called. docker build -f Dockerfile . -t image_name Navigate back to your Docker desktop and the images window. If your image built successfully, you should see a new image in your list! 8.3.3 Modify a Docker image If you want to add or remove a package from a Docker image, you’ll need to modify the Dockerfile. Using your preferred text editor (or RStudio or Jupyter Lab), open up the Dockerfile. You will see the first line in the Docker image is a FROM command. This is a command that will take another docker image to start from. - For our R example, we are starting off with an image that already has R and the tidyverse. - For our Python example we are starting off with an image that already has Python and Jupyter Lab. There are so many Docker images out there, that it might be that someone has already created a docker image with most of the functionality you need for your project. FROM is one of the main commands that a Dockerfile can take as described by their documentation: FROM creates a layer from another Docker image. COPY adds files from your Docker client’s current directory. RUN builds your application with make. CMD specifies what command to run within the container. 8.3.4 Add to the Dockerfile To get a feel for how these work, let’s add a line to the your example Dockerfile.
Using your preferred text editor (or RStudio or Jupyter Lab), open up the Dockerfile and add this line at the very end of the file. Do not add this line to the start of the file as this will not work. The FROM command needs to come first. CMD ["echo","Yay! I added to this Docker image"] Now re-run docker build as you did in the previous section. (Use the command below but replace image_name with whatever your image is called). docker build -f Dockerfile . -t image_name If all built successfully, you should see a message like: => exporting to image 0.0s => => exporting layers 0.0s => => writing image sha256:ayuahgfuiseohfauwheufhauwihefuahweufhawfbuibe 0.0s => => naming to docker.io/library/image_name Now to run the image we can use the docker run command we used in the previous chapter (see below) and we should have a message: Yay! I added to this Docker image pop up upon running the container. To run your new Python docker image But replace image_name with whatever you have called your image. docker run --rm -v $PWD:/home/jovyan/work -e JUPYTER_ENABLE_LAB=yes -p 8787:8787 image_name To run the R docker image But replace image_name with whatever you have called your image. docker run --rm -v $PWD:/home/rstudio -e PASSWORD=password -p 8787:8787 image_name Stop and remove these containers before moving on. You can do this by going to Docker desktop and clicking on the trash can button next to each container. For images click Clean up to check off the images you’d like to remove and then hit Remove. 8.3.5 Add another package! Starting off with your example Dockerfile, we will practice adding another package and re-build the docker image with a new package. Note that spacing is important as well as having a \\ at the end of each line if the command is continuing.
8.3.5.1 Adding an R package To add R packages from CRAN, you can use this kind of format: RUN Rscript -e "install.packages( \\ c('BiocManager', \\ 'R.utils', \\ 'newpackagename'))" To add an R package from Bioconductor, you can follow this kind of format: RUN Rscript -e "options(warn = 2); BiocManager::install( \\ c('limma', \\ 'newpackagename'))" To add a Python package using pip, you will need to use pip3 to install Python packages using this format: RUN pip3 install \\ "somepackage==0.1.0" There are so many things you can add to your Docker image. (Picture whatever software and packages you are using on your computer). We can only get you started for the feel of how to build a Dockerfile, and what you put on your Docker image will be up to you. To figure out how to add something, a good strategy is to look for other Dockerfiles that might have the package you want installed and borrow their RUN command. Then try to re-build your Docker image with that added RUN command and see if it builds successfully. And lastly, make sure that whatever changes you make to your Dockerfile, that you add it to your GitHub repository by creating a pull request as we did in Chapter 3. 8.3.6 More about Docker next steps Dockerfile Tutorial by Example. Dockerfile examples 8.3.7 A list of handy Docker commands: Get info on current containers: docker ps How to stop an individual container: docker container ls docker stop <containerID> Get rid of all non-running containers: docker container prune Stop all containers: docker stop $(docker ps -a -q) Remove all containers: docker rm -f $(docker ps -a -q) If you have any feedback on this chapter, please fill out this form, we’d love to hear your feedback!
"],["automation-as-a-reproducibility-tool.html", "Chapter 9 Automation as a reproducibility tool 9.1 Learning Objectives 9.2 Build an example GitHub Actions 9.3 Exercise: Set up a GitHub action", " Chapter 9 Automation as a reproducibility tool 9.1 Learning Objectives We’ve discussed that a reproducible analysis can be run by someone else and obtain the same result. But what if before you bug your colleague to use their time to re-run your analysis, you had a robot re-run your analysis? Robots don’t get tired or have other deadlines to respond to and can be set up to re-run your analysis at any time. This is the basis of why automation is powerful tool for reproducibility. There are a lot of applications for GitHub Actions (see links at the end of this chapter) but in the context of our R and Python examples or for scientific notebooks in general, it can be useful to build a GitHub Actions that re-runs the notebook every time a pull request is opened. This is useful because if the notebook does not re-run successfully by GitHub actions, this can be informative to something being amiss in the changes being made. 9.2 Build an example GitHub Actions 9.2.1 Structure of GitHub actions file GitHub actions are written yaml file that you store in a folder called .github/workflows in your GitHub repository. They have two main parts: the trigger: on: the action: job: The trigger is specified by on: and the action that happens upon the trigger being activated is specified by jobs:. The job can be made up of multiple steps:. on: # Some stuff that specifies when the action should run jobs: # The action that should run 9.2.2 Setting up the trigger There’s a list of things that happen in GitHub that can be used to trigger a GitHub actions. See the list here in this case, we will set up a github action that happens whenever a pull request is opened that is going to the main branch. 
on: pull_request: branches: - main jobs: # The action that should run 9.2.3 Setting up the action The action part of the GitHub action can be named something (here we are calling it name-of-job and we can use the runs-on: to specify a docker image to run this on. For this we will use a base image of ubuntu-latest. This simple action will run a bash command echo to say \"GitHub action is run!\". on: pull_request: branches: - main jobs: name-of-job: runs-on: ubuntu-latest - name: Run message run: echo "GitHub action is run!" 9.3 Exercise: Set up a GitHub action Use a GitHub Action Template by following these instructions. You will need to navigate to your own repository to do this. Tips for developing a GitHub Action: As you are adding your GitHub actions, consult the GitHub actions log. GitHub actions has pretty great documentation so as you are setting up your GitHub actions template, you will want to reference them. Be careful with your spacing this will break your GitHub action. Take a look at other GitHub actions that are doing something similar to what you are trying to accomplish. For testing purposes, modify the trigger so you can test it. You may want to use a manual workflow trigger or pull request: and push:. Use | in your run: command to give a multi-line command. 9.3.1 Resources for setting up your GitHub Actions Python example GitHub Actions to re-run notebook R example GitHub Actions to re-run notebook Great course about GitHub actions Introduction to GitHub Actions for data scientists. If you have any feedback on this chapter, please fill out this form, we’d love to hear your feedback! "],["about-the-authors.html", "About the Authors", " About the Authors These credits are based on our course contributors table guidelines.     
Credits Names Pedagogy Lead Content Instructor(s) Candace Savonen Lecturer(s) Candace Savonen Content Directors Jeff Leek, Sarah Wheelan Content Reviewer Sarah Wheelan Content Editor Jimin Hwang Acknowledgments Production Content Publisher Ira Gooding Content Publishing Reviewers Ira Gooding Technical Course Publishing Engineer Candace Savonen Template Publishing Engineers Candace Savonen, Carrie Wright Publishing Maintenance Engineer Candace Savonen Technical Publishing Stylists Carrie Wright, Candace Savonen Package Developers (ottrpal)John Muschelli, Candace Savonen, Carrie Wright Art and Design Illustrator Candace Savonen Jimin Hwang Figure Artist Candace Savonen Videographer Candace Savonen Videography Editor Candace Savonen Funding Funder National Cancer Institute (NCI) UE5 CA254170 Funding Staff Emily Voeglein, Fallon Bachman   ## ─ Session info ─────────────────────────────────────────────────────────────── ## setting value ## version R version 4.0.2 (2020-06-22) ## os Ubuntu 20.04.3 LTS ## system x86_64, linux-gnu ## ui X11 ## language (EN) ## collate en_US.UTF-8 ## ctype en_US.UTF-8 ## tz Etc/UTC ## date 2024-06-28 ## ## ─ Packages ─────────────────────────────────────────────────────────────────── ## package * version date lib source ## assertthat 0.2.1 2019-03-21 [1] RSPM (R 4.0.3) ## bookdown 0.24 2022-02-15 [1] Github (rstudio/bookdown@88bc4ea) ## callr 3.4.4 2020-09-07 [1] RSPM (R 4.0.2) ## cli 2.0.2 2020-02-28 [1] RSPM (R 4.0.0) ## crayon 1.3.4 2017-09-16 [1] RSPM (R 4.0.0) ## desc 1.2.0 2018-05-01 [1] RSPM (R 4.0.3) ## devtools 2.3.2 2020-09-18 [1] RSPM (R 4.0.3) ## digest 0.6.25 2020-02-23 [1] RSPM (R 4.0.0) ## ellipsis 0.3.1 2020-05-15 [1] RSPM (R 4.0.3) ## evaluate 0.14 2019-05-28 [1] RSPM (R 4.0.3) ## fansi 0.4.1 2020-01-08 [1] RSPM (R 4.0.0) ## fs 1.5.0 2020-07-31 [1] RSPM (R 4.0.3) ## glue 1.6.1 2022-01-22 [1] CRAN (R 4.0.2) ## hms 0.5.3 2020-01-08 [1] RSPM (R 4.0.0) ## htmltools 0.5.0 2020-06-16 [1] RSPM (R 4.0.1) ## jquerylib 0.1.4 
2021-04-26 [1] CRAN (R 4.0.2) ## knitr 1.33 2022-02-15 [1] Github (yihui/knitr@a1052d1) ## lifecycle 1.0.0 2021-02-15 [1] CRAN (R 4.0.2) ## magrittr 2.0.2 2022-01-26 [1] CRAN (R 4.0.2) ## memoise 1.1.0 2017-04-21 [1] RSPM (R 4.0.0) ## ottrpal 0.1.2 2022-02-15 [1] Github (jhudsl/ottrpal@1018848) ## pillar 1.4.6 2020-07-10 [1] RSPM (R 4.0.2) ## pkgbuild 1.1.0 2020-07-13 [1] RSPM (R 4.0.2) ## pkgconfig 2.0.3 2019-09-22 [1] RSPM (R 4.0.3) ## pkgload 1.1.0 2020-05-29 [1] RSPM (R 4.0.3) ## prettyunits 1.1.1 2020-01-24 [1] RSPM (R 4.0.3) ## processx 3.4.4 2020-09-03 [1] RSPM (R 4.0.2) ## ps 1.3.4 2020-08-11 [1] RSPM (R 4.0.2) ## purrr 0.3.4 2020-04-17 [1] RSPM (R 4.0.3) ## R6 2.4.1 2019-11-12 [1] RSPM (R 4.0.0) ## readr 1.4.0 2020-10-05 [1] RSPM (R 4.0.2) ## remotes 2.2.0 2020-07-21 [1] RSPM (R 4.0.3) ## rlang 0.4.10 2022-02-15 [1] Github (r-lib/rlang@f0c9be5) ## rmarkdown 2.10 2022-02-15 [1] Github (rstudio/rmarkdown@02d3c25) ## rprojroot 2.0.2 2020-11-15 [1] CRAN (R 4.0.2) ## sessioninfo 1.1.1 2018-11-05 [1] RSPM (R 4.0.3) ## stringi 1.5.3 2020-09-09 [1] RSPM (R 4.0.3) ## stringr 1.4.0 2019-02-10 [1] RSPM (R 4.0.3) ## testthat 3.0.1 2022-02-15 [1] Github (R-lib/testthat@e99155a) ## tibble 3.0.3 2020-07-10 [1] RSPM (R 4.0.2) ## usethis 2.1.5.9000 2022-02-15 [1] Github (r-lib/usethis@57b109a) ## vctrs 0.3.4 2020-08-29 [1] RSPM (R 4.0.2) ## withr 2.3.0 2020-09-22 [1] RSPM (R 4.0.2) ## xfun 0.26 2022-02-15 [1] Github (yihui/xfun@74c2a66) ## yaml 2.2.1 2020-02-01 [1] RSPM (R 4.0.3) ## ## [1] /usr/local/lib/R/site-library ## [2] /usr/local/lib/R/library "],["references.html", "References", " References "],["404.html", "Page not found", " Page not found The page you requested cannot be found (perhaps it was moved or renamed). You may want to try searching to find the page's new location, or use the table of contents to find the page you are looking for. "]]