diff --git a/docs/10-effective-data-storytelling.html b/docs/10-effective-data-storytelling.html
index fb1563b0c..36a2f1fc0 100644
--- a/docs/10-effective-data-storytelling.html
+++ b/docs/10-effective-data-storytelling.html
@@ -26,14 +26,14 @@
 <meta name="author" content="Chester Ismay and Albert Y. Kim">
 
 
-<meta name="date" content="2017-01-07">
+<meta name="date" content="2017-01-10">
 
   <meta name="viewport" content="width=device-width, initial-scale=1">
   <meta name="apple-mobile-web-app-capable" content="yes">
   <meta name="apple-mobile-web-app-status-bar-style" content="black">
   
   
-<link rel="prev" href="9-regression-via-broom.html">
+<link rel="prev" href="9-regress.html">
 <link rel="next" href="A-appendixA.html">
 
 <script src="libs/jquery-2.2.3/jquery.min.js"></script>
@@ -119,9 +119,9 @@
 
 <ul class="summary">
 <li class="chapter" data-level="1" data-path="index.html"><a href="index.html"><i class="fa fa-check"></i><b>1</b> Preamble</a><ul>
-<li class="chapter" data-level="1.1" data-path="index.html"><a href="index.html#principles-of-this-book"><i class="fa fa-check"></i><b>1.1</b> Principles of this Book</a></li>
+<li class="chapter" data-level="1.1" data-path="index.html"><a href="index.html#principles-of-this-book---for-instructors"><i class="fa fa-check"></i><b>1.1</b> Principles of this Book - For Instructors</a></li>
 <li class="chapter" data-level="1.2" data-path="index.html"><a href="index.html#contribute"><i class="fa fa-check"></i><b>1.2</b> Contribute</a></li>
-<li class="chapter" data-level="1.3" data-path="index.html"><a href="index.html#getting-started"><i class="fa fa-check"></i><b>1.3</b> Getting Started</a></li>
+<li class="chapter" data-level="1.3" data-path="index.html"><a href="index.html#getting-started---for-students"><i class="fa fa-check"></i><b>1.3</b> Getting Started - For Students</a></li>
 <li class="chapter" data-level="" data-path="index.html"><a href="index.html#colophon"><i class="fa fa-check"></i>Colophon</a></li>
 </ul></li>
 <li class="chapter" data-level="2" data-path="2-intro.html"><a href="2-intro.html"><i class="fa fa-check"></i><b>2</b> Introduction</a><ul>
@@ -133,87 +133,100 @@
 </ul></li>
 <li class="part"><span><b>I Data Exploration</b></span></li>
 <li class="chapter" data-level="3" data-path="3-tidy.html"><a href="3-tidy.html"><i class="fa fa-check"></i><b>3</b> Tidy Data</a><ul>
+<li class="chapter" data-level="" data-path="3-tidy.html"><a href="3-tidy.html#needed-packages"><i class="fa fa-check"></i>Needed packages</a></li>
 <li class="chapter" data-level="3.1" data-path="3-tidy.html"><a href="3-tidy.html#what-is-tidy-data"><i class="fa fa-check"></i><b>3.1</b> What is tidy data?</a></li>
-<li class="chapter" data-level="3.2" data-path="3-tidy.html"><a href="3-tidy.html#the-nycflights13-datasets"><i class="fa fa-check"></i><b>3.2</b> The <code>nycflights13</code> datasets</a></li>
-<li class="chapter" data-level="3.3" data-path="3-tidy.html"><a href="3-tidy.html#how-is-flights-tidy"><i class="fa fa-check"></i><b>3.3</b> How is <code>flights</code> tidy?</a></li>
+<li class="chapter" data-level="3.2" data-path="3-tidy.html"><a href="3-tidy.html#datasets-in-the-nycflights13-package"><i class="fa fa-check"></i><b>3.2</b> Datasets in the <code>nycflights13</code> package</a></li>
+<li class="chapter" data-level="3.3" data-path="3-tidy.html"><a href="3-tidy.html#how-is-flights-tidy"><i class="fa fa-check"></i><b>3.3</b> How is <code>flights</code> tidy?</a><ul>
+<li class="chapter" data-level="3.3.1" data-path="3-tidy.html"><a href="3-tidy.html#identification-variables"><i class="fa fa-check"></i><b>3.3.1</b> Identification variables</a></li>
+</ul></li>
 <li class="chapter" data-level="3.4" data-path="3-tidy.html"><a href="3-tidy.html#normal-forms-of-data"><i class="fa fa-check"></i><b>3.4</b> Normal forms of data</a></li>
 <li class="chapter" data-level="3.5" data-path="3-tidy.html"><a href="3-tidy.html#whats-to-come"><i class="fa fa-check"></i><b>3.5</b> What’s to come?</a></li>
 </ul></li>
-<li class="chapter" data-level="4" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html"><i class="fa fa-check"></i><b>4</b> Data Visualization via <code id="viz">ggplot2</code></a><ul>
-<li class="chapter" data-level="" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#needed-packages"><i class="fa fa-check"></i>Needed packages</a></li>
-<li class="chapter" data-level="4.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#grammarofgraphics"><i class="fa fa-check"></i><b>4.1</b> The Grammar of Graphics</a><ul>
-<li class="chapter" data-level="4.1.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#components-of-grammar"><i class="fa fa-check"></i><b>4.1.1</b> Components of Grammar</a></li>
-<li class="chapter" data-level="4.1.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#napoleans-march-on-moscow"><i class="fa fa-check"></i><b>4.1.2</b> Napolean’s March on Moscow</a></li>
-<li class="chapter" data-level="4.1.3" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#other-components-of-the-grammar"><i class="fa fa-check"></i><b>4.1.3</b> Other Components of the Grammar</a></li>
-<li class="chapter" data-level="4.1.4" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#the-ggplot2-package"><i class="fa fa-check"></i><b>4.1.4</b> The <code>ggplot2</code> Package</a></li>
-</ul></li>
-<li class="chapter" data-level="4.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#five-named-graphs---the-5ng"><i class="fa fa-check"></i><b>4.2</b> Five Named Graphs - The 5NG</a></li>
-<li class="chapter" data-level="4.3" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#scatterplots"><i class="fa fa-check"></i><b>4.3</b> 5NG#1: Scatter-plots</a><ul>
-<li class="chapter" data-level="4.3.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#scatter-plots-via-geom_point"><i class="fa fa-check"></i><b>4.3.1</b> Scatter-plots via <code id="geompoint">geom_point</code></a></li>
-<li class="chapter" data-level="4.3.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#over-plotting"><i class="fa fa-check"></i><b>4.3.2</b> Over-Plotting</a></li>
-<li class="chapter" data-level="4.3.3" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#summary"><i class="fa fa-check"></i><b>4.3.3</b> Summary</a></li>
-</ul></li>
-<li class="chapter" data-level="4.4" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#linegraphs"><i class="fa fa-check"></i><b>4.4</b> 5NG#2: Line-graphs</a><ul>
-<li class="chapter" data-level="4.4.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#line-graphs-via-geom_line"><i class="fa fa-check"></i><b>4.4.1</b> Line-graphs via <code id="geomline">geom_line</code></a></li>
-<li class="chapter" data-level="4.4.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#summary-1"><i class="fa fa-check"></i><b>4.4.2</b> Summary</a></li>
-</ul></li>
-<li class="chapter" data-level="4.5" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#histograms"><i class="fa fa-check"></i><b>4.5</b> 5NG#3: Histograms</a><ul>
-<li class="chapter" data-level="4.5.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#histograms-via-geom_histogram"><i class="fa fa-check"></i><b>4.5.1</b> Histograms via <code id="geomhistogram">geom_histogram</code></a></li>
-<li class="chapter" data-level="4.5.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#adjustbins"><i class="fa fa-check"></i><b>4.5.2</b> Adjusting the Bins</a></li>
-<li class="chapter" data-level="4.5.3" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#summary-2"><i class="fa fa-check"></i><b>4.5.3</b> Summary</a></li>
-</ul></li>
-<li class="chapter" data-level="4.6" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#facets"><i class="fa fa-check"></i><b>4.6</b> Facets</a></li>
-<li class="chapter" data-level="4.7" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#ng4-boxplots"><i class="fa fa-check"></i><b>4.7</b> 5NG#4: Boxplots</a><ul>
-<li class="chapter" data-level="4.7.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#boxplots-via-geom_boxplot"><i class="fa fa-check"></i><b>4.7.1</b> Boxplots via <code id="geomboxplot">geom_boxplot</code></a></li>
-<li class="chapter" data-level="4.7.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#summary-3"><i class="fa fa-check"></i><b>4.7.2</b> Summary</a></li>
-</ul></li>
-<li class="chapter" data-level="4.8" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#geombar"><i class="fa fa-check"></i><b>4.8</b> 5NG#5: Barplots</a><ul>
-<li class="chapter" data-level="4.8.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#barplots-via-geom_bar"><i class="fa fa-check"></i><b>4.8.1</b> Barplots via <code>geom_bar</code></a></li>
-<li class="chapter" data-level="4.8.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#must-avoid-pie-charts"><i class="fa fa-check"></i><b>4.8.2</b> Must avoid pie charts!</a></li>
-<li class="chapter" data-level="4.8.3" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#using-barplots-to-compare-two-variables"><i class="fa fa-check"></i><b>4.8.3</b> Using barplots to compare two variables</a></li>
-<li class="chapter" data-level="4.8.4" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#summary-4"><i class="fa fa-check"></i><b>4.8.4</b> Summary</a></li>
-</ul></li>
-<li class="chapter" data-level="4.9" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#conclusion"><i class="fa fa-check"></i><b>4.9</b> Conclusion</a><ul>
-<li class="chapter" data-level="4.9.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#whats-to-come-1"><i class="fa fa-check"></i><b>4.9.1</b> What’s to come?</a></li>
-<li class="chapter" data-level="4.9.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#script-of-r-code"><i class="fa fa-check"></i><b>4.9.2</b> Script of R code</a></li>
-</ul></li>
-</ul></li>
-<li class="chapter" data-level="5" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html"><i class="fa fa-check"></i><b>5</b> Data Manipulation via <code id="manip">dplyr</code></a><ul>
-<li class="chapter" data-level="" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#needed-packages-1"><i class="fa fa-check"></i>Needed packages</a></li>
-<li class="chapter" data-level="5.1" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#the-pipe"><i class="fa fa-check"></i><b>5.1</b> The pipe <code>%&gt;%</code></a></li>
-<li class="chapter" data-level="5.2" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#four-main-verbs---the-4mv"><i class="fa fa-check"></i><b>5.2</b> Four Main Verbs - The 4MV</a><ul>
-<li class="chapter" data-level="5.2.1" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#filter-observations-using-filter"><i class="fa fa-check"></i><b>5.2.1</b> Filter observations using <code id="filter">filter</code></a></li>
-<li class="chapter" data-level="5.2.2" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#summarize-variables-using-summarize"><i class="fa fa-check"></i><b>5.2.2</b> Summarize variables using <code>summarize</code></a></li>
-<li class="chapter" data-level="5.2.3" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#create-new-variableschange-old-variables-using-mutate"><i class="fa fa-check"></i><b>5.2.3</b> Create new variables/change old variables using <code>mutate</code></a></li>
-<li class="chapter" data-level="5.2.4" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#reorder-the-data-frame-using-arrange"><i class="fa fa-check"></i><b>5.2.4</b> Reorder the data frame using <code id="arrange">arrange</code></a></li>
-</ul></li>
-<li class="chapter" data-level="5.3" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#other-verbs"><i class="fa fa-check"></i><b>5.3</b> Other verbs</a><ul>
-<li class="chapter" data-level="5.3.1" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#select-variables-using-select"><i class="fa fa-check"></i><b>5.3.1</b> Select variables using <code id="select">select</code></a></li>
-<li class="chapter" data-level="5.3.2" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#rename-variables-using-rename"><i class="fa fa-check"></i><b>5.3.2</b> Rename variables using <code id="rename">rename</code></a></li>
-<li class="chapter" data-level="5.3.3" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#find-the-top-number-of-values-using-top_n"><i class="fa fa-check"></i><b>5.3.3</b> Find the top number of values using <code>top_n</code></a></li>
-</ul></li>
-<li class="chapter" data-level="5.4" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#joiningmerging-data-frames"><i class="fa fa-check"></i><b>5.4</b> Joining/merging data frames</a></li>
-<li class="chapter" data-level="5.5" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#script-of-r-code-1"><i class="fa fa-check"></i><b>5.5</b> Script of R code</a></li>
-<li class="chapter" data-level="5.6" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#whats-to-come-2"><i class="fa fa-check"></i><b>5.6</b> What’s to come?</a></li>
+<li class="chapter" data-level="4" data-path="4-viz.html"><a href="4-viz.html"><i class="fa fa-check"></i><b>4</b> Data Visualization via ggplot2</a><ul>
+<li class="chapter" data-level="" data-path="4-viz.html"><a href="4-viz.html#needed-packages-1"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="4.1" data-path="4-viz.html"><a href="4-viz.html#grammarofgraphics"><i class="fa fa-check"></i><b>4.1</b> The Grammar of Graphics</a><ul>
+<li class="chapter" data-level="4.1.1" data-path="4-viz.html"><a href="4-viz.html#components-of-grammar"><i class="fa fa-check"></i><b>4.1.1</b> Components of Grammar</a></li>
+<li class="chapter" data-level="4.1.2" data-path="4-viz.html"><a href="4-viz.html#napoleans-march-on-moscow"><i class="fa fa-check"></i><b>4.1.2</b> Napoleon’s March on Moscow</a></li>
+<li class="chapter" data-level="4.1.3" data-path="4-viz.html"><a href="4-viz.html#other-components-of-the-grammar"><i class="fa fa-check"></i><b>4.1.3</b> Other Components of the Grammar</a></li>
+<li class="chapter" data-level="4.1.4" data-path="4-viz.html"><a href="4-viz.html#the-ggplot2-package"><i class="fa fa-check"></i><b>4.1.4</b> The ggplot2 Package</a></li>
+</ul></li>
+<li class="chapter" data-level="4.2" data-path="4-viz.html"><a href="4-viz.html#five-named-graphs---the-5ng"><i class="fa fa-check"></i><b>4.2</b> Five Named Graphs - The 5NG</a></li>
+<li class="chapter" data-level="4.3" data-path="4-viz.html"><a href="4-viz.html#scatterplots"><i class="fa fa-check"></i><b>4.3</b> 5NG#1: Scatter-plots</a><ul>
+<li class="chapter" data-level="4.3.1" data-path="4-viz.html"><a href="4-viz.html#geompoint"><i class="fa fa-check"></i><b>4.3.1</b> Scatter-plots via geom_point</a></li>
+<li class="chapter" data-level="4.3.2" data-path="4-viz.html"><a href="4-viz.html#over-plotting"><i class="fa fa-check"></i><b>4.3.2</b> Over-Plotting</a></li>
+<li class="chapter" data-level="4.3.3" data-path="4-viz.html"><a href="4-viz.html#summary"><i class="fa fa-check"></i><b>4.3.3</b> Summary</a></li>
+</ul></li>
+<li class="chapter" data-level="4.4" data-path="4-viz.html"><a href="4-viz.html#linegraphs"><i class="fa fa-check"></i><b>4.4</b> 5NG#2: Line-graphs</a><ul>
+<li class="chapter" data-level="4.4.1" data-path="4-viz.html"><a href="4-viz.html#geomline"><i class="fa fa-check"></i><b>4.4.1</b> Line-graphs via geom_line</a></li>
+<li class="chapter" data-level="4.4.2" data-path="4-viz.html"><a href="4-viz.html#summary-1"><i class="fa fa-check"></i><b>4.4.2</b> Summary</a></li>
+</ul></li>
+<li class="chapter" data-level="4.5" data-path="4-viz.html"><a href="4-viz.html#histograms"><i class="fa fa-check"></i><b>4.5</b> 5NG#3: Histograms</a><ul>
+<li class="chapter" data-level="4.5.1" data-path="4-viz.html"><a href="4-viz.html#geomhistogram"><i class="fa fa-check"></i><b>4.5.1</b> Histograms via geom_histogram</a></li>
+<li class="chapter" data-level="4.5.2" data-path="4-viz.html"><a href="4-viz.html#adjustbins"><i class="fa fa-check"></i><b>4.5.2</b> Adjusting the Bins</a></li>
+<li class="chapter" data-level="4.5.3" data-path="4-viz.html"><a href="4-viz.html#summary-2"><i class="fa fa-check"></i><b>4.5.3</b> Summary</a></li>
+</ul></li>
+<li class="chapter" data-level="4.6" data-path="4-viz.html"><a href="4-viz.html#facets"><i class="fa fa-check"></i><b>4.6</b> Facets</a></li>
+<li class="chapter" data-level="4.7" data-path="4-viz.html"><a href="4-viz.html#ng4-boxplots"><i class="fa fa-check"></i><b>4.7</b> 5NG#4: Boxplots</a><ul>
+<li class="chapter" data-level="4.7.1" data-path="4-viz.html"><a href="4-viz.html#geomboxplot"><i class="fa fa-check"></i><b>4.7.1</b> Boxplots via geom_boxplot</a></li>
+<li class="chapter" data-level="4.7.2" data-path="4-viz.html"><a href="4-viz.html#summary-3"><i class="fa fa-check"></i><b>4.7.2</b> Summary</a></li>
+</ul></li>
+<li class="chapter" data-level="4.8" data-path="4-viz.html"><a href="4-viz.html#geombar"><i class="fa fa-check"></i><b>4.8</b> 5NG#5: Barplots</a><ul>
+<li class="chapter" data-level="4.8.1" data-path="4-viz.html"><a href="4-viz.html#barplots-via-geom_bar"><i class="fa fa-check"></i><b>4.8.1</b> Barplots via geom_bar</a></li>
+<li class="chapter" data-level="4.8.2" data-path="4-viz.html"><a href="4-viz.html#must-avoid-pie-charts"><i class="fa fa-check"></i><b>4.8.2</b> Must avoid pie charts!</a></li>
+<li class="chapter" data-level="4.8.3" data-path="4-viz.html"><a href="4-viz.html#using-barplots-to-compare-two-variables"><i class="fa fa-check"></i><b>4.8.3</b> Using barplots to compare two variables</a></li>
+<li class="chapter" data-level="4.8.4" data-path="4-viz.html"><a href="4-viz.html#summary-4"><i class="fa fa-check"></i><b>4.8.4</b> Summary</a></li>
+</ul></li>
+<li class="chapter" data-level="4.9" data-path="4-viz.html"><a href="4-viz.html#conclusion"><i class="fa fa-check"></i><b>4.9</b> Conclusion</a><ul>
+<li class="chapter" data-level="4.9.1" data-path="4-viz.html"><a href="4-viz.html#resources"><i class="fa fa-check"></i><b>4.9.1</b> Resources</a></li>
+<li class="chapter" data-level="4.9.2" data-path="4-viz.html"><a href="4-viz.html#script-of-r-code"><i class="fa fa-check"></i><b>4.9.2</b> Script of R code</a></li>
+<li class="chapter" data-level="4.9.3" data-path="4-viz.html"><a href="4-viz.html#whats-to-come-1"><i class="fa fa-check"></i><b>4.9.3</b> What’s to come?</a></li>
+</ul></li>
+</ul></li>
+<li class="chapter" data-level="5" data-path="5-manip.html"><a href="5-manip.html"><i class="fa fa-check"></i><b>5</b> Data Manipulation via dplyr</a><ul>
+<li class="chapter" data-level="" data-path="5-manip.html"><a href="5-manip.html#needed-packages-2"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="5.1" data-path="5-manip.html"><a href="5-manip.html#the-pipe"><i class="fa fa-check"></i><b>5.1</b> The pipe <code>%&gt;%</code></a></li>
+<li class="chapter" data-level="5.2" data-path="5-manip.html"><a href="5-manip.html#five-main-verbs---the-5mv"><i class="fa fa-check"></i><b>5.2</b> Five Main Verbs - The 5MV</a><ul>
+<li class="chapter" data-level="5.2.1" data-path="5-manip.html"><a href="5-manip.html#filter"><i class="fa fa-check"></i><b>5.2.1</b> 5MV#1: Filter observations using filter</a></li>
+<li class="chapter" data-level="5.2.2" data-path="5-manip.html"><a href="5-manip.html#mv2-summarize-variables-using-summarize"><i class="fa fa-check"></i><b>5.2.2</b> 5MV#2: Summarize variables using summarize</a></li>
+<li class="chapter" data-level="5.2.3" data-path="5-manip.html"><a href="5-manip.html#mv3-group-rows-using-group_by"><i class="fa fa-check"></i><b>5.2.3</b> 5MV#3: Group rows using group_by</a></li>
+<li class="chapter" data-level="5.2.4" data-path="5-manip.html"><a href="5-manip.html#mv4-create-new-variableschange-old-variables-using-mutate"><i class="fa fa-check"></i><b>5.2.4</b> 5MV#4: Create new variables/change old variables using mutate</a></li>
+<li class="chapter" data-level="5.2.5" data-path="5-manip.html"><a href="5-manip.html#arrange"><i class="fa fa-check"></i><b>5.2.5</b> 5MV#5: Reorder the data frame using arrange</a></li>
+</ul></li>
+<li class="chapter" data-level="5.3" data-path="5-manip.html"><a href="5-manip.html#joining-data-frames"><i class="fa fa-check"></i><b>5.3</b> Joining data frames</a><ul>
+<li class="chapter" data-level="5.3.1" data-path="5-manip.html"><a href="5-manip.html#joining-by-key-variables"><i class="fa fa-check"></i><b>5.3.1</b> Joining by Key Variables</a></li>
+<li class="chapter" data-level="5.3.2" data-path="5-manip.html"><a href="5-manip.html#joining-by-key-variables-with-different-names"><i class="fa fa-check"></i><b>5.3.2</b> Joining by Key Variables with Different Names</a></li>
+</ul></li>
+<li class="chapter" data-level="5.4" data-path="5-manip.html"><a href="5-manip.html#optional-other-verbs"><i class="fa fa-check"></i><b>5.4</b> Optional: Other verbs</a><ul>
+<li class="chapter" data-level="5.4.1" data-path="5-manip.html"><a href="5-manip.html#select"><i class="fa fa-check"></i><b>5.4.1</b> Select variables using select</a></li>
+<li class="chapter" data-level="5.4.2" data-path="5-manip.html"><a href="5-manip.html#rename"><i class="fa fa-check"></i><b>5.4.2</b> Rename variables using rename</a></li>
+<li class="chapter" data-level="5.4.3" data-path="5-manip.html"><a href="5-manip.html#find-the-top-number-of-values-using-top_n"><i class="fa fa-check"></i><b>5.4.3</b> Find the top number of values using top_n</a></li>
+</ul></li>
+<li class="chapter" data-level="5.5" data-path="5-manip.html"><a href="5-manip.html#conclusion-1"><i class="fa fa-check"></i><b>5.5</b> Conclusion</a><ul>
+<li class="chapter" data-level="5.5.1" data-path="5-manip.html"><a href="5-manip.html#resources-1"><i class="fa fa-check"></i><b>5.5.1</b> Resources</a></li>
+<li class="chapter" data-level="5.5.2" data-path="5-manip.html"><a href="5-manip.html#script-of-r-code-1"><i class="fa fa-check"></i><b>5.5.2</b> Script of R code</a></li>
+<li class="chapter" data-level="5.5.3" data-path="5-manip.html"><a href="5-manip.html#whats-to-come-2"><i class="fa fa-check"></i><b>5.5.3</b> What’s to come?</a></li>
+</ul></li>
 </ul></li>
 <li class="part"><span><b>II Inference</b></span></li>
-<li class="chapter" data-level="6" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html"><i class="fa fa-check"></i><b>6</b> Simulating Randomness via <code id="sim">mosaic</code></a><ul>
-<li class="chapter" data-level="" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#needed-packages-2"><i class="fa fa-check"></i>Needed packages</a></li>
-<li class="chapter" data-level="6.1" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#random-sampling"><i class="fa fa-check"></i><b>6.1</b> Random sampling</a><ul>
-<li class="chapter" data-level="6.1.1" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#tasting-soup"><i class="fa fa-check"></i><b>6.1.1</b> Tasting soup</a></li>
-<li class="chapter" data-level="6.1.2" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#common-terms"><i class="fa fa-check"></i><b>6.1.2</b> Common terms</a></li>
+<li class="chapter" data-level="6" data-path="6-sim.html"><a href="6-sim.html"><i class="fa fa-check"></i><b>6</b> Simulating Randomness via mosaic</a><ul>
+<li class="chapter" data-level="" data-path="6-sim.html"><a href="6-sim.html#needed-packages-3"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="6.1" data-path="6-sim.html"><a href="6-sim.html#random-sampling"><i class="fa fa-check"></i><b>6.1</b> Random sampling</a><ul>
+<li class="chapter" data-level="6.1.1" data-path="6-sim.html"><a href="6-sim.html#tasting-soup"><i class="fa fa-check"></i><b>6.1.1</b> Tasting soup</a></li>
+<li class="chapter" data-level="6.1.2" data-path="6-sim.html"><a href="6-sim.html#common-terms"><i class="fa fa-check"></i><b>6.1.2</b> Common terms</a></li>
+</ul></li>
+<li class="chapter" data-level="6.2" data-path="6-sim.html"><a href="6-sim.html#visualizing-sampling"><i class="fa fa-check"></i><b>6.2</b> Visualizing sampling</a><ul>
+<li class="chapter" data-level="6.2.1" data-path="6-sim.html"><a href="6-sim.html#sampling-distribution"><i class="fa fa-check"></i><b>6.2.1</b> Sampling distribution</a></li>
+<li class="chapter" data-level="6.2.2" data-path="6-sim.html"><a href="6-sim.html#repeated-sampling-via-do"><i class="fa fa-check"></i><b>6.2.2</b> Repeated sampling via <code>do</code></a></li>
 </ul></li>
-<li class="chapter" data-level="6.2" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#visualizing-sampling"><i class="fa fa-check"></i><b>6.2</b> Visualizing sampling</a><ul>
-<li class="chapter" data-level="6.2.1" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#sampling-distribution"><i class="fa fa-check"></i><b>6.2.1</b> Sampling distribution</a></li>
-<li class="chapter" data-level="6.2.2" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#repeated-sampling-via-do"><i class="fa fa-check"></i><b>6.2.2</b> Repeated sampling via <code>do</code></a></li>
+<li class="chapter" data-level="6.3" data-path="6-sim.html"><a href="6-sim.html#simulation"><i class="fa fa-check"></i><b>6.3</b> Simulation</a></li>
+<li class="chapter" data-level="6.4" data-path="6-sim.html"><a href="6-sim.html#review-of-mosaic-simulation-functions"><i class="fa fa-check"></i><b>6.4</b> Review of <code>mosaic</code> simulation functions</a></li>
+<li class="chapter" data-level="6.5" data-path="6-sim.html"><a href="6-sim.html#conclusion-2"><i class="fa fa-check"></i><b>6.5</b> Conclusion</a><ul>
+<li class="chapter" data-level="6.5.1" data-path="6-sim.html"><a href="6-sim.html#script-of-r-code-2"><i class="fa fa-check"></i><b>6.5.1</b> Script of R code</a></li>
+<li class="chapter" data-level="6.5.2" data-path="6-sim.html"><a href="6-sim.html#whats-to-come-3"><i class="fa fa-check"></i><b>6.5.2</b> What’s to come?</a></li>
 </ul></li>
-<li class="chapter" data-level="6.3" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#simulation"><i class="fa fa-check"></i><b>6.3</b> Simulation</a></li>
-<li class="chapter" data-level="6.4" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#review-of-mosaic-simulation-functions"><i class="fa fa-check"></i><b>6.4</b> Review of <code>mosaic</code> simulation functions</a></li>
-<li class="chapter" data-level="6.5" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#script-of-r-code-2"><i class="fa fa-check"></i><b>6.5</b> Script of R code</a></li>
-<li class="chapter" data-level="6.6" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#whats-to-come-3"><i class="fa fa-check"></i><b>6.6</b> What’s to come?</a></li>
 </ul></li>
 <li class="chapter" data-level="7" data-path="7-hypo.html"><a href="7-hypo.html"><i class="fa fa-check"></i><b>7</b> Hypothesis Testing</a><ul>
-<li class="chapter" data-level="" data-path="7-hypo.html"><a href="7-hypo.html#needed-packages-3"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="" data-path="7-hypo.html"><a href="7-hypo.html#needed-packages-4"><i class="fa fa-check"></i>Needed packages</a></li>
 <li class="chapter" data-level="7.1" data-path="7-hypo.html"><a href="7-hypo.html#when-inference-is-not-needed"><i class="fa fa-check"></i><b>7.1</b> When Inference Is Not Needed</a></li>
 <li class="chapter" data-level="7.2" data-path="7-hypo.html"><a href="7-hypo.html#basics-of-hypothesis-testing"><i class="fa fa-check"></i><b>7.2</b> Basics of Hypothesis Testing</a></li>
 <li class="chapter" data-level="7.3" data-path="7-hypo.html"><a href="7-hypo.html#trial"><i class="fa fa-check"></i><b>7.3</b> Criminal trial analogy</a><ul>
@@ -249,44 +262,50 @@
 <li class="chapter" data-level="7.8.1" data-path="7-hypo.html"><a href="7-hypo.html#example-t-test-for-two-independent-samples"><i class="fa fa-check"></i><b>7.8.1</b> EXAMPLE: <span class="math inline">\(t\)</span>-test for two independent samples</a></li>
 <li class="chapter" data-level="7.8.2" data-path="7-hypo.html"><a href="7-hypo.html#conditions-for-t-test"><i class="fa fa-check"></i><b>7.8.2</b> Conditions for t-test</a></li>
 </ul></li>
-<li class="chapter" data-level="7.9" data-path="7-hypo.html"><a href="7-hypo.html#whats-to-come-4"><i class="fa fa-check"></i><b>7.9</b> What’s to come?</a></li>
+<li class="chapter" data-level="7.9" data-path="7-hypo.html"><a href="7-hypo.html#conclusion-3"><i class="fa fa-check"></i><b>7.9</b> Conclusion</a><ul>
+<li class="chapter" data-level="7.9.1" data-path="7-hypo.html"><a href="7-hypo.html#script-of-r-code-3"><i class="fa fa-check"></i><b>7.9.1</b> Script of R code</a></li>
+<li class="chapter" data-level="7.9.2" data-path="7-hypo.html"><a href="7-hypo.html#whats-to-come-4"><i class="fa fa-check"></i><b>7.9.2</b> What’s to come?</a></li>
+</ul></li>
 </ul></li>
 <li class="chapter" data-level="8" data-path="8-ci.html"><a href="8-ci.html"><i class="fa fa-check"></i><b>8</b> Confidence Intervals</a><ul>
-<li class="chapter" data-level="" data-path="8-ci.html"><a href="8-ci.html#needed-packages-4"><i class="fa fa-check"></i>Needed packages</a></li>
-<li class="chapter" data-level="8.1" data-path="8-ci.html"><a href="8-ci.html#bootstrapping"><i class="fa fa-check"></i><b>8.1</b> Bootstrapping</a></li>
-<li class="chapter" data-level="8.2" data-path="8-ci.html"><a href="8-ci.html#mean_rating"><i class="fa fa-check"></i><b>8.2</b> mean_rating</a><ul>
-<li class="chapter" data-level="8.2.1" data-path="8-ci.html"><a href="8-ci.html#review-of-bootstrapping"><i class="fa fa-check"></i><b>8.2.1</b> Review of Bootstrapping</a></li>
-</ul></li>
-<li class="chapter" data-level="8.3" data-path="8-ci.html"><a href="8-ci.html#relation-to-hypothesis-testing"><i class="fa fa-check"></i><b>8.3</b> Relation to hypothesis testing</a></li>
-<li class="chapter" data-level="8.4" data-path="8-ci.html"><a href="8-ci.html#effect-size"><i class="fa fa-check"></i><b>8.4</b> Effect size</a></li>
-<li class="chapter" data-level="8.5" data-path="8-ci.html"><a href="8-ci.html#script-of-r-code-3"><i class="fa fa-check"></i><b>8.5</b> Script of R code</a></li>
-<li class="chapter" data-level="8.6" data-path="8-ci.html"><a href="8-ci.html#whats-to-come-5"><i class="fa fa-check"></i><b>8.6</b> What’s to come?</a></li>
-</ul></li>
-<li class="chapter" data-level="9" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html"><i class="fa fa-check"></i><b>9</b> Regression via <code id="regress">broom</code></a><ul>
-<li class="chapter" data-level="" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#needed-packages-5"><i class="fa fa-check"></i>Needed packages</a></li>
-<li class="chapter" data-level="9.1" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#example-alaskan-airlines-delays"><i class="fa fa-check"></i><b>9.1</b> EXAMPLE: Alaskan Airlines delays</a></li>
-<li class="chapter" data-level="9.2" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#correlation"><i class="fa fa-check"></i><b>9.2</b> Correlation</a><ul>
-<li class="chapter" data-level="9.2.1" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#correlation-does-not-imply-causation"><i class="fa fa-check"></i><b>9.2.1</b> Correlation does not imply causation</a></li>
-</ul></li>
-<li class="chapter" data-level="9.3" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#linear-regression"><i class="fa fa-check"></i><b>9.3</b> Linear regression</a><ul>
-<li class="chapter" data-level="9.3.1" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#understanding-linear-regression-basics"><i class="fa fa-check"></i><b>9.3.1</b> Understanding linear regression basics</a></li>
-<li class="chapter" data-level="9.3.2" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#the-equation-of-the-line"><i class="fa fa-check"></i><b>9.3.2</b> The equation of the line</a></li>
-<li class="chapter" data-level="9.3.3" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#interpretting-the-slope"><i class="fa fa-check"></i><b>9.3.3</b> Interpretting the slope</a></li>
-<li class="chapter" data-level="9.3.4" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#predicting-values"><i class="fa fa-check"></i><b>9.3.4</b> Predicting values</a></li>
-</ul></li>
-<li class="chapter" data-level="9.4" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#inference-for-regression"><i class="fa fa-check"></i><b>9.4</b> Inference for regression</a><ul>
-<li class="chapter" data-level="9.4.1" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#data-2"><i class="fa fa-check"></i><b>9.4.1</b> Data</a></li>
-<li class="chapter" data-level="9.4.2" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#test-statistic-delta-2"><i class="fa fa-check"></i><b>9.4.2</b> Test Statistic <span class="math inline">\(\delta\)</span></a></li>
-<li class="chapter" data-level="9.4.3" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#observed-effect-delta-2"><i class="fa fa-check"></i><b>9.4.3</b> Observed effect <span class="math inline">\(\delta^*\)</span></a></li>
-<li class="chapter" data-level="9.4.4" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#model-of-h_0-2"><i class="fa fa-check"></i><b>9.4.4</b> Model of <span class="math inline">\(H_0\)</span></a></li>
-<li class="chapter" data-level="9.4.5" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#simulated-data-2"><i class="fa fa-check"></i><b>9.4.5</b> Simulated Data</a></li>
-<li class="chapter" data-level="9.4.6" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#distribution-of-delta-under-h_0-2"><i class="fa fa-check"></i><b>9.4.6</b> Distribution of <span class="math inline">\(\delta\)</span> under <span class="math inline">\(H_0\)</span></a></li>
-<li class="chapter" data-level="9.4.7" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#the-p-value-2"><i class="fa fa-check"></i><b>9.4.7</b> The p-value</a></li>
-</ul></li>
-<li class="chapter" data-level="9.5" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#resid"><i class="fa fa-check"></i><b>9.5</b> Residual analysis</a></li>
-<li class="chapter" data-level="9.6" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#conditions-for-regression"><i class="fa fa-check"></i><b>9.6</b> Conditions for regression</a></li>
-<li class="chapter" data-level="9.7" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#script-of-r-code-4"><i class="fa fa-check"></i><b>9.7</b> Script of R code</a></li>
-<li class="chapter" data-level="9.8" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#whats-to-come-6"><i class="fa fa-check"></i><b>9.8</b> What’s to come?</a></li>
+<li class="chapter" data-level="" data-path="8-ci.html"><a href="8-ci.html#needed-packages-5"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="8.1" data-path="8-ci.html"><a href="8-ci.html#bootstrapping"><i class="fa fa-check"></i><b>8.1</b> Bootstrapping</a><ul>
+<li class="chapter" data-level="8.1.1" data-path="8-ci.html"><a href="8-ci.html#review-of-bootstrapping"><i class="fa fa-check"></i><b>8.1.1</b> Review of Bootstrapping</a></li>
+</ul></li>
+<li class="chapter" data-level="8.2" data-path="8-ci.html"><a href="8-ci.html#relation-to-hypothesis-testing"><i class="fa fa-check"></i><b>8.2</b> Relation to hypothesis testing</a></li>
+<li class="chapter" data-level="8.3" data-path="8-ci.html"><a href="8-ci.html#effect-size"><i class="fa fa-check"></i><b>8.3</b> Effect size</a></li>
+<li class="chapter" data-level="8.4" data-path="8-ci.html"><a href="8-ci.html#conclusion-4"><i class="fa fa-check"></i><b>8.4</b> Conclusion</a><ul>
+<li class="chapter" data-level="8.4.1" data-path="8-ci.html"><a href="8-ci.html#script-of-r-code-4"><i class="fa fa-check"></i><b>8.4.1</b> Script of R code</a></li>
+<li class="chapter" data-level="8.4.2" data-path="8-ci.html"><a href="8-ci.html#whats-to-come-5"><i class="fa fa-check"></i><b>8.4.2</b> What’s to come?</a></li>
+</ul></li>
+</ul></li>
+<li class="chapter" data-level="9" data-path="9-regress.html"><a href="9-regress.html"><i class="fa fa-check"></i><b>9</b> Regression via broom</a><ul>
+<li class="chapter" data-level="" data-path="9-regress.html"><a href="9-regress.html#needed-packages-6"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="9.1" data-path="9-regress.html"><a href="9-regress.html#example-alaskan-airlines-delays"><i class="fa fa-check"></i><b>9.1</b> EXAMPLE: Alaskan Airlines delays</a></li>
+<li class="chapter" data-level="9.2" data-path="9-regress.html"><a href="9-regress.html#correlation"><i class="fa fa-check"></i><b>9.2</b> Correlation</a><ul>
+<li class="chapter" data-level="9.2.1" data-path="9-regress.html"><a href="9-regress.html#correlation-does-not-imply-causation"><i class="fa fa-check"></i><b>9.2.1</b> Correlation does not imply causation</a></li>
+</ul></li>
+<li class="chapter" data-level="9.3" data-path="9-regress.html"><a href="9-regress.html#linear-regression"><i class="fa fa-check"></i><b>9.3</b> Linear regression</a><ul>
+<li class="chapter" data-level="9.3.1" data-path="9-regress.html"><a href="9-regress.html#understanding-linear-regression-basics"><i class="fa fa-check"></i><b>9.3.1</b> Understanding linear regression basics</a></li>
+<li class="chapter" data-level="9.3.2" data-path="9-regress.html"><a href="9-regress.html#the-equation-of-the-line"><i class="fa fa-check"></i><b>9.3.2</b> The equation of the line</a></li>
+<li class="chapter" data-level="9.3.3" data-path="9-regress.html"><a href="9-regress.html#interpreting-the-slope"><i class="fa fa-check"></i><b>9.3.3</b> Interpreting the slope</a></li>
+<li class="chapter" data-level="9.3.4" data-path="9-regress.html"><a href="9-regress.html#predicting-values"><i class="fa fa-check"></i><b>9.3.4</b> Predicting values</a></li>
+</ul></li>
+<li class="chapter" data-level="9.4" data-path="9-regress.html"><a href="9-regress.html#inference-for-regression"><i class="fa fa-check"></i><b>9.4</b> Inference for regression</a><ul>
+<li class="chapter" data-level="9.4.1" data-path="9-regress.html"><a href="9-regress.html#data-2"><i class="fa fa-check"></i><b>9.4.1</b> Data</a></li>
+<li class="chapter" data-level="9.4.2" data-path="9-regress.html"><a href="9-regress.html#test-statistic-delta-2"><i class="fa fa-check"></i><b>9.4.2</b> Test Statistic <span class="math inline">\(\delta\)</span></a></li>
+<li class="chapter" data-level="9.4.3" data-path="9-regress.html"><a href="9-regress.html#observed-effect-delta-2"><i class="fa fa-check"></i><b>9.4.3</b> Observed effect <span class="math inline">\(\delta^*\)</span></a></li>
+<li class="chapter" data-level="9.4.4" data-path="9-regress.html"><a href="9-regress.html#model-of-h_0-2"><i class="fa fa-check"></i><b>9.4.4</b> Model of <span class="math inline">\(H_0\)</span></a></li>
+<li class="chapter" data-level="9.4.5" data-path="9-regress.html"><a href="9-regress.html#simulated-data-2"><i class="fa fa-check"></i><b>9.4.5</b> Simulated Data</a></li>
+<li class="chapter" data-level="9.4.6" data-path="9-regress.html"><a href="9-regress.html#distribution-of-delta-under-h_0-2"><i class="fa fa-check"></i><b>9.4.6</b> Distribution of <span class="math inline">\(\delta\)</span> under <span class="math inline">\(H_0\)</span></a></li>
+<li class="chapter" data-level="9.4.7" data-path="9-regress.html"><a href="9-regress.html#the-p-value-2"><i class="fa fa-check"></i><b>9.4.7</b> The p-value</a></li>
+</ul></li>
+<li class="chapter" data-level="9.5" data-path="9-regress.html"><a href="9-regress.html#resid"><i class="fa fa-check"></i><b>9.5</b> Residual analysis</a></li>
+<li class="chapter" data-level="9.6" data-path="9-regress.html"><a href="9-regress.html#conditions-for-regression"><i class="fa fa-check"></i><b>9.6</b> Conditions for regression</a></li>
+<li class="chapter" data-level="9.7" data-path="9-regress.html"><a href="9-regress.html#conclusion-5"><i class="fa fa-check"></i><b>9.7</b> Conclusion</a><ul>
+<li class="chapter" data-level="9.7.1" data-path="9-regress.html"><a href="9-regress.html#script-of-r-code-5"><i class="fa fa-check"></i><b>9.7.1</b> Script of R code</a></li>
+<li class="chapter" data-level="9.7.2" data-path="9-regress.html"><a href="9-regress.html#whats-to-come-6"><i class="fa fa-check"></i><b>9.7.2</b> What’s to come?</a></li>
+</ul></li>
 </ul></li>
 <li class="part"><span><b>III Conclusion</b></span></li>
 <li class="chapter" data-level="10" data-path="10-effective-data-storytelling.html"><a href="10-effective-data-storytelling.html"><i class="fa fa-check"></i><b>10</b> Effective Data Storytelling</a><ul>
@@ -304,7 +323,8 @@
 </ul></li>
 </ul></li>
 <li class="chapter" data-level="B" data-path="B-appendixB.html"><a href="B-appendixB.html"><i class="fa fa-check"></i><b>B</b> Inference Examples</a><ul>
-<li class="chapter" data-level="B.1" data-path="B-appendixB.html"><a href="B-appendixB.html#needed-packages-6"><i class="fa fa-check"></i><b>B.1</b> Needed packages</a></li>
+<li class="chapter" data-level="" data-path="B-appendixB.html"><a href="B-appendixB.html#needed-packages-7"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="B.1" data-path="B-appendixB.html"><a href="B-appendixB.html#inference-mind-map"><i class="fa fa-check"></i><b>B.1</b> Inference Mind Map</a></li>
 <li class="chapter" data-level="B.2" data-path="B-appendixB.html"><a href="B-appendixB.html#one-mean"><i class="fa fa-check"></i><b>B.2</b> One Mean</a><ul>
 <li class="chapter" data-level="B.2.1" data-path="B-appendixB.html"><a href="B-appendixB.html#problem-statement"><i class="fa fa-check"></i><b>B.2.1</b> Problem Statement</a></li>
 <li class="chapter" data-level="B.2.2" data-path="B-appendixB.html"><a href="B-appendixB.html#competing-hypotheses"><i class="fa fa-check"></i><b>B.2.2</b> Competing Hypotheses</a></li>
@@ -352,6 +372,7 @@
 </ul></li>
 </ul></li>
 <li class="chapter" data-level="C" data-path="C-appendixC.html"><a href="C-appendixC.html"><i class="fa fa-check"></i><b>C</b> Reach for the Starts</a><ul>
+<li class="chapter" data-level="" data-path="C-appendixC.html"><a href="C-appendixC.html#needed-packages-8"><i class="fa fa-check"></i>Needed packages</a></li>
 <li class="chapter" data-level="C.1" data-path="C-appendixC.html"><a href="C-appendixC.html#sorted-barplots"><i class="fa fa-check"></i><b>C.1</b> Sorted barplots</a></li>
 <li class="chapter" data-level="C.2" data-path="C-appendixC.html"><a href="C-appendixC.html#interactive-graphics"><i class="fa fa-check"></i><b>C.2</b> Interactive graphics</a><ul>
 <li class="chapter" data-level="C.2.1" data-path="C-appendixC.html"><a href="C-appendixC.html#interactive-line-graphs"><i class="fa fa-check"></i><b>C.2.1</b> Interactive line-graphs</a></li>
@@ -377,11 +398,11 @@ <h1>
             <section class="normal" id="section-">
 <div id="effective-data-storytelling" class="section level1">
 <h1><span class="header-section-number">10</span> Effective Data Storytelling</h1>
-<p>As we’ve progressed throughout this book, you’ve seen how to work with data in a variety of ways. You’ve learned effective strategies for plotting data by understanding which types of plots work best for which combinations of variable types. You’ve summarized data in table form and calculated summary statistics for a variety of different variables. Further, you’ve seen the value of inference as a process to come to conclusions about a population by using a random sample. Lastly, you’ve explored how to use linear regression and the importance of checking the conditions required to make it a valid procedure. Throughout, you’ve learned many computational techniques and focused on reproducible research in writing R code and keeping track of your work in R Markdown. All of these steps go into making a great story using data.</p>
-<p>As the textbook comes to a close, we thought it best that you explore what stellar work is being produced by data journalists throughout the world that specialize in effective data storytelling. We recommend you read and analyze this article by Walt Hickey entitled <a href="http://fivethirtyeight.com/features/the-dollar-and-cents-case-against-hollywoods-exclusion-of-women/">The Dollar-And-Cents Case Against Hollywood’s Exclusion of Women</a>. As you read over it, think carefully about how Walt is using his data, his graphics, and his analyses to paint the picture for the reader of what the story is he wants to tell. In the spirit of reproducibility, the members of 538 have also shared the data that they used to create this story and some R code <a href="https://github.com/fivethirtyeight/data/tree/master/bechdel">here</a>. Great data stories don’t mislead the reader, but rather engulf them in understanding the importance that data plays in our lives through the captivation of storytelling.</p>
+<p>As we’ve progressed throughout this book, you’ve seen how to work with data in a variety of ways. You’ve learned effective strategies for plotting data by understanding which types of plots work best for which combinations of variable types. You’ve summarized data in table form and calculated summary statistics for a variety of different variables. Further, you’ve seen the value of inference as a process to come to conclusions about a population by using a random sample. Lastly, you’ve explored how to use linear regression and the importance of checking the conditions required to make it a valid procedure. All throughout, you’ve learned many computational techniques and focused on reproducible research in writing R code and keeping track of your work in R Markdown. All of these steps go into making a great story using data.</p>
+<p>As the textbook comes to a close, we thought it best that you explore the stellar work being produced by data journalists throughout the world who specialize in effective data storytelling. We recommend you read and analyze this article by Walt Hickey entitled <a href="http://fivethirtyeight.com/features/the-dollar-and-cents-case-against-hollywoods-exclusion-of-women/">The Dollar-And-Cents Case Against Hollywood’s Exclusion of Women</a>. As you read over it, think carefully about how Walt is using his data, his graphics, and his analyses to paint a picture for the reader of the story he wants to tell. In the spirit of reproducibility, the members of FiveThirtyEight have also shared the data that they used to create this story and some R code <a href="https://github.com/fivethirtyeight/data/tree/master/bechdel">here</a>. A vignette showing how to reproduce one of the plots at the end of the article using <code>dplyr</code>, <code>ggplot2</code>, and other packages in Hadley’s <code>tidyverse</code> is available <a href="https://cran.r-project.org/web/packages/fivethirtyeight/vignettes/bechdel.html">here</a>. Great data stories don’t mislead the reader, but rather engulf them in understanding the importance that data plays in our lives through the captivation of storytelling.</p>
 <div id="concluding-remarks" class="section level2 unnumbered">
 <h2>Concluding Remarks</h2>
-<p>If you’ve come to this point in the book, I’d suspect that you know a thing or two about how to work with data in R. You’ve also gained a lot of knowledge about how to use simulation techniques to determine statistical significance. The hope is that you’ve come to appreciate data manipulation, tidy data sets, and the power of statistical visualization. Actually, the data visualization part may be the most important thing here. If you can create truly beautiful graphics that display information in ways that the reader can clearly decipher, you’ve picked up a great skill. Let’s hope that that skill keeps you creating great stories with data into the near and far distant future. Thanks for coming along for the ride as we dove into modern data analysis using R!</p>
+<p>If you’ve come to this point in the book, I’d suspect that you know a thing or two about how to work with data in R. You’ve also gained a lot of knowledge about how to use simulation techniques to determine statistical significance and how these techniques build an intuition about traditional inferential methods like the <span class="math inline">\(t\)</span>-test. The hope is that you’ve come to appreciate data manipulation, tidy data sets, and the power of data visualization. Actually, the data visualization part may be the most important thing here. If you can create truly beautiful graphics that display information in ways that the reader can clearly decipher, you’ve picked up a great skill. Let’s hope that that skill keeps you creating great stories with data into the near and distant future. Thanks for coming along for the ride as we dove into modern data analysis using R!</p>
 
 </div>
 </div>
@@ -393,7 +414,7 @@ <h2>Concluding Remarks</h2>
           </div>
         </div>
       </div>
-<a href="9-regression-via-broom.html" class="navigation navigation-prev " aria-label="Previous page"><i class="fa fa-angle-left"></i></a>
+<a href="9-regress.html" class="navigation navigation-prev " aria-label="Previous page"><i class="fa fa-angle-left"></i></a>
 <a href="A-appendixA.html" class="navigation navigation-next " aria-label="Next page""><i class="fa fa-angle-right"></i></a>
 
 <script src="libs/gitbook-2.6.7/js/app.min.js"></script>
diff --git a/docs/2-intro.html b/docs/2-intro.html
index 41bfc9b62..b159e2245 100644
--- a/docs/2-intro.html
+++ b/docs/2-intro.html
@@ -26,7 +26,7 @@
 <meta name="author" content="Chester Ismay and Albert Y. Kim">
 
 
-<meta name="date" content="2017-01-07">
+<meta name="date" content="2017-01-10">
 
   <meta name="viewport" content="width=device-width, initial-scale=1">
   <meta name="apple-mobile-web-app-capable" content="yes">
@@ -119,9 +119,9 @@
 
 <ul class="summary">
 <li class="chapter" data-level="1" data-path="index.html"><a href="index.html"><i class="fa fa-check"></i><b>1</b> Preamble</a><ul>
-<li class="chapter" data-level="1.1" data-path="index.html"><a href="index.html#principles-of-this-book"><i class="fa fa-check"></i><b>1.1</b> Principles of this Book</a></li>
+<li class="chapter" data-level="1.1" data-path="index.html"><a href="index.html#principles-of-this-book---for-instructors"><i class="fa fa-check"></i><b>1.1</b> Principles of this Book - For Instructors</a></li>
 <li class="chapter" data-level="1.2" data-path="index.html"><a href="index.html#contribute"><i class="fa fa-check"></i><b>1.2</b> Contribute</a></li>
-<li class="chapter" data-level="1.3" data-path="index.html"><a href="index.html#getting-started"><i class="fa fa-check"></i><b>1.3</b> Getting Started</a></li>
+<li class="chapter" data-level="1.3" data-path="index.html"><a href="index.html#getting-started---for-students"><i class="fa fa-check"></i><b>1.3</b> Getting Started - For Students</a></li>
 <li class="chapter" data-level="" data-path="index.html"><a href="index.html#colophon"><i class="fa fa-check"></i>Colophon</a></li>
 </ul></li>
 <li class="chapter" data-level="2" data-path="2-intro.html"><a href="2-intro.html"><i class="fa fa-check"></i><b>2</b> Introduction</a><ul>
@@ -133,87 +133,100 @@
 </ul></li>
 <li class="part"><span><b>I Data Exploration</b></span></li>
 <li class="chapter" data-level="3" data-path="3-tidy.html"><a href="3-tidy.html"><i class="fa fa-check"></i><b>3</b> Tidy Data</a><ul>
+<li class="chapter" data-level="" data-path="3-tidy.html"><a href="3-tidy.html#needed-packages"><i class="fa fa-check"></i>Needed packages</a></li>
 <li class="chapter" data-level="3.1" data-path="3-tidy.html"><a href="3-tidy.html#what-is-tidy-data"><i class="fa fa-check"></i><b>3.1</b> What is tidy data?</a></li>
-<li class="chapter" data-level="3.2" data-path="3-tidy.html"><a href="3-tidy.html#the-nycflights13-datasets"><i class="fa fa-check"></i><b>3.2</b> The <code>nycflights13</code> datasets</a></li>
-<li class="chapter" data-level="3.3" data-path="3-tidy.html"><a href="3-tidy.html#how-is-flights-tidy"><i class="fa fa-check"></i><b>3.3</b> How is <code>flights</code> tidy?</a></li>
+<li class="chapter" data-level="3.2" data-path="3-tidy.html"><a href="3-tidy.html#datasets-in-the-nycflights13-package"><i class="fa fa-check"></i><b>3.2</b> Datasets in the <code>nycflights13</code> package</a></li>
+<li class="chapter" data-level="3.3" data-path="3-tidy.html"><a href="3-tidy.html#how-is-flights-tidy"><i class="fa fa-check"></i><b>3.3</b> How is <code>flights</code> tidy?</a><ul>
+<li class="chapter" data-level="3.3.1" data-path="3-tidy.html"><a href="3-tidy.html#identification-variables"><i class="fa fa-check"></i><b>3.3.1</b> Identification variables</a></li>
+</ul></li>
 <li class="chapter" data-level="3.4" data-path="3-tidy.html"><a href="3-tidy.html#normal-forms-of-data"><i class="fa fa-check"></i><b>3.4</b> Normal forms of data</a></li>
 <li class="chapter" data-level="3.5" data-path="3-tidy.html"><a href="3-tidy.html#whats-to-come"><i class="fa fa-check"></i><b>3.5</b> What’s to come?</a></li>
 </ul></li>
-<li class="chapter" data-level="4" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html"><i class="fa fa-check"></i><b>4</b> Data Visualization via <code id="viz">ggplot2</code></a><ul>
-<li class="chapter" data-level="" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#needed-packages"><i class="fa fa-check"></i>Needed packages</a></li>
-<li class="chapter" data-level="4.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#grammarofgraphics"><i class="fa fa-check"></i><b>4.1</b> The Grammar of Graphics</a><ul>
-<li class="chapter" data-level="4.1.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#components-of-grammar"><i class="fa fa-check"></i><b>4.1.1</b> Components of Grammar</a></li>
-<li class="chapter" data-level="4.1.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#napoleans-march-on-moscow"><i class="fa fa-check"></i><b>4.1.2</b> Napolean’s March on Moscow</a></li>
-<li class="chapter" data-level="4.1.3" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#other-components-of-the-grammar"><i class="fa fa-check"></i><b>4.1.3</b> Other Components of the Grammar</a></li>
-<li class="chapter" data-level="4.1.4" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#the-ggplot2-package"><i class="fa fa-check"></i><b>4.1.4</b> The <code>ggplot2</code> Package</a></li>
-</ul></li>
-<li class="chapter" data-level="4.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#five-named-graphs---the-5ng"><i class="fa fa-check"></i><b>4.2</b> Five Named Graphs - The 5NG</a></li>
-<li class="chapter" data-level="4.3" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#scatterplots"><i class="fa fa-check"></i><b>4.3</b> 5NG#1: Scatter-plots</a><ul>
-<li class="chapter" data-level="4.3.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#scatter-plots-via-geom_point"><i class="fa fa-check"></i><b>4.3.1</b> Scatter-plots via <code id="geompoint">geom_point</code></a></li>
-<li class="chapter" data-level="4.3.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#over-plotting"><i class="fa fa-check"></i><b>4.3.2</b> Over-Plotting</a></li>
-<li class="chapter" data-level="4.3.3" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#summary"><i class="fa fa-check"></i><b>4.3.3</b> Summary</a></li>
-</ul></li>
-<li class="chapter" data-level="4.4" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#linegraphs"><i class="fa fa-check"></i><b>4.4</b> 5NG#2: Line-graphs</a><ul>
-<li class="chapter" data-level="4.4.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#line-graphs-via-geom_line"><i class="fa fa-check"></i><b>4.4.1</b> Line-graphs via <code id="geomline">geom_line</code></a></li>
-<li class="chapter" data-level="4.4.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#summary-1"><i class="fa fa-check"></i><b>4.4.2</b> Summary</a></li>
-</ul></li>
-<li class="chapter" data-level="4.5" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#histograms"><i class="fa fa-check"></i><b>4.5</b> 5NG#3: Histograms</a><ul>
-<li class="chapter" data-level="4.5.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#histograms-via-geom_histogram"><i class="fa fa-check"></i><b>4.5.1</b> Histograms via <code id="geomhistogram">geom_histogram</code></a></li>
-<li class="chapter" data-level="4.5.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#adjustbins"><i class="fa fa-check"></i><b>4.5.2</b> Adjusting the Bins</a></li>
-<li class="chapter" data-level="4.5.3" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#summary-2"><i class="fa fa-check"></i><b>4.5.3</b> Summary</a></li>
-</ul></li>
-<li class="chapter" data-level="4.6" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#facets"><i class="fa fa-check"></i><b>4.6</b> Facets</a></li>
-<li class="chapter" data-level="4.7" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#ng4-boxplots"><i class="fa fa-check"></i><b>4.7</b> 5NG#4: Boxplots</a><ul>
-<li class="chapter" data-level="4.7.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#boxplots-via-geom_boxplot"><i class="fa fa-check"></i><b>4.7.1</b> Boxplots via <code id="geomboxplot">geom_boxplot</code></a></li>
-<li class="chapter" data-level="4.7.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#summary-3"><i class="fa fa-check"></i><b>4.7.2</b> Summary</a></li>
-</ul></li>
-<li class="chapter" data-level="4.8" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#geombar"><i class="fa fa-check"></i><b>4.8</b> 5NG#5: Barplots</a><ul>
-<li class="chapter" data-level="4.8.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#barplots-via-geom_bar"><i class="fa fa-check"></i><b>4.8.1</b> Barplots via <code>geom_bar</code></a></li>
-<li class="chapter" data-level="4.8.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#must-avoid-pie-charts"><i class="fa fa-check"></i><b>4.8.2</b> Must avoid pie charts!</a></li>
-<li class="chapter" data-level="4.8.3" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#using-barplots-to-compare-two-variables"><i class="fa fa-check"></i><b>4.8.3</b> Using barplots to compare two variables</a></li>
-<li class="chapter" data-level="4.8.4" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#summary-4"><i class="fa fa-check"></i><b>4.8.4</b> Summary</a></li>
-</ul></li>
-<li class="chapter" data-level="4.9" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#conclusion"><i class="fa fa-check"></i><b>4.9</b> Conclusion</a><ul>
-<li class="chapter" data-level="4.9.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#whats-to-come-1"><i class="fa fa-check"></i><b>4.9.1</b> What’s to come?</a></li>
-<li class="chapter" data-level="4.9.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#script-of-r-code"><i class="fa fa-check"></i><b>4.9.2</b> Script of R code</a></li>
-</ul></li>
-</ul></li>
-<li class="chapter" data-level="5" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html"><i class="fa fa-check"></i><b>5</b> Data Manipulation via <code id="manip">dplyr</code></a><ul>
-<li class="chapter" data-level="" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#needed-packages-1"><i class="fa fa-check"></i>Needed packages</a></li>
-<li class="chapter" data-level="5.1" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#the-pipe"><i class="fa fa-check"></i><b>5.1</b> The pipe <code>%&gt;%</code></a></li>
-<li class="chapter" data-level="5.2" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#four-main-verbs---the-4mv"><i class="fa fa-check"></i><b>5.2</b> Four Main Verbs - The 4MV</a><ul>
-<li class="chapter" data-level="5.2.1" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#filter-observations-using-filter"><i class="fa fa-check"></i><b>5.2.1</b> Filter observations using <code id="filter">filter</code></a></li>
-<li class="chapter" data-level="5.2.2" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#summarize-variables-using-summarize"><i class="fa fa-check"></i><b>5.2.2</b> Summarize variables using <code>summarize</code></a></li>
-<li class="chapter" data-level="5.2.3" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#create-new-variableschange-old-variables-using-mutate"><i class="fa fa-check"></i><b>5.2.3</b> Create new variables/change old variables using <code>mutate</code></a></li>
-<li class="chapter" data-level="5.2.4" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#reorder-the-data-frame-using-arrange"><i class="fa fa-check"></i><b>5.2.4</b> Reorder the data frame using <code id="arrange">arrange</code></a></li>
-</ul></li>
-<li class="chapter" data-level="5.3" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#other-verbs"><i class="fa fa-check"></i><b>5.3</b> Other verbs</a><ul>
-<li class="chapter" data-level="5.3.1" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#select-variables-using-select"><i class="fa fa-check"></i><b>5.3.1</b> Select variables using <code id="select">select</code></a></li>
-<li class="chapter" data-level="5.3.2" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#rename-variables-using-rename"><i class="fa fa-check"></i><b>5.3.2</b> Rename variables using <code id="rename">rename</code></a></li>
-<li class="chapter" data-level="5.3.3" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#find-the-top-number-of-values-using-top_n"><i class="fa fa-check"></i><b>5.3.3</b> Find the top number of values using <code>top_n</code></a></li>
-</ul></li>
-<li class="chapter" data-level="5.4" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#joiningmerging-data-frames"><i class="fa fa-check"></i><b>5.4</b> Joining/merging data frames</a></li>
-<li class="chapter" data-level="5.5" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#script-of-r-code-1"><i class="fa fa-check"></i><b>5.5</b> Script of R code</a></li>
-<li class="chapter" data-level="5.6" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#whats-to-come-2"><i class="fa fa-check"></i><b>5.6</b> What’s to come?</a></li>
+<li class="chapter" data-level="4" data-path="4-viz.html"><a href="4-viz.html"><i class="fa fa-check"></i><b>4</b> Data Visualization via ggplot2</a><ul>
+<li class="chapter" data-level="" data-path="4-viz.html"><a href="4-viz.html#needed-packages-1"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="4.1" data-path="4-viz.html"><a href="4-viz.html#grammarofgraphics"><i class="fa fa-check"></i><b>4.1</b> The Grammar of Graphics</a><ul>
+<li class="chapter" data-level="4.1.1" data-path="4-viz.html"><a href="4-viz.html#components-of-grammar"><i class="fa fa-check"></i><b>4.1.1</b> Components of Grammar</a></li>
+<li class="chapter" data-level="4.1.2" data-path="4-viz.html"><a href="4-viz.html#napoleans-march-on-moscow"><i class="fa fa-check"></i><b>4.1.2</b> Napoleon’s March on Moscow</a></li>
+<li class="chapter" data-level="4.1.3" data-path="4-viz.html"><a href="4-viz.html#other-components-of-the-grammar"><i class="fa fa-check"></i><b>4.1.3</b> Other Components of the Grammar</a></li>
+<li class="chapter" data-level="4.1.4" data-path="4-viz.html"><a href="4-viz.html#the-ggplot2-package"><i class="fa fa-check"></i><b>4.1.4</b> The ggplot2 Package</a></li>
+</ul></li>
+<li class="chapter" data-level="4.2" data-path="4-viz.html"><a href="4-viz.html#five-named-graphs---the-5ng"><i class="fa fa-check"></i><b>4.2</b> Five Named Graphs - The 5NG</a></li>
+<li class="chapter" data-level="4.3" data-path="4-viz.html"><a href="4-viz.html#scatterplots"><i class="fa fa-check"></i><b>4.3</b> 5NG#1: Scatter-plots</a><ul>
+<li class="chapter" data-level="4.3.1" data-path="4-viz.html"><a href="4-viz.html#geompoint"><i class="fa fa-check"></i><b>4.3.1</b> Scatter-plots via geom_point</a></li>
+<li class="chapter" data-level="4.3.2" data-path="4-viz.html"><a href="4-viz.html#over-plotting"><i class="fa fa-check"></i><b>4.3.2</b> Over-Plotting</a></li>
+<li class="chapter" data-level="4.3.3" data-path="4-viz.html"><a href="4-viz.html#summary"><i class="fa fa-check"></i><b>4.3.3</b> Summary</a></li>
+</ul></li>
+<li class="chapter" data-level="4.4" data-path="4-viz.html"><a href="4-viz.html#linegraphs"><i class="fa fa-check"></i><b>4.4</b> 5NG#2: Line-graphs</a><ul>
+<li class="chapter" data-level="4.4.1" data-path="4-viz.html"><a href="4-viz.html#geomline"><i class="fa fa-check"></i><b>4.4.1</b> Line-graphs via geom_line</a></li>
+<li class="chapter" data-level="4.4.2" data-path="4-viz.html"><a href="4-viz.html#summary-1"><i class="fa fa-check"></i><b>4.4.2</b> Summary</a></li>
+</ul></li>
+<li class="chapter" data-level="4.5" data-path="4-viz.html"><a href="4-viz.html#histograms"><i class="fa fa-check"></i><b>4.5</b> 5NG#3: Histograms</a><ul>
+<li class="chapter" data-level="4.5.1" data-path="4-viz.html"><a href="4-viz.html#geomhistogram"><i class="fa fa-check"></i><b>4.5.1</b> Histograms via geom_histogram</a></li>
+<li class="chapter" data-level="4.5.2" data-path="4-viz.html"><a href="4-viz.html#adjustbins"><i class="fa fa-check"></i><b>4.5.2</b> Adjusting the Bins</a></li>
+<li class="chapter" data-level="4.5.3" data-path="4-viz.html"><a href="4-viz.html#summary-2"><i class="fa fa-check"></i><b>4.5.3</b> Summary</a></li>
+</ul></li>
+<li class="chapter" data-level="4.6" data-path="4-viz.html"><a href="4-viz.html#facets"><i class="fa fa-check"></i><b>4.6</b> Facets</a></li>
+<li class="chapter" data-level="4.7" data-path="4-viz.html"><a href="4-viz.html#ng4-boxplots"><i class="fa fa-check"></i><b>4.7</b> 5NG#4: Boxplots</a><ul>
+<li class="chapter" data-level="4.7.1" data-path="4-viz.html"><a href="4-viz.html#geomboxplot"><i class="fa fa-check"></i><b>4.7.1</b> Boxplots via geom_boxplot</a></li>
+<li class="chapter" data-level="4.7.2" data-path="4-viz.html"><a href="4-viz.html#summary-3"><i class="fa fa-check"></i><b>4.7.2</b> Summary</a></li>
+</ul></li>
+<li class="chapter" data-level="4.8" data-path="4-viz.html"><a href="4-viz.html#geombar"><i class="fa fa-check"></i><b>4.8</b> 5NG#5: Barplots</a><ul>
+<li class="chapter" data-level="4.8.1" data-path="4-viz.html"><a href="4-viz.html#barplots-via-geom_bar"><i class="fa fa-check"></i><b>4.8.1</b> Barplots via geom_bar</a></li>
+<li class="chapter" data-level="4.8.2" data-path="4-viz.html"><a href="4-viz.html#must-avoid-pie-charts"><i class="fa fa-check"></i><b>4.8.2</b> Must avoid pie charts!</a></li>
+<li class="chapter" data-level="4.8.3" data-path="4-viz.html"><a href="4-viz.html#using-barplots-to-compare-two-variables"><i class="fa fa-check"></i><b>4.8.3</b> Using barplots to compare two variables</a></li>
+<li class="chapter" data-level="4.8.4" data-path="4-viz.html"><a href="4-viz.html#summary-4"><i class="fa fa-check"></i><b>4.8.4</b> Summary</a></li>
+</ul></li>
+<li class="chapter" data-level="4.9" data-path="4-viz.html"><a href="4-viz.html#conclusion"><i class="fa fa-check"></i><b>4.9</b> Conclusion</a><ul>
+<li class="chapter" data-level="4.9.1" data-path="4-viz.html"><a href="4-viz.html#resources"><i class="fa fa-check"></i><b>4.9.1</b> Resources</a></li>
+<li class="chapter" data-level="4.9.2" data-path="4-viz.html"><a href="4-viz.html#script-of-r-code"><i class="fa fa-check"></i><b>4.9.2</b> Script of R code</a></li>
+<li class="chapter" data-level="4.9.3" data-path="4-viz.html"><a href="4-viz.html#whats-to-come-1"><i class="fa fa-check"></i><b>4.9.3</b> What’s to come?</a></li>
+</ul></li>
+</ul></li>
+<li class="chapter" data-level="5" data-path="5-manip.html"><a href="5-manip.html"><i class="fa fa-check"></i><b>5</b> Data Manipulation via dplyr</a><ul>
+<li class="chapter" data-level="" data-path="5-manip.html"><a href="5-manip.html#needed-packages-2"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="5.1" data-path="5-manip.html"><a href="5-manip.html#the-pipe"><i class="fa fa-check"></i><b>5.1</b> The pipe <code>%&gt;%</code></a></li>
+<li class="chapter" data-level="5.2" data-path="5-manip.html"><a href="5-manip.html#five-main-verbs---the-5mv"><i class="fa fa-check"></i><b>5.2</b> Five Main Verbs - The 5MV</a><ul>
+<li class="chapter" data-level="5.2.1" data-path="5-manip.html"><a href="5-manip.html#filter"><i class="fa fa-check"></i><b>5.2.1</b> 5MV#1: Filter observations using filter</a></li>
+<li class="chapter" data-level="5.2.2" data-path="5-manip.html"><a href="5-manip.html#mv2-summarize-variables-using-summarize"><i class="fa fa-check"></i><b>5.2.2</b> 5MV#2: Summarize variables using summarize</a></li>
+<li class="chapter" data-level="5.2.3" data-path="5-manip.html"><a href="5-manip.html#mv3-group-rows-using-group_by"><i class="fa fa-check"></i><b>5.2.3</b> 5MV#3: Group rows using group_by</a></li>
+<li class="chapter" data-level="5.2.4" data-path="5-manip.html"><a href="5-manip.html#mv4-create-new-variableschange-old-variables-using-mutate"><i class="fa fa-check"></i><b>5.2.4</b> 5MV#4: Create new variables/change old variables using mutate</a></li>
+<li class="chapter" data-level="5.2.5" data-path="5-manip.html"><a href="5-manip.html#arrange"><i class="fa fa-check"></i><b>5.2.5</b> 5MV#5: Reorder the data frame using arrange</a></li>
+</ul></li>
+<li class="chapter" data-level="5.3" data-path="5-manip.html"><a href="5-manip.html#joining-data-frames"><i class="fa fa-check"></i><b>5.3</b> Joining data frames</a><ul>
+<li class="chapter" data-level="5.3.1" data-path="5-manip.html"><a href="5-manip.html#joining-by-key-variables"><i class="fa fa-check"></i><b>5.3.1</b> Joining by Key Variables</a></li>
+<li class="chapter" data-level="5.3.2" data-path="5-manip.html"><a href="5-manip.html#joining-by-key-variables-with-different-names"><i class="fa fa-check"></i><b>5.3.2</b> Joining by Key Variables with Different Names</a></li>
+</ul></li>
+<li class="chapter" data-level="5.4" data-path="5-manip.html"><a href="5-manip.html#optional-other-verbs"><i class="fa fa-check"></i><b>5.4</b> Optional: Other verbs</a><ul>
+<li class="chapter" data-level="5.4.1" data-path="5-manip.html"><a href="5-manip.html#select"><i class="fa fa-check"></i><b>5.4.1</b> Select variables using select</a></li>
+<li class="chapter" data-level="5.4.2" data-path="5-manip.html"><a href="5-manip.html#rename"><i class="fa fa-check"></i><b>5.4.2</b> Rename variables using rename</a></li>
+<li class="chapter" data-level="5.4.3" data-path="5-manip.html"><a href="5-manip.html#find-the-top-number-of-values-using-top_n"><i class="fa fa-check"></i><b>5.4.3</b> Find the top number of values using top_n</a></li>
+</ul></li>
+<li class="chapter" data-level="5.5" data-path="5-manip.html"><a href="5-manip.html#conclusion-1"><i class="fa fa-check"></i><b>5.5</b> Conclusion</a><ul>
+<li class="chapter" data-level="5.5.1" data-path="5-manip.html"><a href="5-manip.html#resources-1"><i class="fa fa-check"></i><b>5.5.1</b> Resources</a></li>
+<li class="chapter" data-level="5.5.2" data-path="5-manip.html"><a href="5-manip.html#script-of-r-code-1"><i class="fa fa-check"></i><b>5.5.2</b> Script of R code</a></li>
+<li class="chapter" data-level="5.5.3" data-path="5-manip.html"><a href="5-manip.html#whats-to-come-2"><i class="fa fa-check"></i><b>5.5.3</b> What’s to come?</a></li>
+</ul></li>
 </ul></li>
 <li class="part"><span><b>II Inference</b></span></li>
-<li class="chapter" data-level="6" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html"><i class="fa fa-check"></i><b>6</b> Simulating Randomness via <code id="sim">mosaic</code></a><ul>
-<li class="chapter" data-level="" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#needed-packages-2"><i class="fa fa-check"></i>Needed packages</a></li>
-<li class="chapter" data-level="6.1" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#random-sampling"><i class="fa fa-check"></i><b>6.1</b> Random sampling</a><ul>
-<li class="chapter" data-level="6.1.1" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#tasting-soup"><i class="fa fa-check"></i><b>6.1.1</b> Tasting soup</a></li>
-<li class="chapter" data-level="6.1.2" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#common-terms"><i class="fa fa-check"></i><b>6.1.2</b> Common terms</a></li>
+<li class="chapter" data-level="6" data-path="6-sim.html"><a href="6-sim.html"><i class="fa fa-check"></i><b>6</b> Simulating Randomness via mosaic</a><ul>
+<li class="chapter" data-level="" data-path="6-sim.html"><a href="6-sim.html#needed-packages-3"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="6.1" data-path="6-sim.html"><a href="6-sim.html#random-sampling"><i class="fa fa-check"></i><b>6.1</b> Random sampling</a><ul>
+<li class="chapter" data-level="6.1.1" data-path="6-sim.html"><a href="6-sim.html#tasting-soup"><i class="fa fa-check"></i><b>6.1.1</b> Tasting soup</a></li>
+<li class="chapter" data-level="6.1.2" data-path="6-sim.html"><a href="6-sim.html#common-terms"><i class="fa fa-check"></i><b>6.1.2</b> Common terms</a></li>
+</ul></li>
+<li class="chapter" data-level="6.2" data-path="6-sim.html"><a href="6-sim.html#visualizing-sampling"><i class="fa fa-check"></i><b>6.2</b> Visualizing sampling</a><ul>
+<li class="chapter" data-level="6.2.1" data-path="6-sim.html"><a href="6-sim.html#sampling-distribution"><i class="fa fa-check"></i><b>6.2.1</b> Sampling distribution</a></li>
+<li class="chapter" data-level="6.2.2" data-path="6-sim.html"><a href="6-sim.html#repeated-sampling-via-do"><i class="fa fa-check"></i><b>6.2.2</b> Repeated sampling via <code>do</code></a></li>
 </ul></li>
-<li class="chapter" data-level="6.2" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#visualizing-sampling"><i class="fa fa-check"></i><b>6.2</b> Visualizing sampling</a><ul>
-<li class="chapter" data-level="6.2.1" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#sampling-distribution"><i class="fa fa-check"></i><b>6.2.1</b> Sampling distribution</a></li>
-<li class="chapter" data-level="6.2.2" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#repeated-sampling-via-do"><i class="fa fa-check"></i><b>6.2.2</b> Repeated sampling via <code>do</code></a></li>
+<li class="chapter" data-level="6.3" data-path="6-sim.html"><a href="6-sim.html#simulation"><i class="fa fa-check"></i><b>6.3</b> Simulation</a></li>
+<li class="chapter" data-level="6.4" data-path="6-sim.html"><a href="6-sim.html#review-of-mosaic-simulation-functions"><i class="fa fa-check"></i><b>6.4</b> Review of <code>mosaic</code> simulation functions</a></li>
+<li class="chapter" data-level="6.5" data-path="6-sim.html"><a href="6-sim.html#conclusion-2"><i class="fa fa-check"></i><b>6.5</b> Conclusion</a><ul>
+<li class="chapter" data-level="6.5.1" data-path="6-sim.html"><a href="6-sim.html#script-of-r-code-2"><i class="fa fa-check"></i><b>6.5.1</b> Script of R code</a></li>
+<li class="chapter" data-level="6.5.2" data-path="6-sim.html"><a href="6-sim.html#whats-to-come-3"><i class="fa fa-check"></i><b>6.5.2</b> What’s to come?</a></li>
 </ul></li>
-<li class="chapter" data-level="6.3" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#simulation"><i class="fa fa-check"></i><b>6.3</b> Simulation</a></li>
-<li class="chapter" data-level="6.4" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#review-of-mosaic-simulation-functions"><i class="fa fa-check"></i><b>6.4</b> Review of <code>mosaic</code> simulation functions</a></li>
-<li class="chapter" data-level="6.5" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#script-of-r-code-2"><i class="fa fa-check"></i><b>6.5</b> Script of R code</a></li>
-<li class="chapter" data-level="6.6" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#whats-to-come-3"><i class="fa fa-check"></i><b>6.6</b> What’s to come?</a></li>
 </ul></li>
 <li class="chapter" data-level="7" data-path="7-hypo.html"><a href="7-hypo.html"><i class="fa fa-check"></i><b>7</b> Hypothesis Testing</a><ul>
-<li class="chapter" data-level="" data-path="7-hypo.html"><a href="7-hypo.html#needed-packages-3"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="" data-path="7-hypo.html"><a href="7-hypo.html#needed-packages-4"><i class="fa fa-check"></i>Needed packages</a></li>
 <li class="chapter" data-level="7.1" data-path="7-hypo.html"><a href="7-hypo.html#when-inference-is-not-needed"><i class="fa fa-check"></i><b>7.1</b> When Inference Is Not Needed</a></li>
 <li class="chapter" data-level="7.2" data-path="7-hypo.html"><a href="7-hypo.html#basics-of-hypothesis-testing"><i class="fa fa-check"></i><b>7.2</b> Basics of Hypothesis Testing</a></li>
 <li class="chapter" data-level="7.3" data-path="7-hypo.html"><a href="7-hypo.html#trial"><i class="fa fa-check"></i><b>7.3</b> Criminal trial analogy</a><ul>
@@ -249,44 +262,50 @@
 <li class="chapter" data-level="7.8.1" data-path="7-hypo.html"><a href="7-hypo.html#example-t-test-for-two-independent-samples"><i class="fa fa-check"></i><b>7.8.1</b> EXAMPLE: <span class="math inline">\(t\)</span>-test for two independent samples</a></li>
 <li class="chapter" data-level="7.8.2" data-path="7-hypo.html"><a href="7-hypo.html#conditions-for-t-test"><i class="fa fa-check"></i><b>7.8.2</b> Conditions for t-test</a></li>
 </ul></li>
-<li class="chapter" data-level="7.9" data-path="7-hypo.html"><a href="7-hypo.html#whats-to-come-4"><i class="fa fa-check"></i><b>7.9</b> What’s to come?</a></li>
+<li class="chapter" data-level="7.9" data-path="7-hypo.html"><a href="7-hypo.html#conclusion-3"><i class="fa fa-check"></i><b>7.9</b> Conclusion</a><ul>
+<li class="chapter" data-level="7.9.1" data-path="7-hypo.html"><a href="7-hypo.html#script-of-r-code-3"><i class="fa fa-check"></i><b>7.9.1</b> Script of R code</a></li>
+<li class="chapter" data-level="7.9.2" data-path="7-hypo.html"><a href="7-hypo.html#whats-to-come-4"><i class="fa fa-check"></i><b>7.9.2</b> What’s to come?</a></li>
+</ul></li>
 </ul></li>
 <li class="chapter" data-level="8" data-path="8-ci.html"><a href="8-ci.html"><i class="fa fa-check"></i><b>8</b> Confidence Intervals</a><ul>
-<li class="chapter" data-level="" data-path="8-ci.html"><a href="8-ci.html#needed-packages-4"><i class="fa fa-check"></i>Needed packages</a></li>
-<li class="chapter" data-level="8.1" data-path="8-ci.html"><a href="8-ci.html#bootstrapping"><i class="fa fa-check"></i><b>8.1</b> Bootstrapping</a></li>
-<li class="chapter" data-level="8.2" data-path="8-ci.html"><a href="8-ci.html#mean_rating"><i class="fa fa-check"></i><b>8.2</b> mean_rating</a><ul>
-<li class="chapter" data-level="8.2.1" data-path="8-ci.html"><a href="8-ci.html#review-of-bootstrapping"><i class="fa fa-check"></i><b>8.2.1</b> Review of Bootstrapping</a></li>
-</ul></li>
-<li class="chapter" data-level="8.3" data-path="8-ci.html"><a href="8-ci.html#relation-to-hypothesis-testing"><i class="fa fa-check"></i><b>8.3</b> Relation to hypothesis testing</a></li>
-<li class="chapter" data-level="8.4" data-path="8-ci.html"><a href="8-ci.html#effect-size"><i class="fa fa-check"></i><b>8.4</b> Effect size</a></li>
-<li class="chapter" data-level="8.5" data-path="8-ci.html"><a href="8-ci.html#script-of-r-code-3"><i class="fa fa-check"></i><b>8.5</b> Script of R code</a></li>
-<li class="chapter" data-level="8.6" data-path="8-ci.html"><a href="8-ci.html#whats-to-come-5"><i class="fa fa-check"></i><b>8.6</b> What’s to come?</a></li>
-</ul></li>
-<li class="chapter" data-level="9" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html"><i class="fa fa-check"></i><b>9</b> Regression via <code id="regress">broom</code></a><ul>
-<li class="chapter" data-level="" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#needed-packages-5"><i class="fa fa-check"></i>Needed packages</a></li>
-<li class="chapter" data-level="9.1" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#example-alaskan-airlines-delays"><i class="fa fa-check"></i><b>9.1</b> EXAMPLE: Alaskan Airlines delays</a></li>
-<li class="chapter" data-level="9.2" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#correlation"><i class="fa fa-check"></i><b>9.2</b> Correlation</a><ul>
-<li class="chapter" data-level="9.2.1" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#correlation-does-not-imply-causation"><i class="fa fa-check"></i><b>9.2.1</b> Correlation does not imply causation</a></li>
-</ul></li>
-<li class="chapter" data-level="9.3" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#linear-regression"><i class="fa fa-check"></i><b>9.3</b> Linear regression</a><ul>
-<li class="chapter" data-level="9.3.1" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#understanding-linear-regression-basics"><i class="fa fa-check"></i><b>9.3.1</b> Understanding linear regression basics</a></li>
-<li class="chapter" data-level="9.3.2" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#the-equation-of-the-line"><i class="fa fa-check"></i><b>9.3.2</b> The equation of the line</a></li>
-<li class="chapter" data-level="9.3.3" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#interpretting-the-slope"><i class="fa fa-check"></i><b>9.3.3</b> Interpretting the slope</a></li>
-<li class="chapter" data-level="9.3.4" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#predicting-values"><i class="fa fa-check"></i><b>9.3.4</b> Predicting values</a></li>
-</ul></li>
-<li class="chapter" data-level="9.4" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#inference-for-regression"><i class="fa fa-check"></i><b>9.4</b> Inference for regression</a><ul>
-<li class="chapter" data-level="9.4.1" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#data-2"><i class="fa fa-check"></i><b>9.4.1</b> Data</a></li>
-<li class="chapter" data-level="9.4.2" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#test-statistic-delta-2"><i class="fa fa-check"></i><b>9.4.2</b> Test Statistic <span class="math inline">\(\delta\)</span></a></li>
-<li class="chapter" data-level="9.4.3" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#observed-effect-delta-2"><i class="fa fa-check"></i><b>9.4.3</b> Observed effect <span class="math inline">\(\delta^*\)</span></a></li>
-<li class="chapter" data-level="9.4.4" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#model-of-h_0-2"><i class="fa fa-check"></i><b>9.4.4</b> Model of <span class="math inline">\(H_0\)</span></a></li>
-<li class="chapter" data-level="9.4.5" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#simulated-data-2"><i class="fa fa-check"></i><b>9.4.5</b> Simulated Data</a></li>
-<li class="chapter" data-level="9.4.6" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#distribution-of-delta-under-h_0-2"><i class="fa fa-check"></i><b>9.4.6</b> Distribution of <span class="math inline">\(\delta\)</span> under <span class="math inline">\(H_0\)</span></a></li>
-<li class="chapter" data-level="9.4.7" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#the-p-value-2"><i class="fa fa-check"></i><b>9.4.7</b> The p-value</a></li>
-</ul></li>
-<li class="chapter" data-level="9.5" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#resid"><i class="fa fa-check"></i><b>9.5</b> Residual analysis</a></li>
-<li class="chapter" data-level="9.6" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#conditions-for-regression"><i class="fa fa-check"></i><b>9.6</b> Conditions for regression</a></li>
-<li class="chapter" data-level="9.7" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#script-of-r-code-4"><i class="fa fa-check"></i><b>9.7</b> Script of R code</a></li>
-<li class="chapter" data-level="9.8" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#whats-to-come-6"><i class="fa fa-check"></i><b>9.8</b> What’s to come?</a></li>
+<li class="chapter" data-level="" data-path="8-ci.html"><a href="8-ci.html#needed-packages-5"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="8.1" data-path="8-ci.html"><a href="8-ci.html#bootstrapping"><i class="fa fa-check"></i><b>8.1</b> Bootstrapping</a><ul>
+<li class="chapter" data-level="8.1.1" data-path="8-ci.html"><a href="8-ci.html#review-of-bootstrapping"><i class="fa fa-check"></i><b>8.1.1</b> Review of Bootstrapping</a></li>
+</ul></li>
+<li class="chapter" data-level="8.2" data-path="8-ci.html"><a href="8-ci.html#relation-to-hypothesis-testing"><i class="fa fa-check"></i><b>8.2</b> Relation to hypothesis testing</a></li>
+<li class="chapter" data-level="8.3" data-path="8-ci.html"><a href="8-ci.html#effect-size"><i class="fa fa-check"></i><b>8.3</b> Effect size</a></li>
+<li class="chapter" data-level="8.4" data-path="8-ci.html"><a href="8-ci.html#conclusion-4"><i class="fa fa-check"></i><b>8.4</b> Conclusion</a><ul>
+<li class="chapter" data-level="8.4.1" data-path="8-ci.html"><a href="8-ci.html#script-of-r-code-4"><i class="fa fa-check"></i><b>8.4.1</b> Script of R code</a></li>
+<li class="chapter" data-level="8.4.2" data-path="8-ci.html"><a href="8-ci.html#whats-to-come-5"><i class="fa fa-check"></i><b>8.4.2</b> What’s to come?</a></li>
+</ul></li>
+</ul></li>
+<li class="chapter" data-level="9" data-path="9-regress.html"><a href="9-regress.html"><i class="fa fa-check"></i><b>9</b> Regression via broom</a><ul>
+<li class="chapter" data-level="" data-path="9-regress.html"><a href="9-regress.html#needed-packages-6"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="9.1" data-path="9-regress.html"><a href="9-regress.html#example-alaskan-airlines-delays"><i class="fa fa-check"></i><b>9.1</b> EXAMPLE: Alaskan Airlines delays</a></li>
+<li class="chapter" data-level="9.2" data-path="9-regress.html"><a href="9-regress.html#correlation"><i class="fa fa-check"></i><b>9.2</b> Correlation</a><ul>
+<li class="chapter" data-level="9.2.1" data-path="9-regress.html"><a href="9-regress.html#correlation-does-not-imply-causation"><i class="fa fa-check"></i><b>9.2.1</b> Correlation does not imply causation</a></li>
+</ul></li>
+<li class="chapter" data-level="9.3" data-path="9-regress.html"><a href="9-regress.html#linear-regression"><i class="fa fa-check"></i><b>9.3</b> Linear regression</a><ul>
+<li class="chapter" data-level="9.3.1" data-path="9-regress.html"><a href="9-regress.html#understanding-linear-regression-basics"><i class="fa fa-check"></i><b>9.3.1</b> Understanding linear regression basics</a></li>
+<li class="chapter" data-level="9.3.2" data-path="9-regress.html"><a href="9-regress.html#the-equation-of-the-line"><i class="fa fa-check"></i><b>9.3.2</b> The equation of the line</a></li>
+<li class="chapter" data-level="9.3.3" data-path="9-regress.html"><a href="9-regress.html#interpreting-the-slope"><i class="fa fa-check"></i><b>9.3.3</b> Interpreting the slope</a></li>
+<li class="chapter" data-level="9.3.4" data-path="9-regress.html"><a href="9-regress.html#predicting-values"><i class="fa fa-check"></i><b>9.3.4</b> Predicting values</a></li>
+</ul></li>
+<li class="chapter" data-level="9.4" data-path="9-regress.html"><a href="9-regress.html#inference-for-regression"><i class="fa fa-check"></i><b>9.4</b> Inference for regression</a><ul>
+<li class="chapter" data-level="9.4.1" data-path="9-regress.html"><a href="9-regress.html#data-2"><i class="fa fa-check"></i><b>9.4.1</b> Data</a></li>
+<li class="chapter" data-level="9.4.2" data-path="9-regress.html"><a href="9-regress.html#test-statistic-delta-2"><i class="fa fa-check"></i><b>9.4.2</b> Test Statistic <span class="math inline">\(\delta\)</span></a></li>
+<li class="chapter" data-level="9.4.3" data-path="9-regress.html"><a href="9-regress.html#observed-effect-delta-2"><i class="fa fa-check"></i><b>9.4.3</b> Observed effect <span class="math inline">\(\delta^*\)</span></a></li>
+<li class="chapter" data-level="9.4.4" data-path="9-regress.html"><a href="9-regress.html#model-of-h_0-2"><i class="fa fa-check"></i><b>9.4.4</b> Model of <span class="math inline">\(H_0\)</span></a></li>
+<li class="chapter" data-level="9.4.5" data-path="9-regress.html"><a href="9-regress.html#simulated-data-2"><i class="fa fa-check"></i><b>9.4.5</b> Simulated Data</a></li>
+<li class="chapter" data-level="9.4.6" data-path="9-regress.html"><a href="9-regress.html#distribution-of-delta-under-h_0-2"><i class="fa fa-check"></i><b>9.4.6</b> Distribution of <span class="math inline">\(\delta\)</span> under <span class="math inline">\(H_0\)</span></a></li>
+<li class="chapter" data-level="9.4.7" data-path="9-regress.html"><a href="9-regress.html#the-p-value-2"><i class="fa fa-check"></i><b>9.4.7</b> The p-value</a></li>
+</ul></li>
+<li class="chapter" data-level="9.5" data-path="9-regress.html"><a href="9-regress.html#resid"><i class="fa fa-check"></i><b>9.5</b> Residual analysis</a></li>
+<li class="chapter" data-level="9.6" data-path="9-regress.html"><a href="9-regress.html#conditions-for-regression"><i class="fa fa-check"></i><b>9.6</b> Conditions for regression</a></li>
+<li class="chapter" data-level="9.7" data-path="9-regress.html"><a href="9-regress.html#conclusion-5"><i class="fa fa-check"></i><b>9.7</b> Conclusion</a><ul>
+<li class="chapter" data-level="9.7.1" data-path="9-regress.html"><a href="9-regress.html#script-of-r-code-5"><i class="fa fa-check"></i><b>9.7.1</b> Script of R code</a></li>
+<li class="chapter" data-level="9.7.2" data-path="9-regress.html"><a href="9-regress.html#whats-to-come-6"><i class="fa fa-check"></i><b>9.7.2</b> What’s to come?</a></li>
+</ul></li>
 </ul></li>
 <li class="part"><span><b>III Conclusion</b></span></li>
 <li class="chapter" data-level="10" data-path="10-effective-data-storytelling.html"><a href="10-effective-data-storytelling.html"><i class="fa fa-check"></i><b>10</b> Effective Data Storytelling</a><ul>
@@ -304,7 +323,8 @@
 </ul></li>
 </ul></li>
 <li class="chapter" data-level="B" data-path="B-appendixB.html"><a href="B-appendixB.html"><i class="fa fa-check"></i><b>B</b> Inference Examples</a><ul>
-<li class="chapter" data-level="B.1" data-path="B-appendixB.html"><a href="B-appendixB.html#needed-packages-6"><i class="fa fa-check"></i><b>B.1</b> Needed packages</a></li>
+<li class="chapter" data-level="" data-path="B-appendixB.html"><a href="B-appendixB.html#needed-packages-7"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="B.1" data-path="B-appendixB.html"><a href="B-appendixB.html#inference-mind-map"><i class="fa fa-check"></i><b>B.1</b> Inference Mind Map</a></li>
 <li class="chapter" data-level="B.2" data-path="B-appendixB.html"><a href="B-appendixB.html#one-mean"><i class="fa fa-check"></i><b>B.2</b> One Mean</a><ul>
 <li class="chapter" data-level="B.2.1" data-path="B-appendixB.html"><a href="B-appendixB.html#problem-statement"><i class="fa fa-check"></i><b>B.2.1</b> Problem Statement</a></li>
 <li class="chapter" data-level="B.2.2" data-path="B-appendixB.html"><a href="B-appendixB.html#competing-hypotheses"><i class="fa fa-check"></i><b>B.2.2</b> Competing Hypotheses</a></li>
@@ -352,6 +372,7 @@
 </ul></li>
 </ul></li>
 <li class="chapter" data-level="C" data-path="C-appendixC.html"><a href="C-appendixC.html"><i class="fa fa-check"></i><b>C</b> Reach for the Stars</a><ul>
+<li class="chapter" data-level="" data-path="C-appendixC.html"><a href="C-appendixC.html#needed-packages-8"><i class="fa fa-check"></i>Needed packages</a></li>
 <li class="chapter" data-level="C.1" data-path="C-appendixC.html"><a href="C-appendixC.html#sorted-barplots"><i class="fa fa-check"></i><b>C.1</b> Sorted barplots</a></li>
 <li class="chapter" data-level="C.2" data-path="C-appendixC.html"><a href="C-appendixC.html#interactive-graphics"><i class="fa fa-check"></i><b>C.2</b> Interactive graphics</a><ul>
 <li class="chapter" data-level="C.2.1" data-path="C-appendixC.html"><a href="C-appendixC.html#interactive-line-graphs"><i class="fa fa-check"></i><b>C.2.1</b> Interactive line-graphs</a></li>
@@ -385,27 +406,27 @@ <h2><span class="header-section-number">2.1</span> Preamble</h2>
 <li>“Intro Stat with Randomization and Simulation” <span class="citation">(Diez, Barr, and Çetinkaya-Rundel <a href="#ref-isrs2014">2014</a>)</span>, and</li>
 <li>“R for Data Science” <span class="citation">(Grolemund and Wickham <a href="#ref-rds2016">2016</a>)</span>.</li>
 </ul>
-<p>The first book, while designed for upper-level undergraduates and graduate students, provides an excellent resource on how to use resampling to build statistical concepts like normal distributions using computers instead of focusing on memorization of formulas. The last two books also provide a path towards free alternatives to the traditionally expensive introductory statistics textbook. When looking over the vast number of introductory statistics textbooks we found that there wasn’t one that incorporated many of the new R packages directly into the text. Additionally, there wasn’t an open-source, free textbook available that showed new learners all of the following</p>
+<p>The first book, while designed for upper-level undergraduates and graduate students, provides an excellent resource on how to use resampling to build statistical concepts like normal distributions using computers instead of focusing on memorization of formulas. The last two books also provide a path towards free alternatives to the traditionally expensive introductory statistics textbook. When looking over the vast number of introductory statistics textbooks, we found that there wasn’t one that incorporated many of the new R packages directly into the text. Additionally, there wasn’t an open-source, free textbook available that showed new learners all of the following:</p>
 <ol style="list-style-type: decimal">
 <li>how to use R to explore and visualize data</li>
 <li>how to use randomization and simulation to build inferential ideas</li>
 <li>how to effectively create stories using these ideas to convey information to a lay audience.</li>
 </ol>
 <p>We will introduce sometimes difficult statistics concepts through the medium of data visualization. In today’s world, we are bombarded with graphics that attempt to convey ideas. We will explore what makes a good graphic and what the standard ways are to convey relationships with data. You’ll also see the use of visualization to introduce concepts like mean, median, standard deviation, distributions, etc. In general, we’ll use visualization as a way of building almost all of the ideas in this book.</p>
-<p>Additionally, this book will focus on the triad of computational thinking, data thinking, and inferential thinking. We’ll see throughout the book how these three modes of thinking can build effective ways to work with, describe, and convey statistical knowledge. In order to do so, you’ll see the importance of literate programming to develop literate data science. In other words, you’ll see how to write code and descriptions that are useful not just for a computer to execute but also for readers to understand exactly what a statistical analysis is doing and how it works. Hal Abelson coined the phrase that we will follow throughout this book:</p>
+<p>Additionally, this book will focus on the triad of computational thinking, data thinking, and inferential thinking. We’ll see throughout the book how these three modes of thinking can build effective ways to work with, to describe, and to convey statistical knowledge. In order to do so, you’ll see the importance of literate programming to develop literate data science. In other words, you’ll see how to write code and descriptions that are useful not just for a computer to execute but also for readers to understand exactly what a statistical analysis is doing and how it works. Hal Abelson coined the phrase that we will follow throughout this book:</p>
 <blockquote>
 <p>“Programs must be written for people to read, and only incidentally for machines to execute.”</p>
 </blockquote>
 </div>
 <div id="three-driving-data-sources" class="section level2">
 <h2><span class="header-section-number">2.2</span> Three driving data sources</h2>
-<p>Instead of hopping from one data set to the next, we’ve decided to focus throughout the book on three different data sources:</p>
+<p>Instead of hopping from one data set to the next in the text of this book, we’ve decided to focus throughout on three different data sources:</p>
 <ul>
 <li>flights leaving New York City in 2013</li>
 <li>profiles of OKCupid users in San Francisco</li>
 <li>IMDB movie ratings</li>
 </ul>
-<p>By focusing on just three large data sources, it is our hope that you’ll be able to see how each of the chapters is interconnected. You’ll see how the data being tidy leads into data visualization and manipulation and how those concepts tie into inference and regression.</p>
+<p>By focusing on just three large data sources, it is our hope that you’ll be able to see how each of the chapters is interconnected. You’ll see how the data being tidy leads into data visualization and manipulation in exploratory data analysis and how those concepts tie into inference and regression.</p>
 </div>
 <div id="datascience-pipeline" class="section level2">
 <h2><span class="header-section-number">2.3</span> Data/science pipeline</h2>
@@ -416,6 +437,7 @@ <h2><span class="header-section-number">2.3</span> Data/science pipeline</h2>
 <li>data visualization</li>
 <li>data modeling</li>
 <li>inference</li>
+<li>correlation and regression</li>
 <li>interpretation of results</li>
 <li>data storytelling</li>
 </ul>
@@ -427,7 +449,7 @@ <h2><span class="header-section-number">2.3</span> Data/science pipeline</h2>
 </p>
 </div>
 <p>We will begin with a discussion on what is meant by tidy data and then dig into the gray <strong>Understand</strong> portion of the cycle and conclude by talking about interpreting and discussing the results of our models via <strong>Communication</strong>. These steps are vital to any statistical analysis. But why should you care about statistics? “Why did they make me take this class?”</p>
-<p>There’s a reason so many fields require a statistics course. Scientific knowledge grows through an understanding of statistical significance and data analysis. You needn’t be intimidated by statistics. It’s not the beast that it used to be and paired with computation you’ll see how reproducible research in the sciences particularly increases scientific knowledge.</p>
+<p>There’s a reason so many fields require a statistics course. Scientific knowledge grows through an understanding of statistical significance and data analysis. You needn’t be intimidated by statistics. It’s not the beast that it used to be and, paired with computation, you’ll see how reproducible research in the sciences particularly increases scientific knowledge.</p>
 </div>
 <div id="reproducibility" class="section level2">
 <h2><span class="header-section-number">2.4</span> Reproducibility</h2>
@@ -435,16 +457,16 @@ <h2><span class="header-section-number">2.4</span> Reproducibility</h2>
 <p>“The most important tool is the <em>mindset</em>, when starting, that the end product will be reproducible.” – Keith Baggerly</p>
 </blockquote>
 <p>Another large goal of this book is to help readers understand the importance of reproducible analyses. The hope is to get readers into the habit of making their analyses reproducible from the very beginning. This means we’ll be trying to help you build new habits. This will take practice and be difficult at times. You’ll see just why it is so important for you to keep track of your code and well-document it to help yourself later and any potential collaborators as well.</p>
-<p>Copying and pasting is not the way that efficient and effective scientific research is conducted. It’s much more important for time to be spent on data collection and data analysis and not on copying and pasting plots back and forth across a variety of programs.</p>
+<p>Copying and pasting results from one program into a word processor is not the way that efficient and effective scientific research is conducted. It’s much more important for time to be spent on data collection and data analysis and not on copying and pasting plots back and forth across a variety of programs.</p>
 <p>In a traditional analysis, if an error was made with the original data, we’d need to step through the entire process again: recreate the plots and copy and paste all of the new plots and our statistical analysis into our document. This is error-prone and a frustrating use of time. We’ll see how to use R Markdown to get away from this tedious activity so that we can spend more time doing science.</p>
 <blockquote>
 <p>“We are talking about <em>computational</em> reproducibility.” - Yihui Xie</p>
 </blockquote>
-<p>Reproducibility means a lot of things in terms of different scientific fields. Are experiments conducted in a way that another researcher could follow the steps and get similar results? In this book, we will focus on what is known as <strong>computational reproducibility</strong>. This refers to being able to pass all of one’s data analysis and conclusions to someone else and have them get exactly the same results on their machine. This allows for time to be spent doing actual science and interpreting of results and assumptions instead of the more error prone way of starting from scratch or follow a list of steps that may be different from machine to machine.</p>
+<p>Reproducibility means a lot of things in terms of different scientific fields. Are experiments conducted in a way that another researcher could follow the steps and get similar results? In this book, we will focus on what is known as <strong>computational reproducibility</strong>. This refers to being able to pass all of one’s data analysis, data sets, and conclusions to someone else and have them get exactly the same results on their machine. This allows for time to be spent doing actual science and interpreting results and assumptions instead of the more error-prone way of starting from scratch or following a list of steps that may be different from machine to machine.</p>
 </div>
 <div id="who-is-this-book-for" class="section level2">
 <h2><span class="header-section-number">2.5</span> Who is this book for?</h2>
-<p>This book is targeted at students taking a traditional intro stats class in a small college environment using RStudio and preferably RStudio Server. We assume no prerequisites: no calculus and no prior programming experience. This is intended to be a gentle and nice introduction to the practice of statistics in terms of how data scientists, statisticians, and other scientists analyze data and write stories about data. We have intentionally avoided the use of throwing formulas at you and instead have focused on developing statistical concepts via data visualization and statistical computing. We hope this is a more intuitive experience than the way statistics has traditionally been taught in the past (and how it is commonly perceived from the outside). We additionally hope that you see the value of reproducible research via R as you continue in your studies. We understand that there will initially be growing pains in learning to program but we are here to help you and you should know that there is a huge community of R users that are always happy to help newbies along.</p>
+<p>This book is targeted at students taking a traditional intro stats class in a small college environment using RStudio and preferably RStudio Server. We assume no prerequisites: no algebra, no calculus, and no prior programming experience. This is intended to be a gentle and nice introduction to the practice of statistics in terms of how data scientists, statisticians, data journalists, and other scientists analyze data and write stories about data. We have intentionally avoided throwing formulas at you as much as possible and instead have focused on developing statistical concepts via data visualization and statistical computing. We hope this is a more intuitive experience than the way statistics has traditionally been taught in the past (and how it is commonly perceived from the outside). We additionally hope that you see the value of reproducible research via R as you continue in your studies. We understand that there will initially be growing pains in learning to program, but we are here to help you, and you should know that there is a huge community of R users who are always happy to help newbies along as well.</p>
 <p>Now let’s get into learning about how to create good stories about and with data!</p>
 
 </div>
diff --git a/docs/3-tidy.html b/docs/3-tidy.html
index 23fd8a6c6..8d32c6d3a 100644
--- a/docs/3-tidy.html
+++ b/docs/3-tidy.html
@@ -26,7 +26,7 @@
 <meta name="author" content="Chester Ismay and Albert Y. Kim">
 
 
-<meta name="date" content="2017-01-07">
+<meta name="date" content="2017-01-10">
 
   <meta name="viewport" content="width=device-width, initial-scale=1">
   <meta name="apple-mobile-web-app-capable" content="yes">
@@ -34,7 +34,7 @@
   
   
 <link rel="prev" href="2-intro.html">
-<link rel="next" href="4-data-visualization-via-ggplot2.html">
+<link rel="next" href="4-viz.html">
 
 <script src="libs/jquery-2.2.3/jquery.min.js"></script>
 <link href="libs/gitbook-2.6.7/css/style.css" rel="stylesheet" />
@@ -119,9 +119,9 @@
 
 <ul class="summary">
 <li class="chapter" data-level="1" data-path="index.html"><a href="index.html"><i class="fa fa-check"></i><b>1</b> Preamble</a><ul>
-<li class="chapter" data-level="1.1" data-path="index.html"><a href="index.html#principles-of-this-book"><i class="fa fa-check"></i><b>1.1</b> Principles of this Book</a></li>
+<li class="chapter" data-level="1.1" data-path="index.html"><a href="index.html#principles-of-this-book---for-instructors"><i class="fa fa-check"></i><b>1.1</b> Principles of this Book - For Instructors</a></li>
 <li class="chapter" data-level="1.2" data-path="index.html"><a href="index.html#contribute"><i class="fa fa-check"></i><b>1.2</b> Contribute</a></li>
-<li class="chapter" data-level="1.3" data-path="index.html"><a href="index.html#getting-started"><i class="fa fa-check"></i><b>1.3</b> Getting Started</a></li>
+<li class="chapter" data-level="1.3" data-path="index.html"><a href="index.html#getting-started---for-students"><i class="fa fa-check"></i><b>1.3</b> Getting Started - For Students</a></li>
 <li class="chapter" data-level="" data-path="index.html"><a href="index.html#colophon"><i class="fa fa-check"></i>Colophon</a></li>
 </ul></li>
 <li class="chapter" data-level="2" data-path="2-intro.html"><a href="2-intro.html"><i class="fa fa-check"></i><b>2</b> Introduction</a><ul>
@@ -133,87 +133,100 @@
 </ul></li>
 <li class="part"><span><b>I Data Exploration</b></span></li>
 <li class="chapter" data-level="3" data-path="3-tidy.html"><a href="3-tidy.html"><i class="fa fa-check"></i><b>3</b> Tidy Data</a><ul>
+<li class="chapter" data-level="" data-path="3-tidy.html"><a href="3-tidy.html#needed-packages"><i class="fa fa-check"></i>Needed packages</a></li>
 <li class="chapter" data-level="3.1" data-path="3-tidy.html"><a href="3-tidy.html#what-is-tidy-data"><i class="fa fa-check"></i><b>3.1</b> What is tidy data?</a></li>
-<li class="chapter" data-level="3.2" data-path="3-tidy.html"><a href="3-tidy.html#the-nycflights13-datasets"><i class="fa fa-check"></i><b>3.2</b> The <code>nycflights13</code> datasets</a></li>
-<li class="chapter" data-level="3.3" data-path="3-tidy.html"><a href="3-tidy.html#how-is-flights-tidy"><i class="fa fa-check"></i><b>3.3</b> How is <code>flights</code> tidy?</a></li>
+<li class="chapter" data-level="3.2" data-path="3-tidy.html"><a href="3-tidy.html#datasets-in-the-nycflights13-package"><i class="fa fa-check"></i><b>3.2</b> Datasets in the <code>nycflights13</code> package</a></li>
+<li class="chapter" data-level="3.3" data-path="3-tidy.html"><a href="3-tidy.html#how-is-flights-tidy"><i class="fa fa-check"></i><b>3.3</b> How is <code>flights</code> tidy?</a><ul>
+<li class="chapter" data-level="3.3.1" data-path="3-tidy.html"><a href="3-tidy.html#identification-variables"><i class="fa fa-check"></i><b>3.3.1</b> Identification variables</a></li>
+</ul></li>
 <li class="chapter" data-level="3.4" data-path="3-tidy.html"><a href="3-tidy.html#normal-forms-of-data"><i class="fa fa-check"></i><b>3.4</b> Normal forms of data</a></li>
 <li class="chapter" data-level="3.5" data-path="3-tidy.html"><a href="3-tidy.html#whats-to-come"><i class="fa fa-check"></i><b>3.5</b> What’s to come?</a></li>
 </ul></li>
-<li class="chapter" data-level="4" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html"><i class="fa fa-check"></i><b>4</b> Data Visualization via <code id="viz">ggplot2</code></a><ul>
-<li class="chapter" data-level="" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#needed-packages"><i class="fa fa-check"></i>Needed packages</a></li>
-<li class="chapter" data-level="4.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#grammarofgraphics"><i class="fa fa-check"></i><b>4.1</b> The Grammar of Graphics</a><ul>
-<li class="chapter" data-level="4.1.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#components-of-grammar"><i class="fa fa-check"></i><b>4.1.1</b> Components of Grammar</a></li>
-<li class="chapter" data-level="4.1.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#napoleans-march-on-moscow"><i class="fa fa-check"></i><b>4.1.2</b> Napolean’s March on Moscow</a></li>
-<li class="chapter" data-level="4.1.3" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#other-components-of-the-grammar"><i class="fa fa-check"></i><b>4.1.3</b> Other Components of the Grammar</a></li>
-<li class="chapter" data-level="4.1.4" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#the-ggplot2-package"><i class="fa fa-check"></i><b>4.1.4</b> The <code>ggplot2</code> Package</a></li>
-</ul></li>
-<li class="chapter" data-level="4.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#five-named-graphs---the-5ng"><i class="fa fa-check"></i><b>4.2</b> Five Named Graphs - The 5NG</a></li>
-<li class="chapter" data-level="4.3" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#scatterplots"><i class="fa fa-check"></i><b>4.3</b> 5NG#1: Scatter-plots</a><ul>
-<li class="chapter" data-level="4.3.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#scatter-plots-via-geom_point"><i class="fa fa-check"></i><b>4.3.1</b> Scatter-plots via <code id="geompoint">geom_point</code></a></li>
-<li class="chapter" data-level="4.3.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#over-plotting"><i class="fa fa-check"></i><b>4.3.2</b> Over-Plotting</a></li>
-<li class="chapter" data-level="4.3.3" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#summary"><i class="fa fa-check"></i><b>4.3.3</b> Summary</a></li>
-</ul></li>
-<li class="chapter" data-level="4.4" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#linegraphs"><i class="fa fa-check"></i><b>4.4</b> 5NG#2: Line-graphs</a><ul>
-<li class="chapter" data-level="4.4.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#line-graphs-via-geom_line"><i class="fa fa-check"></i><b>4.4.1</b> Line-graphs via <code id="geomline">geom_line</code></a></li>
-<li class="chapter" data-level="4.4.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#summary-1"><i class="fa fa-check"></i><b>4.4.2</b> Summary</a></li>
-</ul></li>
-<li class="chapter" data-level="4.5" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#histograms"><i class="fa fa-check"></i><b>4.5</b> 5NG#3: Histograms</a><ul>
-<li class="chapter" data-level="4.5.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#histograms-via-geom_histogram"><i class="fa fa-check"></i><b>4.5.1</b> Histograms via <code id="geomhistogram">geom_histogram</code></a></li>
-<li class="chapter" data-level="4.5.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#adjustbins"><i class="fa fa-check"></i><b>4.5.2</b> Adjusting the Bins</a></li>
-<li class="chapter" data-level="4.5.3" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#summary-2"><i class="fa fa-check"></i><b>4.5.3</b> Summary</a></li>
-</ul></li>
-<li class="chapter" data-level="4.6" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#facets"><i class="fa fa-check"></i><b>4.6</b> Facets</a></li>
-<li class="chapter" data-level="4.7" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#ng4-boxplots"><i class="fa fa-check"></i><b>4.7</b> 5NG#4: Boxplots</a><ul>
-<li class="chapter" data-level="4.7.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#boxplots-via-geom_boxplot"><i class="fa fa-check"></i><b>4.7.1</b> Boxplots via <code id="geomboxplot">geom_boxplot</code></a></li>
-<li class="chapter" data-level="4.7.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#summary-3"><i class="fa fa-check"></i><b>4.7.2</b> Summary</a></li>
-</ul></li>
-<li class="chapter" data-level="4.8" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#geombar"><i class="fa fa-check"></i><b>4.8</b> 5NG#5: Barplots</a><ul>
-<li class="chapter" data-level="4.8.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#barplots-via-geom_bar"><i class="fa fa-check"></i><b>4.8.1</b> Barplots via <code>geom_bar</code></a></li>
-<li class="chapter" data-level="4.8.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#must-avoid-pie-charts"><i class="fa fa-check"></i><b>4.8.2</b> Must avoid pie charts!</a></li>
-<li class="chapter" data-level="4.8.3" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#using-barplots-to-compare-two-variables"><i class="fa fa-check"></i><b>4.8.3</b> Using barplots to compare two variables</a></li>
-<li class="chapter" data-level="4.8.4" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#summary-4"><i class="fa fa-check"></i><b>4.8.4</b> Summary</a></li>
-</ul></li>
-<li class="chapter" data-level="4.9" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#conclusion"><i class="fa fa-check"></i><b>4.9</b> Conclusion</a><ul>
-<li class="chapter" data-level="4.9.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#whats-to-come-1"><i class="fa fa-check"></i><b>4.9.1</b> What’s to come?</a></li>
-<li class="chapter" data-level="4.9.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#script-of-r-code"><i class="fa fa-check"></i><b>4.9.2</b> Script of R code</a></li>
-</ul></li>
-</ul></li>
-<li class="chapter" data-level="5" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html"><i class="fa fa-check"></i><b>5</b> Data Manipulation via <code id="manip">dplyr</code></a><ul>
-<li class="chapter" data-level="" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#needed-packages-1"><i class="fa fa-check"></i>Needed packages</a></li>
-<li class="chapter" data-level="5.1" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#the-pipe"><i class="fa fa-check"></i><b>5.1</b> The pipe <code>%&gt;%</code></a></li>
-<li class="chapter" data-level="5.2" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#four-main-verbs---the-4mv"><i class="fa fa-check"></i><b>5.2</b> Four Main Verbs - The 4MV</a><ul>
-<li class="chapter" data-level="5.2.1" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#filter-observations-using-filter"><i class="fa fa-check"></i><b>5.2.1</b> Filter observations using <code id="filter">filter</code></a></li>
-<li class="chapter" data-level="5.2.2" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#summarize-variables-using-summarize"><i class="fa fa-check"></i><b>5.2.2</b> Summarize variables using <code>summarize</code></a></li>
-<li class="chapter" data-level="5.2.3" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#create-new-variableschange-old-variables-using-mutate"><i class="fa fa-check"></i><b>5.2.3</b> Create new variables/change old variables using <code>mutate</code></a></li>
-<li class="chapter" data-level="5.2.4" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#reorder-the-data-frame-using-arrange"><i class="fa fa-check"></i><b>5.2.4</b> Reorder the data frame using <code id="arrange">arrange</code></a></li>
-</ul></li>
-<li class="chapter" data-level="5.3" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#other-verbs"><i class="fa fa-check"></i><b>5.3</b> Other verbs</a><ul>
-<li class="chapter" data-level="5.3.1" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#select-variables-using-select"><i class="fa fa-check"></i><b>5.3.1</b> Select variables using <code id="select">select</code></a></li>
-<li class="chapter" data-level="5.3.2" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#rename-variables-using-rename"><i class="fa fa-check"></i><b>5.3.2</b> Rename variables using <code id="rename">rename</code></a></li>
-<li class="chapter" data-level="5.3.3" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#find-the-top-number-of-values-using-top_n"><i class="fa fa-check"></i><b>5.3.3</b> Find the top number of values using <code>top_n</code></a></li>
-</ul></li>
-<li class="chapter" data-level="5.4" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#joiningmerging-data-frames"><i class="fa fa-check"></i><b>5.4</b> Joining/merging data frames</a></li>
-<li class="chapter" data-level="5.5" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#script-of-r-code-1"><i class="fa fa-check"></i><b>5.5</b> Script of R code</a></li>
-<li class="chapter" data-level="5.6" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#whats-to-come-2"><i class="fa fa-check"></i><b>5.6</b> What’s to come?</a></li>
+<li class="chapter" data-level="4" data-path="4-viz.html"><a href="4-viz.html"><i class="fa fa-check"></i><b>4</b> Data Visualization via ggplot2</a><ul>
+<li class="chapter" data-level="" data-path="4-viz.html"><a href="4-viz.html#needed-packages-1"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="4.1" data-path="4-viz.html"><a href="4-viz.html#grammarofgraphics"><i class="fa fa-check"></i><b>4.1</b> The Grammar of Graphics</a><ul>
+<li class="chapter" data-level="4.1.1" data-path="4-viz.html"><a href="4-viz.html#components-of-grammar"><i class="fa fa-check"></i><b>4.1.1</b> Components of Grammar</a></li>
+<li class="chapter" data-level="4.1.2" data-path="4-viz.html"><a href="4-viz.html#napoleans-march-on-moscow"><i class="fa fa-check"></i><b>4.1.2</b> Napolean’s March on Moscow</a></li>
+<li class="chapter" data-level="4.1.3" data-path="4-viz.html"><a href="4-viz.html#other-components-of-the-grammar"><i class="fa fa-check"></i><b>4.1.3</b> Other Components of the Grammar</a></li>
+<li class="chapter" data-level="4.1.4" data-path="4-viz.html"><a href="4-viz.html#the-ggplot2-package"><i class="fa fa-check"></i><b>4.1.4</b> The ggplot2 Package</a></li>
+</ul></li>
+<li class="chapter" data-level="4.2" data-path="4-viz.html"><a href="4-viz.html#five-named-graphs---the-5ng"><i class="fa fa-check"></i><b>4.2</b> Five Named Graphs - The 5NG</a></li>
+<li class="chapter" data-level="4.3" data-path="4-viz.html"><a href="4-viz.html#scatterplots"><i class="fa fa-check"></i><b>4.3</b> 5NG#1: Scatter-plots</a><ul>
+<li class="chapter" data-level="4.3.1" data-path="4-viz.html"><a href="4-viz.html#geompoint"><i class="fa fa-check"></i><b>4.3.1</b> Scatter-plots via geom_point</a></li>
+<li class="chapter" data-level="4.3.2" data-path="4-viz.html"><a href="4-viz.html#over-plotting"><i class="fa fa-check"></i><b>4.3.2</b> Over-Plotting</a></li>
+<li class="chapter" data-level="4.3.3" data-path="4-viz.html"><a href="4-viz.html#summary"><i class="fa fa-check"></i><b>4.3.3</b> Summary</a></li>
+</ul></li>
+<li class="chapter" data-level="4.4" data-path="4-viz.html"><a href="4-viz.html#linegraphs"><i class="fa fa-check"></i><b>4.4</b> 5NG#2: Line-graphs</a><ul>
+<li class="chapter" data-level="4.4.1" data-path="4-viz.html"><a href="4-viz.html#geomline"><i class="fa fa-check"></i><b>4.4.1</b> Line-graphs via geom_line</a></li>
+<li class="chapter" data-level="4.4.2" data-path="4-viz.html"><a href="4-viz.html#summary-1"><i class="fa fa-check"></i><b>4.4.2</b> Summary</a></li>
+</ul></li>
+<li class="chapter" data-level="4.5" data-path="4-viz.html"><a href="4-viz.html#histograms"><i class="fa fa-check"></i><b>4.5</b> 5NG#3: Histograms</a><ul>
+<li class="chapter" data-level="4.5.1" data-path="4-viz.html"><a href="4-viz.html#geomhistogram"><i class="fa fa-check"></i><b>4.5.1</b> Histograms via geom_histogram</a></li>
+<li class="chapter" data-level="4.5.2" data-path="4-viz.html"><a href="4-viz.html#adjustbins"><i class="fa fa-check"></i><b>4.5.2</b> Adjusting the Bins</a></li>
+<li class="chapter" data-level="4.5.3" data-path="4-viz.html"><a href="4-viz.html#summary-2"><i class="fa fa-check"></i><b>4.5.3</b> Summary</a></li>
+</ul></li>
+<li class="chapter" data-level="4.6" data-path="4-viz.html"><a href="4-viz.html#facets"><i class="fa fa-check"></i><b>4.6</b> Facets</a></li>
+<li class="chapter" data-level="4.7" data-path="4-viz.html"><a href="4-viz.html#ng4-boxplots"><i class="fa fa-check"></i><b>4.7</b> 5NG#4: Boxplots</a><ul>
+<li class="chapter" data-level="4.7.1" data-path="4-viz.html"><a href="4-viz.html#geomboxplot"><i class="fa fa-check"></i><b>4.7.1</b> Boxplots via geom_boxplot</a></li>
+<li class="chapter" data-level="4.7.2" data-path="4-viz.html"><a href="4-viz.html#summary-3"><i class="fa fa-check"></i><b>4.7.2</b> Summary</a></li>
+</ul></li>
+<li class="chapter" data-level="4.8" data-path="4-viz.html"><a href="4-viz.html#geombar"><i class="fa fa-check"></i><b>4.8</b> 5NG#5: Barplots</a><ul>
+<li class="chapter" data-level="4.8.1" data-path="4-viz.html"><a href="4-viz.html#barplots-via-geom_bar"><i class="fa fa-check"></i><b>4.8.1</b> Barplots via geom_bar</a></li>
+<li class="chapter" data-level="4.8.2" data-path="4-viz.html"><a href="4-viz.html#must-avoid-pie-charts"><i class="fa fa-check"></i><b>4.8.2</b> Must avoid pie charts!</a></li>
+<li class="chapter" data-level="4.8.3" data-path="4-viz.html"><a href="4-viz.html#using-barplots-to-compare-two-variables"><i class="fa fa-check"></i><b>4.8.3</b> Using barplots to compare two variables</a></li>
+<li class="chapter" data-level="4.8.4" data-path="4-viz.html"><a href="4-viz.html#summary-4"><i class="fa fa-check"></i><b>4.8.4</b> Summary</a></li>
+</ul></li>
+<li class="chapter" data-level="4.9" data-path="4-viz.html"><a href="4-viz.html#conclusion"><i class="fa fa-check"></i><b>4.9</b> Conclusion</a><ul>
+<li class="chapter" data-level="4.9.1" data-path="4-viz.html"><a href="4-viz.html#resources"><i class="fa fa-check"></i><b>4.9.1</b> Resources</a></li>
+<li class="chapter" data-level="4.9.2" data-path="4-viz.html"><a href="4-viz.html#script-of-r-code"><i class="fa fa-check"></i><b>4.9.2</b> Script of R code</a></li>
+<li class="chapter" data-level="4.9.3" data-path="4-viz.html"><a href="4-viz.html#whats-to-come-1"><i class="fa fa-check"></i><b>4.9.3</b> What’s to come?</a></li>
+</ul></li>
+</ul></li>
+<li class="chapter" data-level="5" data-path="5-manip.html"><a href="5-manip.html"><i class="fa fa-check"></i><b>5</b> Data Manipulation via dplyr</a><ul>
+<li class="chapter" data-level="" data-path="5-manip.html"><a href="5-manip.html#needed-packages-2"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="5.1" data-path="5-manip.html"><a href="5-manip.html#the-pipe"><i class="fa fa-check"></i><b>5.1</b> The pipe <code>%&gt;%</code></a></li>
+<li class="chapter" data-level="5.2" data-path="5-manip.html"><a href="5-manip.html#five-main-verbs---the-5mv"><i class="fa fa-check"></i><b>5.2</b> Five Main Verbs - The 5MV</a><ul>
+<li class="chapter" data-level="5.2.1" data-path="5-manip.html"><a href="5-manip.html#filter"><i class="fa fa-check"></i><b>5.2.1</b> 5MV#1: Filter observations using filter</a></li>
+<li class="chapter" data-level="5.2.2" data-path="5-manip.html"><a href="5-manip.html#mv2-summarize-variables-using-summarize"><i class="fa fa-check"></i><b>5.2.2</b> 5MV#2: Summarize variables using summarize</a></li>
+<li class="chapter" data-level="5.2.3" data-path="5-manip.html"><a href="5-manip.html#mv3-group-rows-using-group_by"><i class="fa fa-check"></i><b>5.2.3</b> 5MV#3: Group rows using group_by</a></li>
+<li class="chapter" data-level="5.2.4" data-path="5-manip.html"><a href="5-manip.html#mv4-create-new-variableschange-old-variables-using-mutate"><i class="fa fa-check"></i><b>5.2.4</b> 5MV#4: Create new variables/change old variables using mutate</a></li>
+<li class="chapter" data-level="5.2.5" data-path="5-manip.html"><a href="5-manip.html#arrange"><i class="fa fa-check"></i><b>5.2.5</b> 5MV#5: Reorder the data frame using arrange</a></li>
+</ul></li>
+<li class="chapter" data-level="5.3" data-path="5-manip.html"><a href="5-manip.html#joining-data-frames"><i class="fa fa-check"></i><b>5.3</b> Joining data frames</a><ul>
+<li class="chapter" data-level="5.3.1" data-path="5-manip.html"><a href="5-manip.html#joining-by-key-variables"><i class="fa fa-check"></i><b>5.3.1</b> Joining by Key Variables</a></li>
+<li class="chapter" data-level="5.3.2" data-path="5-manip.html"><a href="5-manip.html#joining-by-key-variables-with-different-names"><i class="fa fa-check"></i><b>5.3.2</b> Joining by Key Variables with Different Names</a></li>
+</ul></li>
+<li class="chapter" data-level="5.4" data-path="5-manip.html"><a href="5-manip.html#optional-other-verbs"><i class="fa fa-check"></i><b>5.4</b> Optional: Other verbs</a><ul>
+<li class="chapter" data-level="5.4.1" data-path="5-manip.html"><a href="5-manip.html#select"><i class="fa fa-check"></i><b>5.4.1</b> Select variables using select</a></li>
+<li class="chapter" data-level="5.4.2" data-path="5-manip.html"><a href="5-manip.html#rename"><i class="fa fa-check"></i><b>5.4.2</b> Rename variables using rename</a></li>
+<li class="chapter" data-level="5.4.3" data-path="5-manip.html"><a href="5-manip.html#find-the-top-number-of-values-using-top_n"><i class="fa fa-check"></i><b>5.4.3</b> Find the top number of values using top_n</a></li>
+</ul></li>
+<li class="chapter" data-level="5.5" data-path="5-manip.html"><a href="5-manip.html#conclusion-1"><i class="fa fa-check"></i><b>5.5</b> Conclusion</a><ul>
+<li class="chapter" data-level="5.5.1" data-path="5-manip.html"><a href="5-manip.html#resources-1"><i class="fa fa-check"></i><b>5.5.1</b> Resources</a></li>
+<li class="chapter" data-level="5.5.2" data-path="5-manip.html"><a href="5-manip.html#script-of-r-code-1"><i class="fa fa-check"></i><b>5.5.2</b> Script of R code</a></li>
+<li class="chapter" data-level="5.5.3" data-path="5-manip.html"><a href="5-manip.html#whats-to-come-2"><i class="fa fa-check"></i><b>5.5.3</b> What’s to come?</a></li>
+</ul></li>
 </ul></li>
 <li class="part"><span><b>II Inference</b></span></li>
-<li class="chapter" data-level="6" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html"><i class="fa fa-check"></i><b>6</b> Simulating Randomness via <code id="sim">mosaic</code></a><ul>
-<li class="chapter" data-level="" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#needed-packages-2"><i class="fa fa-check"></i>Needed packages</a></li>
-<li class="chapter" data-level="6.1" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#random-sampling"><i class="fa fa-check"></i><b>6.1</b> Random sampling</a><ul>
-<li class="chapter" data-level="6.1.1" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#tasting-soup"><i class="fa fa-check"></i><b>6.1.1</b> Tasting soup</a></li>
-<li class="chapter" data-level="6.1.2" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#common-terms"><i class="fa fa-check"></i><b>6.1.2</b> Common terms</a></li>
+<li class="chapter" data-level="6" data-path="6-sim.html"><a href="6-sim.html"><i class="fa fa-check"></i><b>6</b> Simulating Randomness via mosaic</a><ul>
+<li class="chapter" data-level="" data-path="6-sim.html"><a href="6-sim.html#needed-packages-3"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="6.1" data-path="6-sim.html"><a href="6-sim.html#random-sampling"><i class="fa fa-check"></i><b>6.1</b> Random sampling</a><ul>
+<li class="chapter" data-level="6.1.1" data-path="6-sim.html"><a href="6-sim.html#tasting-soup"><i class="fa fa-check"></i><b>6.1.1</b> Tasting soup</a></li>
+<li class="chapter" data-level="6.1.2" data-path="6-sim.html"><a href="6-sim.html#common-terms"><i class="fa fa-check"></i><b>6.1.2</b> Common terms</a></li>
+</ul></li>
+<li class="chapter" data-level="6.2" data-path="6-sim.html"><a href="6-sim.html#visualizing-sampling"><i class="fa fa-check"></i><b>6.2</b> Visualizing sampling</a><ul>
+<li class="chapter" data-level="6.2.1" data-path="6-sim.html"><a href="6-sim.html#sampling-distribution"><i class="fa fa-check"></i><b>6.2.1</b> Sampling distribution</a></li>
+<li class="chapter" data-level="6.2.2" data-path="6-sim.html"><a href="6-sim.html#repeated-sampling-via-do"><i class="fa fa-check"></i><b>6.2.2</b> Repeated sampling via <code>do</code></a></li>
 </ul></li>
-<li class="chapter" data-level="6.2" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#visualizing-sampling"><i class="fa fa-check"></i><b>6.2</b> Visualizing sampling</a><ul>
-<li class="chapter" data-level="6.2.1" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#sampling-distribution"><i class="fa fa-check"></i><b>6.2.1</b> Sampling distribution</a></li>
-<li class="chapter" data-level="6.2.2" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#repeated-sampling-via-do"><i class="fa fa-check"></i><b>6.2.2</b> Repeated sampling via <code>do</code></a></li>
+<li class="chapter" data-level="6.3" data-path="6-sim.html"><a href="6-sim.html#simulation"><i class="fa fa-check"></i><b>6.3</b> Simulation</a></li>
+<li class="chapter" data-level="6.4" data-path="6-sim.html"><a href="6-sim.html#review-of-mosaic-simulation-functions"><i class="fa fa-check"></i><b>6.4</b> Review of <code>mosaic</code> simulation functions</a></li>
+<li class="chapter" data-level="6.5" data-path="6-sim.html"><a href="6-sim.html#conclusion-2"><i class="fa fa-check"></i><b>6.5</b> Conclusion</a><ul>
+<li class="chapter" data-level="6.5.1" data-path="6-sim.html"><a href="6-sim.html#script-of-r-code-2"><i class="fa fa-check"></i><b>6.5.1</b> Script of R code</a></li>
+<li class="chapter" data-level="6.5.2" data-path="6-sim.html"><a href="6-sim.html#whats-to-come-3"><i class="fa fa-check"></i><b>6.5.2</b> What’s to come?</a></li>
 </ul></li>
-<li class="chapter" data-level="6.3" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#simulation"><i class="fa fa-check"></i><b>6.3</b> Simulation</a></li>
-<li class="chapter" data-level="6.4" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#review-of-mosaic-simulation-functions"><i class="fa fa-check"></i><b>6.4</b> Review of <code>mosaic</code> simulation functions</a></li>
-<li class="chapter" data-level="6.5" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#script-of-r-code-2"><i class="fa fa-check"></i><b>6.5</b> Script of R code</a></li>
-<li class="chapter" data-level="6.6" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#whats-to-come-3"><i class="fa fa-check"></i><b>6.6</b> What’s to come?</a></li>
 </ul></li>
 <li class="chapter" data-level="7" data-path="7-hypo.html"><a href="7-hypo.html"><i class="fa fa-check"></i><b>7</b> Hypothesis Testing</a><ul>
-<li class="chapter" data-level="" data-path="7-hypo.html"><a href="7-hypo.html#needed-packages-3"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="" data-path="7-hypo.html"><a href="7-hypo.html#needed-packages-4"><i class="fa fa-check"></i>Needed packages</a></li>
 <li class="chapter" data-level="7.1" data-path="7-hypo.html"><a href="7-hypo.html#when-inference-is-not-needed"><i class="fa fa-check"></i><b>7.1</b> When Inference Is Not Needed</a></li>
 <li class="chapter" data-level="7.2" data-path="7-hypo.html"><a href="7-hypo.html#basics-of-hypothesis-testing"><i class="fa fa-check"></i><b>7.2</b> Basics of Hypothesis Testing</a></li>
 <li class="chapter" data-level="7.3" data-path="7-hypo.html"><a href="7-hypo.html#trial"><i class="fa fa-check"></i><b>7.3</b> Criminal trial analogy</a><ul>
@@ -249,44 +262,50 @@
 <li class="chapter" data-level="7.8.1" data-path="7-hypo.html"><a href="7-hypo.html#example-t-test-for-two-independent-samples"><i class="fa fa-check"></i><b>7.8.1</b> EXAMPLE: <span class="math inline">\(t\)</span>-test for two independent samples</a></li>
 <li class="chapter" data-level="7.8.2" data-path="7-hypo.html"><a href="7-hypo.html#conditions-for-t-test"><i class="fa fa-check"></i><b>7.8.2</b> Conditions for t-test</a></li>
 </ul></li>
-<li class="chapter" data-level="7.9" data-path="7-hypo.html"><a href="7-hypo.html#whats-to-come-4"><i class="fa fa-check"></i><b>7.9</b> What’s to come?</a></li>
+<li class="chapter" data-level="7.9" data-path="7-hypo.html"><a href="7-hypo.html#conclusion-3"><i class="fa fa-check"></i><b>7.9</b> Conclusion</a><ul>
+<li class="chapter" data-level="7.9.1" data-path="7-hypo.html"><a href="7-hypo.html#script-of-r-code-3"><i class="fa fa-check"></i><b>7.9.1</b> Script of R code</a></li>
+<li class="chapter" data-level="7.9.2" data-path="7-hypo.html"><a href="7-hypo.html#whats-to-come-4"><i class="fa fa-check"></i><b>7.9.2</b> What’s to come?</a></li>
+</ul></li>
 </ul></li>
 <li class="chapter" data-level="8" data-path="8-ci.html"><a href="8-ci.html"><i class="fa fa-check"></i><b>8</b> Confidence Intervals</a><ul>
-<li class="chapter" data-level="" data-path="8-ci.html"><a href="8-ci.html#needed-packages-4"><i class="fa fa-check"></i>Needed packages</a></li>
-<li class="chapter" data-level="8.1" data-path="8-ci.html"><a href="8-ci.html#bootstrapping"><i class="fa fa-check"></i><b>8.1</b> Bootstrapping</a></li>
-<li class="chapter" data-level="8.2" data-path="8-ci.html"><a href="8-ci.html#mean_rating"><i class="fa fa-check"></i><b>8.2</b> mean_rating</a><ul>
-<li class="chapter" data-level="8.2.1" data-path="8-ci.html"><a href="8-ci.html#review-of-bootstrapping"><i class="fa fa-check"></i><b>8.2.1</b> Review of Bootstrapping</a></li>
-</ul></li>
-<li class="chapter" data-level="8.3" data-path="8-ci.html"><a href="8-ci.html#relation-to-hypothesis-testing"><i class="fa fa-check"></i><b>8.3</b> Relation to hypothesis testing</a></li>
-<li class="chapter" data-level="8.4" data-path="8-ci.html"><a href="8-ci.html#effect-size"><i class="fa fa-check"></i><b>8.4</b> Effect size</a></li>
-<li class="chapter" data-level="8.5" data-path="8-ci.html"><a href="8-ci.html#script-of-r-code-3"><i class="fa fa-check"></i><b>8.5</b> Script of R code</a></li>
-<li class="chapter" data-level="8.6" data-path="8-ci.html"><a href="8-ci.html#whats-to-come-5"><i class="fa fa-check"></i><b>8.6</b> What’s to come?</a></li>
-</ul></li>
-<li class="chapter" data-level="9" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html"><i class="fa fa-check"></i><b>9</b> Regression via <code id="regress">broom</code></a><ul>
-<li class="chapter" data-level="" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#needed-packages-5"><i class="fa fa-check"></i>Needed packages</a></li>
-<li class="chapter" data-level="9.1" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#example-alaskan-airlines-delays"><i class="fa fa-check"></i><b>9.1</b> EXAMPLE: Alaskan Airlines delays</a></li>
-<li class="chapter" data-level="9.2" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#correlation"><i class="fa fa-check"></i><b>9.2</b> Correlation</a><ul>
-<li class="chapter" data-level="9.2.1" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#correlation-does-not-imply-causation"><i class="fa fa-check"></i><b>9.2.1</b> Correlation does not imply causation</a></li>
-</ul></li>
-<li class="chapter" data-level="9.3" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#linear-regression"><i class="fa fa-check"></i><b>9.3</b> Linear regression</a><ul>
-<li class="chapter" data-level="9.3.1" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#understanding-linear-regression-basics"><i class="fa fa-check"></i><b>9.3.1</b> Understanding linear regression basics</a></li>
-<li class="chapter" data-level="9.3.2" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#the-equation-of-the-line"><i class="fa fa-check"></i><b>9.3.2</b> The equation of the line</a></li>
-<li class="chapter" data-level="9.3.3" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#interpretting-the-slope"><i class="fa fa-check"></i><b>9.3.3</b> Interpretting the slope</a></li>
-<li class="chapter" data-level="9.3.4" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#predicting-values"><i class="fa fa-check"></i><b>9.3.4</b> Predicting values</a></li>
-</ul></li>
-<li class="chapter" data-level="9.4" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#inference-for-regression"><i class="fa fa-check"></i><b>9.4</b> Inference for regression</a><ul>
-<li class="chapter" data-level="9.4.1" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#data-2"><i class="fa fa-check"></i><b>9.4.1</b> Data</a></li>
-<li class="chapter" data-level="9.4.2" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#test-statistic-delta-2"><i class="fa fa-check"></i><b>9.4.2</b> Test Statistic <span class="math inline">\(\delta\)</span></a></li>
-<li class="chapter" data-level="9.4.3" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#observed-effect-delta-2"><i class="fa fa-check"></i><b>9.4.3</b> Observed effect <span class="math inline">\(\delta^*\)</span></a></li>
-<li class="chapter" data-level="9.4.4" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#model-of-h_0-2"><i class="fa fa-check"></i><b>9.4.4</b> Model of <span class="math inline">\(H_0\)</span></a></li>
-<li class="chapter" data-level="9.4.5" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#simulated-data-2"><i class="fa fa-check"></i><b>9.4.5</b> Simulated Data</a></li>
-<li class="chapter" data-level="9.4.6" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#distribution-of-delta-under-h_0-2"><i class="fa fa-check"></i><b>9.4.6</b> Distribution of <span class="math inline">\(\delta\)</span> under <span class="math inline">\(H_0\)</span></a></li>
-<li class="chapter" data-level="9.4.7" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#the-p-value-2"><i class="fa fa-check"></i><b>9.4.7</b> The p-value</a></li>
-</ul></li>
-<li class="chapter" data-level="9.5" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#resid"><i class="fa fa-check"></i><b>9.5</b> Residual analysis</a></li>
-<li class="chapter" data-level="9.6" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#conditions-for-regression"><i class="fa fa-check"></i><b>9.6</b> Conditions for regression</a></li>
-<li class="chapter" data-level="9.7" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#script-of-r-code-4"><i class="fa fa-check"></i><b>9.7</b> Script of R code</a></li>
-<li class="chapter" data-level="9.8" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#whats-to-come-6"><i class="fa fa-check"></i><b>9.8</b> What’s to come?</a></li>
+<li class="chapter" data-level="" data-path="8-ci.html"><a href="8-ci.html#needed-packages-5"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="8.1" data-path="8-ci.html"><a href="8-ci.html#bootstrapping"><i class="fa fa-check"></i><b>8.1</b> Bootstrapping</a><ul>
+<li class="chapter" data-level="8.1.1" data-path="8-ci.html"><a href="8-ci.html#review-of-bootstrapping"><i class="fa fa-check"></i><b>8.1.1</b> Review of Bootstrapping</a></li>
+</ul></li>
+<li class="chapter" data-level="8.2" data-path="8-ci.html"><a href="8-ci.html#relation-to-hypothesis-testing"><i class="fa fa-check"></i><b>8.2</b> Relation to hypothesis testing</a></li>
+<li class="chapter" data-level="8.3" data-path="8-ci.html"><a href="8-ci.html#effect-size"><i class="fa fa-check"></i><b>8.3</b> Effect size</a></li>
+<li class="chapter" data-level="8.4" data-path="8-ci.html"><a href="8-ci.html#conclusion-4"><i class="fa fa-check"></i><b>8.4</b> Conclusion</a><ul>
+<li class="chapter" data-level="8.4.1" data-path="8-ci.html"><a href="8-ci.html#script-of-r-code-4"><i class="fa fa-check"></i><b>8.4.1</b> Script of R code</a></li>
+<li class="chapter" data-level="8.4.2" data-path="8-ci.html"><a href="8-ci.html#whats-to-come-5"><i class="fa fa-check"></i><b>8.4.2</b> What’s to come?</a></li>
+</ul></li>
+</ul></li>
+<li class="chapter" data-level="9" data-path="9-regress.html"><a href="9-regress.html"><i class="fa fa-check"></i><b>9</b> Regression via broom</a><ul>
+<li class="chapter" data-level="" data-path="9-regress.html"><a href="9-regress.html#needed-packages-6"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="9.1" data-path="9-regress.html"><a href="9-regress.html#example-alaskan-airlines-delays"><i class="fa fa-check"></i><b>9.1</b> EXAMPLE: Alaskan Airlines delays</a></li>
+<li class="chapter" data-level="9.2" data-path="9-regress.html"><a href="9-regress.html#correlation"><i class="fa fa-check"></i><b>9.2</b> Correlation</a><ul>
+<li class="chapter" data-level="9.2.1" data-path="9-regress.html"><a href="9-regress.html#correlation-does-not-imply-causation"><i class="fa fa-check"></i><b>9.2.1</b> Correlation does not imply causation</a></li>
+</ul></li>
+<li class="chapter" data-level="9.3" data-path="9-regress.html"><a href="9-regress.html#linear-regression"><i class="fa fa-check"></i><b>9.3</b> Linear regression</a><ul>
+<li class="chapter" data-level="9.3.1" data-path="9-regress.html"><a href="9-regress.html#understanding-linear-regression-basics"><i class="fa fa-check"></i><b>9.3.1</b> Understanding linear regression basics</a></li>
+<li class="chapter" data-level="9.3.2" data-path="9-regress.html"><a href="9-regress.html#the-equation-of-the-line"><i class="fa fa-check"></i><b>9.3.2</b> The equation of the line</a></li>
+<li class="chapter" data-level="9.3.3" data-path="9-regress.html"><a href="9-regress.html#interpreting-the-slope"><i class="fa fa-check"></i><b>9.3.3</b> Interpreting the slope</a></li>
+<li class="chapter" data-level="9.3.4" data-path="9-regress.html"><a href="9-regress.html#predicting-values"><i class="fa fa-check"></i><b>9.3.4</b> Predicting values</a></li>
+</ul></li>
+<li class="chapter" data-level="9.4" data-path="9-regress.html"><a href="9-regress.html#inference-for-regression"><i class="fa fa-check"></i><b>9.4</b> Inference for regression</a><ul>
+<li class="chapter" data-level="9.4.1" data-path="9-regress.html"><a href="9-regress.html#data-2"><i class="fa fa-check"></i><b>9.4.1</b> Data</a></li>
+<li class="chapter" data-level="9.4.2" data-path="9-regress.html"><a href="9-regress.html#test-statistic-delta-2"><i class="fa fa-check"></i><b>9.4.2</b> Test Statistic <span class="math inline">\(\delta\)</span></a></li>
+<li class="chapter" data-level="9.4.3" data-path="9-regress.html"><a href="9-regress.html#observed-effect-delta-2"><i class="fa fa-check"></i><b>9.4.3</b> Observed effect <span class="math inline">\(\delta^*\)</span></a></li>
+<li class="chapter" data-level="9.4.4" data-path="9-regress.html"><a href="9-regress.html#model-of-h_0-2"><i class="fa fa-check"></i><b>9.4.4</b> Model of <span class="math inline">\(H_0\)</span></a></li>
+<li class="chapter" data-level="9.4.5" data-path="9-regress.html"><a href="9-regress.html#simulated-data-2"><i class="fa fa-check"></i><b>9.4.5</b> Simulated Data</a></li>
+<li class="chapter" data-level="9.4.6" data-path="9-regress.html"><a href="9-regress.html#distribution-of-delta-under-h_0-2"><i class="fa fa-check"></i><b>9.4.6</b> Distribution of <span class="math inline">\(\delta\)</span> under <span class="math inline">\(H_0\)</span></a></li>
+<li class="chapter" data-level="9.4.7" data-path="9-regress.html"><a href="9-regress.html#the-p-value-2"><i class="fa fa-check"></i><b>9.4.7</b> The p-value</a></li>
+</ul></li>
+<li class="chapter" data-level="9.5" data-path="9-regress.html"><a href="9-regress.html#resid"><i class="fa fa-check"></i><b>9.5</b> Residual analysis</a></li>
+<li class="chapter" data-level="9.6" data-path="9-regress.html"><a href="9-regress.html#conditions-for-regression"><i class="fa fa-check"></i><b>9.6</b> Conditions for regression</a></li>
+<li class="chapter" data-level="9.7" data-path="9-regress.html"><a href="9-regress.html#conclusion-5"><i class="fa fa-check"></i><b>9.7</b> Conclusion</a><ul>
+<li class="chapter" data-level="9.7.1" data-path="9-regress.html"><a href="9-regress.html#script-of-r-code-5"><i class="fa fa-check"></i><b>9.7.1</b> Script of R code</a></li>
+<li class="chapter" data-level="9.7.2" data-path="9-regress.html"><a href="9-regress.html#whats-to-come-6"><i class="fa fa-check"></i><b>9.7.2</b> What’s to come?</a></li>
+</ul></li>
 </ul></li>
 <li class="part"><span><b>III Conclusion</b></span></li>
 <li class="chapter" data-level="10" data-path="10-effective-data-storytelling.html"><a href="10-effective-data-storytelling.html"><i class="fa fa-check"></i><b>10</b> Effective Data Storytelling</a><ul>
@@ -304,7 +323,8 @@
 </ul></li>
 </ul></li>
 <li class="chapter" data-level="B" data-path="B-appendixB.html"><a href="B-appendixB.html"><i class="fa fa-check"></i><b>B</b> Inference Examples</a><ul>
-<li class="chapter" data-level="B.1" data-path="B-appendixB.html"><a href="B-appendixB.html#needed-packages-6"><i class="fa fa-check"></i><b>B.1</b> Needed packages</a></li>
+<li class="chapter" data-level="" data-path="B-appendixB.html"><a href="B-appendixB.html#needed-packages-7"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="B.1" data-path="B-appendixB.html"><a href="B-appendixB.html#inference-mind-map"><i class="fa fa-check"></i><b>B.1</b> Inference Mind Map</a></li>
 <li class="chapter" data-level="B.2" data-path="B-appendixB.html"><a href="B-appendixB.html#one-mean"><i class="fa fa-check"></i><b>B.2</b> One Mean</a><ul>
 <li class="chapter" data-level="B.2.1" data-path="B-appendixB.html"><a href="B-appendixB.html#problem-statement"><i class="fa fa-check"></i><b>B.2.1</b> Problem Statement</a></li>
 <li class="chapter" data-level="B.2.2" data-path="B-appendixB.html"><a href="B-appendixB.html#competing-hypotheses"><i class="fa fa-check"></i><b>B.2.2</b> Competing Hypotheses</a></li>
@@ -352,6 +372,7 @@
 </ul></li>
 </ul></li>
 <li class="chapter" data-level="C" data-path="C-appendixC.html"><a href="C-appendixC.html"><i class="fa fa-check"></i><b>C</b> Reach for the Starts</a><ul>
+<li class="chapter" data-level="" data-path="C-appendixC.html"><a href="C-appendixC.html#needed-packages-8"><i class="fa fa-check"></i>Needed packages</a></li>
 <li class="chapter" data-level="C.1" data-path="C-appendixC.html"><a href="C-appendixC.html#sorted-barplots"><i class="fa fa-check"></i><b>C.1</b> Sorted barplots</a></li>
 <li class="chapter" data-level="C.2" data-path="C-appendixC.html"><a href="C-appendixC.html#interactive-graphics"><i class="fa fa-check"></i><b>C.2</b> Interactive graphics</a><ul>
 <li class="chapter" data-level="C.2.1" data-path="C-appendixC.html"><a href="C-appendixC.html#interactive-line-graphs"><i class="fa fa-check"></i><b>C.2.1</b> Interactive line-graphs</a></li>
@@ -377,8 +398,15 @@ <h1>
             <section class="normal" id="section-">
 <div id="tidy" class="section level1">
 <h1><span class="header-section-number">3</span> Tidy Data</h1>
-<p>In this chapter, we’ll discuss the importance of tidy data. You may think that this means just having your data in a spreadsheet, but you’ll see that it is actually more specific than that. Data actually comes to us in a variety of formats from pictures to text and to just numbers. We’ll focus on datasets that can be stored in a spreadsheet throughout this book as that is the most common way data is collected in the sciences.</p>
-<p>Having tidy data will allow us to more easily create data visualizations as we will see in Chapter <a href="4-data-visualization-via-ggplot2.html#viz"><strong>??</strong></a>. It will also help us with manipulating data in Chapter <a href="5-data-manipulation-via-dplyr.html#manip"><strong>??</strong></a> and in all subsequent chapters when we discuss statistical inference. You may not necessarily understand the importance for <strong>tidy data</strong> but it will become more and more apparent as we proceed through the book.</p>
+<p>In this chapter, we’ll discuss the importance of tidy data. You may think that this means just having your data in a spreadsheet, but you’ll see that it is actually more specific than that. Data actually comes to us in a variety of formats from pictures to text to just numbers. We’ll focus on datasets that can be stored in a spreadsheet throughout this book as that is the most common way data is collected in the sciences.</p>
+<p>Having tidy data will allow us to more easily create data visualizations as we will see in Chapter <a href="4-viz.html#viz">4</a>. It will also help us with manipulating data in Chapter <a href="5-manip.html#manip">5</a> and in all subsequent chapters when we discuss statistical inference. You may not necessarily understand the importance of <strong>tidy data</strong> immediately, but it will become more and more apparent as we proceed through the book.</p>
+<div id="needed-packages" class="section level3 unnumbered">
+<h3>Needed packages</h3>
+<p>At the beginning of this and all subsequent chapters, we’ll always have a list of packages you should have installed and loaded. In particular, we load the <code>nycflights13</code> package, which we’ll discuss shortly, and the <code>dplyr</code> package for data manipulation, the subject of Chapter <a href="5-manip.html#manip">5</a>.</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">library</span>(nycflights13)
+<span class="kw">library</span>(dplyr)</code></pre></div>
+<!--Subsection on Tidy Data -->
+</div>
 <div id="what-is-tidy-data" class="section level2">
 <h2><span class="header-section-number">3.1</span> What is tidy data?</h2>
 <p>You have surely heard the word “tidy” in your life:</p>
@@ -388,7 +416,7 @@ <h2><span class="header-section-number">3.1</span> What is tidy data?</h2>
 <li>Marie Kondo’s best-selling book <a href="https://www.amazon.com/Life-Changing-Magic-Tidying-Decluttering-Organizing/dp/1607747308/ref=sr_1_1?ie=UTF8&amp;qid=1469400636&amp;sr=8-1&amp;keywords=tidying+up"><em>The Life-Changing Magic of Tidying Up: The Japanese Art of Decluttering and Organizing</em></a></li>
 <li>“I am not by any stretch of the imagination a tidy person, and the piles of unread books on the coffee table and by my bed have a plaintive, pleading quality to me - ‘Read me, please!’” - Linda Grant</li>
 </ul>
-<p>So what does it mean for your data to be <strong>tidy</strong>? Put simply: it means that your data is organized. But it’s more than just that. It means that your data follows the same standard format making it easy for others to find elements of your data, to manipulate and transform your data, and for our purposes continuing with the common theme: it makes it easier to visualize your data and the relationships between different variables in your data.</p>
+<p>So what does it mean for your data to be <strong>tidy</strong>? Put simply, it means that your data is organized. But it’s more than just that. It means that your data follows the same standard format making it easy for others to find elements of your data, to manipulate and transform your data, and, for our purposes, continuing with the common theme: it makes it easier to visualize your data and the relationships between different variables in your data.</p>
 <p>We will follow Hadley Wickham’s definition of <strong>tidy data</strong> here <span class="citation">(Wickham <a href="#ref-tidy">2014</a>)</span>:</p>
 <blockquote>
 <p>A dataset is a collection of values, usually either numbers (if quantitative) or strings (if qualitative). Values are organised in two ways. Every value belongs to a variable and an observation. A variable contains all values that measure the same underlying attribute (like height, temperature, duration) across units. An observation contains all values measured on the same unit (like a person, or a day, or a race) across attributes.</p>
@@ -409,7 +437,7 @@ <h2><span class="header-section-number">3.1</span> What is tidy data?</h2>
 Figure 3.1: Tidy data graphic from <a href="http://r4ds.had.co.nz/tidy-data.html" class="uri">http://r4ds.had.co.nz/tidy-data.html</a>
 </p>
 </div>
-<p>Reading over this definition, you can begin to think about datasets that won’t follow this nice format.</p>
+<p>Reading over this definition, you can begin to think about datasets that won’t follow this nice format. This format of data is also known as “long” format.</p>
 <hr />
 <div class="learncheck">
 <p>
@@ -422,27 +450,74 @@ <h2><span class="header-section-number">3.1</span> What is tidy data?</h2>
 </li>
 <li>How could the dataset be tweaked to make it <strong>tidy</strong>?</li>
 </ul>
+<p><strong>(LC3.2)</strong> Say the following table contains stock prices. How would you make it tidy?</p>
+<table>
+<thead>
+<tr class="header">
+<th align="left">time</th>
+<th align="right">x</th>
+<th align="right">y</th>
+<th align="right">z</th>
+</tr>
+</thead>
+<tbody>
+<tr class="odd">
+<td align="left">2009-01-01</td>
+<td align="right">-1.346</td>
+<td align="right">-2.241</td>
+<td align="right">4.412</td>
+</tr>
+<tr class="even">
+<td align="left">2009-01-02</td>
+<td align="right">-0.777</td>
+<td align="right">-2.111</td>
+<td align="right">1.202</td>
+</tr>
+<tr class="odd">
+<td align="left">2009-01-03</td>
+<td align="right">0.304</td>
+<td align="right">-7.305</td>
+<td align="right">-4.859</td>
+</tr>
+<tr class="even">
+<td align="left">2009-01-04</td>
+<td align="right">2.510</td>
+<td align="right">0.213</td>
+<td align="right">0.720</td>
+</tr>
+<tr class="odd">
+<td align="left">2009-01-05</td>
+<td align="right">-0.484</td>
+<td align="right">-0.008</td>
+<td align="right">7.705</td>
+</tr>
+</tbody>
+</table>
 <hr />
+<!--Subsection on nycflights13 -->
 </div>
-<div id="the-nycflights13-datasets" class="section level2">
-<h2><span class="header-section-number">3.2</span> The <code>nycflights13</code> datasets</h2>
+<div id="datasets-in-the-nycflights13-package" class="section level2">
+<h2><span class="header-section-number">3.2</span> Datasets in the <code>nycflights13</code> package</h2>
 <p>We likely have all flown on airplanes or know someone that has. Air travel has become an ever-present aspect of our daily lives. If you live in or are visiting a relatively large city and you walk around that city’s airport, you see gates showing flight information from many different airlines. And you will frequently see that some flights are delayed because of a variety of conditions. Are there ways that we can avoid having to deal with these flight delays?</p>
-<p>We’d all like to arrive at our destinations on time whenever possible. (Unless you secretly love hanging out at airports. If you are one of these people, pretend for the moment that you are very much anticipating being at your final destination.) Hadley Wickham (herein just referred to as “Hadley”) created multiple datasets containing information about departing flights from the New York City area in 2013 <span class="citation">(Wickham <a href="#ref-R-nycflights13">2016</a>)</span>. We will begin by loading in one of these datasets, the <code>flights</code> dataset, and getting an idea of its structure:</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">library</span>(nycflights13)
-<span class="kw">data</span>(flights)</code></pre></div>
-<p>The <code>library</code> function here loads the R package <code>nycflights13</code> into the current R environment in which you are working. The <code>data(flights)</code> loads in the <code>flights</code> dataset that is stored in the <code>nycflights13</code> package. Note that you’ll get an error if you try to load this package in and it hasn’t been downloaded and installed. You can ensure it is installed by running the code below:</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">if(!<span class="kw">require</span>(nycflights13))
-  <span class="kw">install.packages</span>(<span class="st">&quot;nycflights13&quot;</span>, <span class="dt">repos =</span> <span class="st">&quot;http://cran.rstudio.org&quot;</span>)</code></pre></div>
-<p>This code checks to see if <code>nycflights13</code> is installed and, if not, then goes to the specified repository of “<a href="http://cran.rstudio.org" class="uri">http://cran.rstudio.org</a>” and downloads the package from there and installs it. If it is already installed you can see it listed in the <strong>Packages</strong> tab in the bottom right portion of RStudio and the code will not install the package again since this is redundant and you won’t need to do it over and over again.</p>
-<p>This dataset and most others presented in this book will be in the <code>data.frame</code> format in R. Data frames are ways to look at collections of variables that are tightly coupled together. Frequently, the best way to get a feel for a data frame is to use the <code>View</code> function in RStudio. This command will be given throughout the book as a reminder, but the actual output will be hidden.</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">View</span>(flights)</code></pre></div>
+<p>We’d all like to arrive at our destinations on time whenever possible. (Unless you secretly love hanging out at airports. If you are one of these people, pretend for the moment that you are very much anticipating being at your final destination.) Throughout this book, we’re going to analyze data related to flights contained in the <code>nycflights13</code> package we loaded earlier <span class="citation">(Wickham <a href="#ref-R-nycflights13">2016</a>)</span>. Specifically, this package contains information about all flights that departed from NYC (i.e., EWR, JFK, and LGA) in 2013 in 5 datasets:</p>
+<ul>
+<li><code>flights</code>: information on all 336,776 flights</li>
+<li><code>weather</code>: hourly meteorological data for each airport</li>
+<li><code>planes</code>: construction information about each plane</li>
+<li><code>airports</code>: airport names and locations</li>
+<li><code>airlines</code>: translation between two letter carrier codes and names</li>
+</ul>
+<p>We will begin by loading in the <code>flights</code> dataset and getting an idea of its structure. Run the following in your console:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">data</span>(flights)</code></pre></div>
+<p>This line of code loads in the <code>flights</code> dataset that is stored in the <code>nycflights13</code> package. This dataset and most others presented in this book will be in the “data frame” format in R. Data frames are essentially spreadsheets and allow us to look at collections of variables that are tightly coupled together.</p>
+<p>The best way to get a feel for a data frame is to use the <code>View</code> function in RStudio. This command will be given throughout the book as a reminder, but the actual output will be hidden. Run <code>View(flights)</code> in R and look over this data frame. You should slowly get into the habit of always <code>View</code>ing any data frames that come your way.</p>
 <hr />
 <div class="learncheck">
 <p>
 <strong><em>Learning check</em></strong>
 </p>
 </div>
-<p><strong>(LC3.2)</strong> What does any <em>ONE</em> row in this <code>flights</code> dataset refer to?</p>
+<p><strong>(LC3.3)</strong> What does any <em>ONE</em> row in this <code>flights</code> dataset refer to?</p>
 <ul>
 <li>A. Data on an airline</li>
 <li>B. Data on a flight</li>
@@ -451,55 +526,57 @@ <h2><span class="header-section-number">3.2</span> The <code>nycflights13</code>
 </ul>
 <hr />
 <p>By running <code>View(flights)</code>, we see the different <strong>variables</strong> listed in the columns and we see that there are different types of variables. Some of the variables like <code>distance</code>, <code>day</code>, and <code>arr_delay</code> are what we will call <strong>quantitative</strong> variables. These variables vary in a numerical way. Other variables here are <strong>categorical</strong>.</p>
-<p>Note that if you look in the leftmost column of the <code>View(flights)</code> output, you will see a column of numbers. These are the row numbers of the dataset. If you glance across a row with the same number, say row 5, you can get an idea of what each row corresponds to. In other words, this will allow you to identify what object is being referred to in a given row. This is often called the <strong>observational unit</strong>. The <strong>observational unit</strong> in this example is an individual flight departing New York City in 2013.</p>
+<p>Note that if you look in the leftmost column of the <code>View(flights)</code> output, you will see a column of numbers. These are the row numbers of the dataset. If you glance across a row with the same number, say row 5, you can get an idea of what each row corresponds to. In other words, this will allow you to identify what object is being referred to in a given row. This is often called the <strong>observational unit</strong>. The <strong>observational unit</strong> in this example is an individual flight departing New York City in 2013. You can identify the observational unit by determining what the <strong>thing</strong> is that is being measured in each of the variables.</p>
 <p><strong>Note</strong>: Frequently the first thing you should do when given a dataset is to</p>
 <ul>
 <li>identify the observation unit,</li>
 <li>specify the variables, and</li>
 <li>give the types of variables you are presented with.</li>
 </ul>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">str</span>(flights)</code></pre></div>
-<pre><code>## Classes &#39;tbl_df&#39;, &#39;tbl&#39; and &#39;data.frame&#39;:    336776 obs. of  19 variables:
-##  $ year          : int  2013 2013 2013 2013 2013 2013 2013 2013 2013 2013 ...
-##  $ month         : int  1 1 1 1 1 1 1 1 1 1 ...
-##  $ day           : int  1 1 1 1 1 1 1 1 1 1 ...
-##  $ dep_time      : int  517 533 542 544 554 554 555 557 557 558 ...
-##  $ sched_dep_time: int  515 529 540 545 600 558 600 600 600 600 ...
-##  $ dep_delay     : num  2 4 2 -1 -6 -4 -5 -3 -3 -2 ...
-##  $ arr_time      : int  830 850 923 1004 812 740 913 709 838 753 ...
-##  $ sched_arr_time: int  819 830 850 1022 837 728 854 723 846 745 ...
-##  $ arr_delay     : num  11 20 33 -18 -25 12 19 -14 -8 8 ...
-##  $ carrier       : chr  &quot;UA&quot; &quot;UA&quot; &quot;AA&quot; &quot;B6&quot; ...
-##  $ flight        : int  1545 1714 1141 725 461 1696 507 5708 79 301 ...
-##  $ tailnum       : chr  &quot;N14228&quot; &quot;N24211&quot; &quot;N619AA&quot; &quot;N804JB&quot; ...
-##  $ origin        : chr  &quot;EWR&quot; &quot;LGA&quot; &quot;JFK&quot; &quot;JFK&quot; ...
-##  $ dest          : chr  &quot;IAH&quot; &quot;IAH&quot; &quot;MIA&quot; &quot;BQN&quot; ...
-##  $ air_time      : num  227 227 160 183 116 150 158 53 140 138 ...
-##  $ distance      : num  1400 1416 1089 1576 762 ...
-##  $ hour          : num  5 5 5 5 6 5 6 6 6 6 ...
-##  $ minute        : num  15 29 40 45 0 58 0 0 0 0 ...
-##  $ time_hour     : POSIXct, format: &quot;2013-01-01 05:00:00&quot; ...</code></pre>
+<p>The <code>glimpse()</code> command in the <code>dplyr</code> package provides us with much of the above information and more:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">glimpse</span>(flights)</code></pre></div>
+<pre><code>## Observations: 336,776
+## Variables: 19
+## $ year           &lt;int&gt; 2013, 2013, 2013, 2013, 2013, 2013, 2013, 20...
+## $ month          &lt;int&gt; 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
+## $ day            &lt;int&gt; 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
+## $ dep_time       &lt;int&gt; 517, 533, 542, 544, 554, 554, 555, 557, 557,...
+## $ sched_dep_time &lt;int&gt; 515, 529, 540, 545, 600, 558, 600, 600, 600,...
+## $ dep_delay      &lt;dbl&gt; 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2,...
+## $ arr_time       &lt;int&gt; 830, 850, 923, 1004, 812, 740, 913, 709, 838...
+## $ sched_arr_time &lt;int&gt; 819, 830, 850, 1022, 837, 728, 854, 723, 846...
+## $ arr_delay      &lt;dbl&gt; 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2...
+## $ carrier        &lt;chr&gt; &quot;UA&quot;, &quot;UA&quot;, &quot;AA&quot;, &quot;B6&quot;, &quot;DL&quot;, &quot;UA&quot;, &quot;B6&quot;, &quot;E...
+## $ flight         &lt;int&gt; 1545, 1714, 1141, 725, 461, 1696, 507, 5708,...
+## $ tailnum        &lt;chr&gt; &quot;N14228&quot;, &quot;N24211&quot;, &quot;N619AA&quot;, &quot;N804JB&quot;, &quot;N66...
+## $ origin         &lt;chr&gt; &quot;EWR&quot;, &quot;LGA&quot;, &quot;JFK&quot;, &quot;JFK&quot;, &quot;LGA&quot;, &quot;EWR&quot;, &quot;E...
+## $ dest           &lt;chr&gt; &quot;IAH&quot;, &quot;IAH&quot;, &quot;MIA&quot;, &quot;BQN&quot;, &quot;ATL&quot;, &quot;ORD&quot;, &quot;F...
+## $ air_time       &lt;dbl&gt; 227, 227, 160, 183, 116, 150, 158, 53, 140, ...
+## $ distance       &lt;dbl&gt; 1400, 1416, 1089, 1576, 762, 719, 1065, 229,...
+## $ hour           &lt;dbl&gt; 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6,...
+## $ minute         &lt;dbl&gt; 15, 29, 40, 45, 0, 58, 0, 0, 0, 0, 0, 0, 0, ...
+## $ time_hour      &lt;dttm&gt; 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2...</code></pre>
 <hr />
 <div class="learncheck">
 <p>
 <strong><em>Learning check</em></strong>
 </p>
 </div>
-<p><strong>(LC3.3)</strong> What are some examples in this dataset of <strong>categorical</strong> variables? What makes them different than <strong>quantitative</strong> variables?</p>
-<p><strong>(LC3.4)</strong> What does <code>int</code>, <code>num</code>, and <code>chr</code> mean in the output above?</p>
-<p><strong>(LC3.5)</strong> How many different columns are in this dataset?</p>
-<p><strong>(LC3.6)</strong> How many different rows are in this dataset?</p>
+<p><strong>(LC3.4)</strong> What are some examples in this dataset of <strong>categorical</strong> variables? What makes them different than <strong>quantitative</strong> variables?</p>
+<p><strong>(LC3.5)</strong> What do <code>int</code>, <code>dbl</code>, and <code>chr</code> mean in the output above?</p>
+<p><strong>(LC3.6)</strong> How many different columns are in this dataset?</p>
+<p><strong>(LC3.7)</strong> How many different rows are in this dataset?</p>
 <hr />
-<p>Another way to view the properties of a dataset is to use the <code>str</code> function (“str” is short for “structure”). The <code>str</code> function is expecting an object for its argument. In this case, the object is a data frame named <code>flights</code>. You can use the <code>str</code> function on other objects and data frames using the syntax <code>str(object)</code> where <code>object</code> is the name of an object in R. This will give you the first few entries of each variable in a row after the variable. In addition, the type of the variable is given immediately after the <code>:</code> following each variable’s name. Here, <code>int</code> and <code>num</code> refer to quantitative variables. In contrast, <code>chr</code> refers to categorical variables. One more type of variable is given here with the <code>time_hour</code> variable: <strong>POSIXct</strong>. As you may suspect, this variable corresponds to a specific date and time of day.</p>
+<p>We see that <code>glimpse</code> will give you the first few entries of each variable in a row after the variable. In addition, the type of the variable is given immediately after each variable’s name inside <code>&lt; &gt;</code>. Here, <code>int</code> and <code>dbl</code> refer to quantitative variables. In contrast, <code>chr</code> refers to categorical variables. One more type of variable is given here with the <code>time_hour</code> variable: <strong>dttm</strong>. As you may suspect, this variable corresponds to a specific date and time of day.</p>
 <p>Another nice feature of R is the help system. You can get help in R by simply entering a question mark before the name of a function or an object and you will be presented with a page showing the documentation. Note that this output help file is omitted here but can be accessed <a href="https://cran.r-project.org/web/packages/nycflights13/nycflights13.pdf">here</a> on page 3 of the PDF document.</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">?str
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">?glimpse
 ?flights</code></pre></div>
 <p>Another aspect of tidy data is a description of what each variable in the dataset represents. This helps others to understand what your variable names mean and what they correspond to. If we look at the output of <code>?flights</code>, we can see that a description of each variable by name is given.</p>
-<p>An important feature to <strong>ALWAYS</strong> include with your data is the appropriate units of measurement. We’ll see this further when we work with the <code>dep_delay</code> variable in Chapter <a href="4-data-visualization-via-ggplot2.html#viz"><strong>??</strong></a>. (It’s in minutes, but you’d get some really strange interpretations if you thought it was in hours or seconds. UNITS MATTER!)</p>
+<p>An important feature to <strong>ALWAYS</strong> include with your data is the appropriate units of measurement. We’ll see this further when we work with the <code>dep_delay</code> variable in Chapter <a href="4-viz.html#viz">4</a>. (It’s in minutes, but you’d get some really strange interpretations if you thought it was in hours or seconds. UNITS MATTER!)</p>
 </div>
 <div id="how-is-flights-tidy" class="section level2">
 <h2><span class="header-section-number">3.3</span> How is <code>flights</code> tidy?</h2>
-<p>We see that <code>flights</code> has a rectangular shape with each row corresponding to a different flight and each column corresponding to a characteristic of that flight. This matches exactly with how Hadley defined tidy data:</p>
+<p>We see that <code>flights</code> has a rectangular shape with each row corresponding to a different flight and each column corresponding to a characteristic of that flight. This matches exactly with how Hadley Wickham defined tidy data:</p>
 <ol style="list-style-type: decimal">
 <li>Each variable forms a column.</li>
 <li>Each observation forms a row.</li>
@@ -518,7 +595,7 @@ <h2><span class="header-section-number">3.3</span> How is <code>flights</code> t
 <li><code>airports</code>: airport names and locations</li>
 <li><code>airlines</code>: translation between two letter carrier codes and names</li>
 </ul>
-<p>You may have been asking yourself what <code>carrier</code> refers to in the <code>str(flights)</code> output above. The <code>airlines</code> dataset provides a description of this with each airline being the observational unit:</p>
+<p>You may have been asking yourself what <code>carrier</code> refers to in the <code>glimpse(flights)</code> output above. The <code>airlines</code> dataset provides a description of this with each airline being the observational unit:</p>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">data</span>(airlines)
 airlines</code></pre></div>
 <pre><code>## # A tibble: 16 × 2
@@ -541,19 +618,57 @@ <h2><span class="header-section-number">3.3</span> How is <code>flights</code> t
 ## 15      WN      Southwest Airlines Co.
 ## 16      YV          Mesa Airlines Inc.</code></pre>
 <p>As can be seen here when you just enter the name of an object in R, by default it will print the contents of that object to the screen. Be careful! It’s usually better to use the <code>View()</code> function in RStudio since larger objects may take awhile to print to the screen and it likely won’t be helpful to you to have hundreds of lines outputted.</p>
+<hr />
+<div class="learncheck">
+<p>
+<strong><em>Learning check</em></strong>
+</p>
+</div>
+<p><strong>(LC3.8)</strong> Run the following block of code in R to load and view each of the four data frames in the <code>nycflights13</code> package. Switch between the different tabs that have opened to view each of the four data frames. Describe in two sentences for each data frame what stands out to you and what the most important features are of each.</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">data</span>(weather)
+<span class="kw">data</span>(planes)
+<span class="kw">data</span>(airports)
+<span class="kw">data</span>(airlines)
+<span class="kw">View</span>(weather)
+<span class="kw">View</span>(planes)
+<span class="kw">View</span>(airports)
+<span class="kw">View</span>(airlines)</code></pre></div>
+<hr />
+<div id="identification-variables" class="section level3">
+<h3><span class="header-section-number">3.3.1</span> Identification variables</h3>
+<p>There is a subtle difference between the kinds of variables that you will encounter in data frames. The <code>airports</code> data frame you worked with above contains data in these different kinds. Let’s pull them apart using the <code>glimpse</code> function:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">glimpse</span>(airports)</code></pre></div>
+<pre><code>## Observations: 1,458
+## Variables: 8
+## $ faa   &lt;chr&gt; &quot;04G&quot;, &quot;06A&quot;, &quot;06C&quot;, &quot;06N&quot;, &quot;09J&quot;, &quot;0A9&quot;, &quot;0G6&quot;, &quot;0G7...
+## $ name  &lt;chr&gt; &quot;Lansdowne Airport&quot;, &quot;Moton Field Municipal Airport&quot;,...
+## $ lat   &lt;dbl&gt; 41.13, 32.46, 41.99, 41.43, 31.07, 36.37, 41.47, 42.8...
+## $ lon   &lt;dbl&gt; -80.62, -85.68, -88.10, -74.39, -81.43, -82.17, -84.5...
+## $ alt   &lt;int&gt; 1044, 264, 801, 523, 11, 1593, 730, 492, 1000, 108, 4...
+## $ tz    &lt;dbl&gt; -5, -6, -6, -5, -5, -5, -5, -5, -5, -8, -5, -6, -5, -...
+## $ dst   &lt;chr&gt; &quot;A&quot;, &quot;A&quot;, &quot;A&quot;, &quot;A&quot;, &quot;A&quot;, &quot;A&quot;, &quot;A&quot;, &quot;A&quot;, &quot;U&quot;, &quot;A&quot;, &quot;A&quot;...
+## $ tzone &lt;chr&gt; &quot;America/New_York&quot;, &quot;America/Chicago&quot;, &quot;America/Chica...</code></pre>
+<p>The variables <code>faa</code> and <code>name</code> are what we will call <em>identification variables</em>. They are mainly used to provide a name to the observational unit. Here the observational unit is an airport and the <code>faa</code> gives the code provided by the FAA for that airport while the <code>name</code> variable gives the longer more natural name of the airport. These ID variables differ from the other variables that are often called <em>measurement</em> or <em>characteristic</em> variables. The remaining variables (aside from <code>faa</code> and <code>name</code>) are of this type in <code>airports</code>. They don’t uniquely identify the observational unit, but instead describe properties of the observational unit. For organizational purposes, it is best practice to have your identification variables in the far leftmost columns of your data frame.</p>
+<hr />
+<div class="learncheck">
+<p>
+<strong><em>Learning check</em></strong>
+</p>
+</div>
+<p><strong>(LC3.9)</strong> What properties of the observational unit does each of <code>lat</code>, <code>lon</code>, <code>alt</code>, <code>tz</code>, <code>dst</code>, and <code>tzone</code> describe for the <code>airports</code> data frame?</p>
+<p><strong>(LC3.10)</strong> Provide the names of variables in a data frame with at least three variables in which one of them is an identification variable and the other two are not.</p>
+<hr />
+</div>
 </div>
 <div id="normal-forms-of-data" class="section level2">
 <h2><span class="header-section-number">3.4</span> Normal forms of data</h2>
 <p>The datasets included in the <code>nycflights13</code> package are in a form that minimizes redundancy of data. We will see that there are ways to <em>merge</em> (or <em>join</em>) the different tables together easily. We are capable of doing so because each of the tables have <em>keys</em> in common to relate one to another. This is an important property of <strong>normal forms</strong> of data. The process of decomposing data frames into less redundant tables without losing information is called <strong>normalization</strong>. More information is available on <a href="https://en.wikipedia.org/wiki/Database_normalization">Wikipedia</a>.</p>
 <p>We saw an example of this above with the <code>airlines</code> dataset. While the <code>flights</code> data frame could also include a column with the names of the airlines instead of the carrier code, this would be repetitive since there is a unique mapping of the carrier code to the name of the airline/carrier.</p>
-<p>Below an example is given showing how to <strong>join</strong> the <code>airlines</code> data frame together with the <code>flights</code> data frame by linking together the two datasets via a common <strong>key</strong> of <code>&quot;carrier&quot;</code>. Note that this “joined” data frame is assigned to a new data frame called <code>joined_flights</code>.</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">if(!<span class="kw">require</span>(nycflights13))
-  <span class="kw">install.packages</span>(<span class="st">&quot;nycflights13&quot;</span>, <span class="dt">repos =</span> <span class="st">&quot;http://cran.rstudio.org&quot;</span>)
-<span class="kw">library</span>(dplyr)
+<p>Below an example is given showing how to <strong>join</strong> the <code>airlines</code> data frame together with the <code>flights</code> data frame by linking together the two datasets via a common <strong>key</strong> of <code>&quot;carrier&quot;</code>. Note that this “joined” data frame is assigned to a new data frame called <code>joined_flights</code>. The <strong>key</strong> variable that we frequently join by is one of the <em>identification variables</em> mentioned above.</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">library</span>(dplyr)
 joined_flights &lt;-<span class="st"> </span><span class="kw">inner_join</span>(<span class="dt">x =</span> flights, <span class="dt">y =</span> airlines, <span class="dt">by =</span> <span class="st">&quot;carrier&quot;</span>)</code></pre></div>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">View</span>(joined_flights)</code></pre></div>
-<p>If we <code>View</code> this dataset, we see a new variable has been created called (We will see in Subsection 5.1.1 ways to change <code>name</code> to a more descriptive variable name.)</p>
-<p>More discussion about joining data frames together will be given in Chapter <a href="5-data-manipulation-via-dplyr.html#manip"><strong>??</strong></a>. We will see there that the names of the columns to be linked need not match as they did here with <code>&quot;carrier&quot;</code>.</p>
+<p>If we <code>View</code> this dataset, we see a new variable has been created called <code>name</code>. (We will see in Subsection <a href="5-manip.html#rename">5.4.2</a> ways to change <code>name</code> to a more descriptive variable name.) More discussion about joining data frames together will be given in Chapter <a href="5-manip.html#manip">5</a>. We will see there that the names of the columns to be linked need not match as they did here with <code>&quot;carrier&quot;</code>.</p>
 <hr />
 <hr />
 <div class="review">
@@ -683,7 +798,7 @@ <h2><span class="header-section-number">3.4</span> Normal forms of data</h2>
 </div>
 <div id="whats-to-come" class="section level2">
 <h2><span class="header-section-number">3.5</span> What’s to come?</h2>
-<p>In Chapter <a href="4-data-visualization-via-ggplot2.html#viz"><strong>??</strong></a>, we will further explore the distribution of a variable in a related dataset to <code>flights</code>: the <code>temp</code> variable in the <code>weather</code> dataset. We’ll be interested in understanding how this variable varies in relation to the values of other variables in the dataset. We will see that visualization is often a powerful tool in helping us see what is going on in a dataset. It will be a useful way to expand on the <code>str</code> function we have seen here for tidy data.</p>
+<p>In Chapter <a href="4-viz.html#viz">4</a>, we will further explore the distribution of a variable in a related dataset to <code>flights</code>: the <code>temp</code> variable in the <code>weather</code> dataset. We’ll be interested in understanding how this variable varies in relation to the values of other variables in the dataset. We will see that visualization is often a powerful tool in helping us see what is going on in a dataset. It will be a useful way to expand on the <code>glimpse</code> function we have seen here for tidy data.</p>
 
 </div>
 </div>
@@ -702,7 +817,7 @@ <h3>References</h3>
         </div>
       </div>
 <a href="2-intro.html" class="navigation navigation-prev " aria-label="Previous page"><i class="fa fa-angle-left"></i></a>
-<a href="4-data-visualization-via-ggplot2.html" class="navigation navigation-next " aria-label="Next page""><i class="fa fa-angle-right"></i></a>
+<a href="4-viz.html" class="navigation navigation-next " aria-label="Next page"><i class="fa fa-angle-right"></i></a>
 
 <script src="libs/gitbook-2.6.7/js/app.min.js"></script>
 <script src="libs/gitbook-2.6.7/js/lunr.js"></script>
diff --git a/docs/4-data-visualization-via-ggplot2.html b/docs/4-viz.html
similarity index 65%
rename from docs/4-data-visualization-via-ggplot2.html
rename to docs/4-viz.html
index 4aa72ff3b..a5328b2e3 100644
--- a/docs/4-data-visualization-via-ggplot2.html
+++ b/docs/4-viz.html
@@ -26,7 +26,7 @@
 <meta name="author" content="Chester Ismay and Albert Y. Kim">
 
 
-<meta name="date" content="2017-01-07">
+<meta name="date" content="2017-01-10">
 
   <meta name="viewport" content="width=device-width, initial-scale=1">
   <meta name="apple-mobile-web-app-capable" content="yes">
@@ -34,7 +34,7 @@
   
   
 <link rel="prev" href="3-tidy.html">
-<link rel="next" href="5-data-manipulation-via-dplyr.html">
+<link rel="next" href="5-manip.html">
 
 <script src="libs/jquery-2.2.3/jquery.min.js"></script>
 <link href="libs/gitbook-2.6.7/css/style.css" rel="stylesheet" />
@@ -119,9 +119,9 @@
 
 <ul class="summary">
 <li class="chapter" data-level="1" data-path="index.html"><a href="index.html"><i class="fa fa-check"></i><b>1</b> Preamble</a><ul>
-<li class="chapter" data-level="1.1" data-path="index.html"><a href="index.html#principles-of-this-book"><i class="fa fa-check"></i><b>1.1</b> Principles of this Book</a></li>
+<li class="chapter" data-level="1.1" data-path="index.html"><a href="index.html#principles-of-this-book---for-instructors"><i class="fa fa-check"></i><b>1.1</b> Principles of this Book - For Instructors</a></li>
 <li class="chapter" data-level="1.2" data-path="index.html"><a href="index.html#contribute"><i class="fa fa-check"></i><b>1.2</b> Contribute</a></li>
-<li class="chapter" data-level="1.3" data-path="index.html"><a href="index.html#getting-started"><i class="fa fa-check"></i><b>1.3</b> Getting Started</a></li>
+<li class="chapter" data-level="1.3" data-path="index.html"><a href="index.html#getting-started---for-students"><i class="fa fa-check"></i><b>1.3</b> Getting Started - For Students</a></li>
 <li class="chapter" data-level="" data-path="index.html"><a href="index.html#colophon"><i class="fa fa-check"></i>Colophon</a></li>
 </ul></li>
 <li class="chapter" data-level="2" data-path="2-intro.html"><a href="2-intro.html"><i class="fa fa-check"></i><b>2</b> Introduction</a><ul>
@@ -133,87 +133,100 @@
 </ul></li>
 <li class="part"><span><b>I Data Exploration</b></span></li>
 <li class="chapter" data-level="3" data-path="3-tidy.html"><a href="3-tidy.html"><i class="fa fa-check"></i><b>3</b> Tidy Data</a><ul>
+<li class="chapter" data-level="" data-path="3-tidy.html"><a href="3-tidy.html#needed-packages"><i class="fa fa-check"></i>Needed packages</a></li>
 <li class="chapter" data-level="3.1" data-path="3-tidy.html"><a href="3-tidy.html#what-is-tidy-data"><i class="fa fa-check"></i><b>3.1</b> What is tidy data?</a></li>
-<li class="chapter" data-level="3.2" data-path="3-tidy.html"><a href="3-tidy.html#the-nycflights13-datasets"><i class="fa fa-check"></i><b>3.2</b> The <code>nycflights13</code> datasets</a></li>
-<li class="chapter" data-level="3.3" data-path="3-tidy.html"><a href="3-tidy.html#how-is-flights-tidy"><i class="fa fa-check"></i><b>3.3</b> How is <code>flights</code> tidy?</a></li>
+<li class="chapter" data-level="3.2" data-path="3-tidy.html"><a href="3-tidy.html#datasets-in-the-nycflights13-package"><i class="fa fa-check"></i><b>3.2</b> Datasets in the <code>nycflights13</code> package</a></li>
+<li class="chapter" data-level="3.3" data-path="3-tidy.html"><a href="3-tidy.html#how-is-flights-tidy"><i class="fa fa-check"></i><b>3.3</b> How is <code>flights</code> tidy?</a><ul>
+<li class="chapter" data-level="3.3.1" data-path="3-tidy.html"><a href="3-tidy.html#identification-variables"><i class="fa fa-check"></i><b>3.3.1</b> Identification variables</a></li>
+</ul></li>
 <li class="chapter" data-level="3.4" data-path="3-tidy.html"><a href="3-tidy.html#normal-forms-of-data"><i class="fa fa-check"></i><b>3.4</b> Normal forms of data</a></li>
 <li class="chapter" data-level="3.5" data-path="3-tidy.html"><a href="3-tidy.html#whats-to-come"><i class="fa fa-check"></i><b>3.5</b> What’s to come?</a></li>
 </ul></li>
-<li class="chapter" data-level="4" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html"><i class="fa fa-check"></i><b>4</b> Data Visualization via <code id="viz">ggplot2</code></a><ul>
-<li class="chapter" data-level="" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#needed-packages"><i class="fa fa-check"></i>Needed packages</a></li>
-<li class="chapter" data-level="4.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#grammarofgraphics"><i class="fa fa-check"></i><b>4.1</b> The Grammar of Graphics</a><ul>
-<li class="chapter" data-level="4.1.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#components-of-grammar"><i class="fa fa-check"></i><b>4.1.1</b> Components of Grammar</a></li>
-<li class="chapter" data-level="4.1.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#napoleans-march-on-moscow"><i class="fa fa-check"></i><b>4.1.2</b> Napolean’s March on Moscow</a></li>
-<li class="chapter" data-level="4.1.3" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#other-components-of-the-grammar"><i class="fa fa-check"></i><b>4.1.3</b> Other Components of the Grammar</a></li>
-<li class="chapter" data-level="4.1.4" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#the-ggplot2-package"><i class="fa fa-check"></i><b>4.1.4</b> The <code>ggplot2</code> Package</a></li>
+<li class="chapter" data-level="4" data-path="4-viz.html"><a href="4-viz.html"><i class="fa fa-check"></i><b>4</b> Data Visualization via ggplot2</a><ul>
+<li class="chapter" data-level="" data-path="4-viz.html"><a href="4-viz.html#needed-packages-1"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="4.1" data-path="4-viz.html"><a href="4-viz.html#grammarofgraphics"><i class="fa fa-check"></i><b>4.1</b> The Grammar of Graphics</a><ul>
+<li class="chapter" data-level="4.1.1" data-path="4-viz.html"><a href="4-viz.html#components-of-grammar"><i class="fa fa-check"></i><b>4.1.1</b> Components of Grammar</a></li>
+<li class="chapter" data-level="4.1.2" data-path="4-viz.html"><a href="4-viz.html#napoleans-march-on-moscow"><i class="fa fa-check"></i><b>4.1.2</b> Napolean’s March on Moscow</a></li>
+<li class="chapter" data-level="4.1.3" data-path="4-viz.html"><a href="4-viz.html#other-components-of-the-grammar"><i class="fa fa-check"></i><b>4.1.3</b> Other Components of the Grammar</a></li>
+<li class="chapter" data-level="4.1.4" data-path="4-viz.html"><a href="4-viz.html#the-ggplot2-package"><i class="fa fa-check"></i><b>4.1.4</b> The ggplot2 Package</a></li>
+</ul></li>
+<li class="chapter" data-level="4.2" data-path="4-viz.html"><a href="4-viz.html#five-named-graphs---the-5ng"><i class="fa fa-check"></i><b>4.2</b> Five Named Graphs - The 5NG</a></li>
+<li class="chapter" data-level="4.3" data-path="4-viz.html"><a href="4-viz.html#scatterplots"><i class="fa fa-check"></i><b>4.3</b> 5NG#1: Scatter-plots</a><ul>
+<li class="chapter" data-level="4.3.1" data-path="4-viz.html"><a href="4-viz.html#geompoint"><i class="fa fa-check"></i><b>4.3.1</b> Scatter-plots via geom_point</a></li>
+<li class="chapter" data-level="4.3.2" data-path="4-viz.html"><a href="4-viz.html#over-plotting"><i class="fa fa-check"></i><b>4.3.2</b> Over-Plotting</a></li>
+<li class="chapter" data-level="4.3.3" data-path="4-viz.html"><a href="4-viz.html#summary"><i class="fa fa-check"></i><b>4.3.3</b> Summary</a></li>
 </ul></li>
-<li class="chapter" data-level="4.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#five-named-graphs---the-5ng"><i class="fa fa-check"></i><b>4.2</b> Five Named Graphs - The 5NG</a></li>
-<li class="chapter" data-level="4.3" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#scatterplots"><i class="fa fa-check"></i><b>4.3</b> 5NG#1: Scatter-plots</a><ul>
-<li class="chapter" data-level="4.3.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#scatter-plots-via-geom_point"><i class="fa fa-check"></i><b>4.3.1</b> Scatter-plots via <code id="geompoint">geom_point</code></a></li>
-<li class="chapter" data-level="4.3.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#over-plotting"><i class="fa fa-check"></i><b>4.3.2</b> Over-Plotting</a></li>
-<li class="chapter" data-level="4.3.3" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#summary"><i class="fa fa-check"></i><b>4.3.3</b> Summary</a></li>
+<li class="chapter" data-level="4.4" data-path="4-viz.html"><a href="4-viz.html#linegraphs"><i class="fa fa-check"></i><b>4.4</b> 5NG#2: Line-graphs</a><ul>
+<li class="chapter" data-level="4.4.1" data-path="4-viz.html"><a href="4-viz.html#geomline"><i class="fa fa-check"></i><b>4.4.1</b> Line-graphs via geom_line</a></li>
+<li class="chapter" data-level="4.4.2" data-path="4-viz.html"><a href="4-viz.html#summary-1"><i class="fa fa-check"></i><b>4.4.2</b> Summary</a></li>
 </ul></li>
-<li class="chapter" data-level="4.4" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#linegraphs"><i class="fa fa-check"></i><b>4.4</b> 5NG#2: Line-graphs</a><ul>
-<li class="chapter" data-level="4.4.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#line-graphs-via-geom_line"><i class="fa fa-check"></i><b>4.4.1</b> Line-graphs via <code id="geomline">geom_line</code></a></li>
-<li class="chapter" data-level="4.4.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#summary-1"><i class="fa fa-check"></i><b>4.4.2</b> Summary</a></li>
+<li class="chapter" data-level="4.5" data-path="4-viz.html"><a href="4-viz.html#histograms"><i class="fa fa-check"></i><b>4.5</b> 5NG#3: Histograms</a><ul>
+<li class="chapter" data-level="4.5.1" data-path="4-viz.html"><a href="4-viz.html#geomhistogram"><i class="fa fa-check"></i><b>4.5.1</b> Histograms via geom_histogram</a></li>
+<li class="chapter" data-level="4.5.2" data-path="4-viz.html"><a href="4-viz.html#adjustbins"><i class="fa fa-check"></i><b>4.5.2</b> Adjusting the Bins</a></li>
+<li class="chapter" data-level="4.5.3" data-path="4-viz.html"><a href="4-viz.html#summary-2"><i class="fa fa-check"></i><b>4.5.3</b> Summary</a></li>
 </ul></li>
-<li class="chapter" data-level="4.5" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#histograms"><i class="fa fa-check"></i><b>4.5</b> 5NG#3: Histograms</a><ul>
-<li class="chapter" data-level="4.5.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#histograms-via-geom_histogram"><i class="fa fa-check"></i><b>4.5.1</b> Histograms via <code id="geomhistogram">geom_histogram</code></a></li>
-<li class="chapter" data-level="4.5.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#adjustbins"><i class="fa fa-check"></i><b>4.5.2</b> Adjusting the Bins</a></li>
-<li class="chapter" data-level="4.5.3" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#summary-2"><i class="fa fa-check"></i><b>4.5.3</b> Summary</a></li>
+<li class="chapter" data-level="4.6" data-path="4-viz.html"><a href="4-viz.html#facets"><i class="fa fa-check"></i><b>4.6</b> Facets</a></li>
+<li class="chapter" data-level="4.7" data-path="4-viz.html"><a href="4-viz.html#ng4-boxplots"><i class="fa fa-check"></i><b>4.7</b> 5NG#4: Boxplots</a><ul>
+<li class="chapter" data-level="4.7.1" data-path="4-viz.html"><a href="4-viz.html#geomboxplot"><i class="fa fa-check"></i><b>4.7.1</b> Boxplots via geom_boxplot</a></li>
+<li class="chapter" data-level="4.7.2" data-path="4-viz.html"><a href="4-viz.html#summary-3"><i class="fa fa-check"></i><b>4.7.2</b> Summary</a></li>
 </ul></li>
-<li class="chapter" data-level="4.6" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#facets"><i class="fa fa-check"></i><b>4.6</b> Facets</a></li>
-<li class="chapter" data-level="4.7" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#ng4-boxplots"><i class="fa fa-check"></i><b>4.7</b> 5NG#4: Boxplots</a><ul>
-<li class="chapter" data-level="4.7.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#boxplots-via-geom_boxplot"><i class="fa fa-check"></i><b>4.7.1</b> Boxplots via <code id="geomboxplot">geom_boxplot</code></a></li>
-<li class="chapter" data-level="4.7.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#summary-3"><i class="fa fa-check"></i><b>4.7.2</b> Summary</a></li>
+<li class="chapter" data-level="4.8" data-path="4-viz.html"><a href="4-viz.html#geombar"><i class="fa fa-check"></i><b>4.8</b> 5NG#5: Barplots</a><ul>
+<li class="chapter" data-level="4.8.1" data-path="4-viz.html"><a href="4-viz.html#barplots-via-geom_bar"><i class="fa fa-check"></i><b>4.8.1</b> Barplots via geom_bar</a></li>
+<li class="chapter" data-level="4.8.2" data-path="4-viz.html"><a href="4-viz.html#must-avoid-pie-charts"><i class="fa fa-check"></i><b>4.8.2</b> Must avoid pie charts!</a></li>
+<li class="chapter" data-level="4.8.3" data-path="4-viz.html"><a href="4-viz.html#using-barplots-to-compare-two-variables"><i class="fa fa-check"></i><b>4.8.3</b> Using barplots to compare two variables</a></li>
+<li class="chapter" data-level="4.8.4" data-path="4-viz.html"><a href="4-viz.html#summary-4"><i class="fa fa-check"></i><b>4.8.4</b> Summary</a></li>
 </ul></li>
-<li class="chapter" data-level="4.8" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#geombar"><i class="fa fa-check"></i><b>4.8</b> 5NG#5: Barplots</a><ul>
-<li class="chapter" data-level="4.8.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#barplots-via-geom_bar"><i class="fa fa-check"></i><b>4.8.1</b> Barplots via <code>geom_bar</code></a></li>
-<li class="chapter" data-level="4.8.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#must-avoid-pie-charts"><i class="fa fa-check"></i><b>4.8.2</b> Must avoid pie charts!</a></li>
-<li class="chapter" data-level="4.8.3" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#using-barplots-to-compare-two-variables"><i class="fa fa-check"></i><b>4.8.3</b> Using barplots to compare two variables</a></li>
-<li class="chapter" data-level="4.8.4" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#summary-4"><i class="fa fa-check"></i><b>4.8.4</b> Summary</a></li>
+<li class="chapter" data-level="4.9" data-path="4-viz.html"><a href="4-viz.html#conclusion"><i class="fa fa-check"></i><b>4.9</b> Conclusion</a><ul>
+<li class="chapter" data-level="4.9.1" data-path="4-viz.html"><a href="4-viz.html#resources"><i class="fa fa-check"></i><b>4.9.1</b> Resources</a></li>
+<li class="chapter" data-level="4.9.2" data-path="4-viz.html"><a href="4-viz.html#script-of-r-code"><i class="fa fa-check"></i><b>4.9.2</b> Script of R code</a></li>
+<li class="chapter" data-level="4.9.3" data-path="4-viz.html"><a href="4-viz.html#whats-to-come-1"><i class="fa fa-check"></i><b>4.9.3</b> What’s to come?</a></li>
 </ul></li>
-<li class="chapter" data-level="4.9" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#conclusion"><i class="fa fa-check"></i><b>4.9</b> Conclusion</a><ul>
-<li class="chapter" data-level="4.9.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#whats-to-come-1"><i class="fa fa-check"></i><b>4.9.1</b> What’s to come?</a></li>
-<li class="chapter" data-level="4.9.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#script-of-r-code"><i class="fa fa-check"></i><b>4.9.2</b> Script of R code</a></li>
 </ul></li>
+<li class="chapter" data-level="5" data-path="5-manip.html"><a href="5-manip.html"><i class="fa fa-check"></i><b>5</b> Data Manipulation via dplyr</a><ul>
+<li class="chapter" data-level="" data-path="5-manip.html"><a href="5-manip.html#needed-packages-2"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="5.1" data-path="5-manip.html"><a href="5-manip.html#the-pipe"><i class="fa fa-check"></i><b>5.1</b> The pipe <code>%&gt;%</code></a></li>
+<li class="chapter" data-level="5.2" data-path="5-manip.html"><a href="5-manip.html#five-main-verbs---the-5mv"><i class="fa fa-check"></i><b>5.2</b> Five Main Verbs - The 5MV</a><ul>
+<li class="chapter" data-level="5.2.1" data-path="5-manip.html"><a href="5-manip.html#filter"><i class="fa fa-check"></i><b>5.2.1</b> 5MV#1: Filter observations using filter</a></li>
+<li class="chapter" data-level="5.2.2" data-path="5-manip.html"><a href="5-manip.html#mv2-summarize-variables-using-summarize"><i class="fa fa-check"></i><b>5.2.2</b> 5MV#2: Summarize variables using summarize</a></li>
+<li class="chapter" data-level="5.2.3" data-path="5-manip.html"><a href="5-manip.html#mv3-group-rows-using-group_by"><i class="fa fa-check"></i><b>5.2.3</b> 5MV#3: Group rows using group_by</a></li>
+<li class="chapter" data-level="5.2.4" data-path="5-manip.html"><a href="5-manip.html#mv4-create-new-variableschange-old-variables-using-mutate"><i class="fa fa-check"></i><b>5.2.4</b> 5MV#4: Create new variables/change old variables using mutate</a></li>
+<li class="chapter" data-level="5.2.5" data-path="5-manip.html"><a href="5-manip.html#arrange"><i class="fa fa-check"></i><b>5.2.5</b> 5MV#5: Reorder the data frame using arrange</a></li>
 </ul></li>
-<li class="chapter" data-level="5" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html"><i class="fa fa-check"></i><b>5</b> Data Manipulation via <code id="manip">dplyr</code></a><ul>
-<li class="chapter" data-level="" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#needed-packages-1"><i class="fa fa-check"></i>Needed packages</a></li>
-<li class="chapter" data-level="5.1" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#the-pipe"><i class="fa fa-check"></i><b>5.1</b> The pipe <code>%&gt;%</code></a></li>
-<li class="chapter" data-level="5.2" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#four-main-verbs---the-4mv"><i class="fa fa-check"></i><b>5.2</b> Four Main Verbs - The 4MV</a><ul>
-<li class="chapter" data-level="5.2.1" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#filter-observations-using-filter"><i class="fa fa-check"></i><b>5.2.1</b> Filter observations using <code id="filter">filter</code></a></li>
-<li class="chapter" data-level="5.2.2" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#summarize-variables-using-summarize"><i class="fa fa-check"></i><b>5.2.2</b> Summarize variables using <code>summarize</code></a></li>
-<li class="chapter" data-level="5.2.3" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#create-new-variableschange-old-variables-using-mutate"><i class="fa fa-check"></i><b>5.2.3</b> Create new variables/change old variables using <code>mutate</code></a></li>
-<li class="chapter" data-level="5.2.4" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#reorder-the-data-frame-using-arrange"><i class="fa fa-check"></i><b>5.2.4</b> Reorder the data frame using <code id="arrange">arrange</code></a></li>
+<li class="chapter" data-level="5.3" data-path="5-manip.html"><a href="5-manip.html#joining-data-frames"><i class="fa fa-check"></i><b>5.3</b> Joining data frames</a><ul>
+<li class="chapter" data-level="5.3.1" data-path="5-manip.html"><a href="5-manip.html#joining-by-key-variables"><i class="fa fa-check"></i><b>5.3.1</b> Joining by Key Variables</a></li>
+<li class="chapter" data-level="5.3.2" data-path="5-manip.html"><a href="5-manip.html#joining-by-key-variables-with-different-names"><i class="fa fa-check"></i><b>5.3.2</b> Joining by Key Variables with Different Names</a></li>
 </ul></li>
-<li class="chapter" data-level="5.3" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#other-verbs"><i class="fa fa-check"></i><b>5.3</b> Other verbs</a><ul>
-<li class="chapter" data-level="5.3.1" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#select-variables-using-select"><i class="fa fa-check"></i><b>5.3.1</b> Select variables using <code id="select">select</code></a></li>
-<li class="chapter" data-level="5.3.2" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#rename-variables-using-rename"><i class="fa fa-check"></i><b>5.3.2</b> Rename variables using <code id="rename">rename</code></a></li>
-<li class="chapter" data-level="5.3.3" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#find-the-top-number-of-values-using-top_n"><i class="fa fa-check"></i><b>5.3.3</b> Find the top number of values using <code>top_n</code></a></li>
+<li class="chapter" data-level="5.4" data-path="5-manip.html"><a href="5-manip.html#optional-other-verbs"><i class="fa fa-check"></i><b>5.4</b> Optional: Other verbs</a><ul>
+<li class="chapter" data-level="5.4.1" data-path="5-manip.html"><a href="5-manip.html#select"><i class="fa fa-check"></i><b>5.4.1</b> Select variables using select</a></li>
+<li class="chapter" data-level="5.4.2" data-path="5-manip.html"><a href="5-manip.html#rename"><i class="fa fa-check"></i><b>5.4.2</b> Rename variables using rename</a></li>
+<li class="chapter" data-level="5.4.3" data-path="5-manip.html"><a href="5-manip.html#find-the-top-number-of-values-using-top_n"><i class="fa fa-check"></i><b>5.4.3</b> Find the top number of values using top_n</a></li>
+</ul></li>
+<li class="chapter" data-level="5.5" data-path="5-manip.html"><a href="5-manip.html#conclusion-1"><i class="fa fa-check"></i><b>5.5</b> Conclusion</a><ul>
+<li class="chapter" data-level="5.5.1" data-path="5-manip.html"><a href="5-manip.html#resources-1"><i class="fa fa-check"></i><b>5.5.1</b> Resources</a></li>
+<li class="chapter" data-level="5.5.2" data-path="5-manip.html"><a href="5-manip.html#script-of-r-code-1"><i class="fa fa-check"></i><b>5.5.2</b> Script of R code</a></li>
+<li class="chapter" data-level="5.5.3" data-path="5-manip.html"><a href="5-manip.html#whats-to-come-2"><i class="fa fa-check"></i><b>5.5.3</b> What’s to come?</a></li>
 </ul></li>
-<li class="chapter" data-level="5.4" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#joiningmerging-data-frames"><i class="fa fa-check"></i><b>5.4</b> Joining/merging data frames</a></li>
-<li class="chapter" data-level="5.5" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#script-of-r-code-1"><i class="fa fa-check"></i><b>5.5</b> Script of R code</a></li>
-<li class="chapter" data-level="5.6" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#whats-to-come-2"><i class="fa fa-check"></i><b>5.6</b> What’s to come?</a></li>
 </ul></li>
 <li class="part"><span><b>II Inference</b></span></li>
-<li class="chapter" data-level="6" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html"><i class="fa fa-check"></i><b>6</b> Simulating Randomness via <code id="sim">mosaic</code></a><ul>
-<li class="chapter" data-level="" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#needed-packages-2"><i class="fa fa-check"></i>Needed packages</a></li>
-<li class="chapter" data-level="6.1" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#random-sampling"><i class="fa fa-check"></i><b>6.1</b> Random sampling</a><ul>
-<li class="chapter" data-level="6.1.1" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#tasting-soup"><i class="fa fa-check"></i><b>6.1.1</b> Tasting soup</a></li>
-<li class="chapter" data-level="6.1.2" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#common-terms"><i class="fa fa-check"></i><b>6.1.2</b> Common terms</a></li>
+<li class="chapter" data-level="6" data-path="6-sim.html"><a href="6-sim.html"><i class="fa fa-check"></i><b>6</b> Simulating Randomness via mosaic</a><ul>
+<li class="chapter" data-level="" data-path="6-sim.html"><a href="6-sim.html#needed-packages-3"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="6.1" data-path="6-sim.html"><a href="6-sim.html#random-sampling"><i class="fa fa-check"></i><b>6.1</b> Random sampling</a><ul>
+<li class="chapter" data-level="6.1.1" data-path="6-sim.html"><a href="6-sim.html#tasting-soup"><i class="fa fa-check"></i><b>6.1.1</b> Tasting soup</a></li>
+<li class="chapter" data-level="6.1.2" data-path="6-sim.html"><a href="6-sim.html#common-terms"><i class="fa fa-check"></i><b>6.1.2</b> Common terms</a></li>
+</ul></li>
+<li class="chapter" data-level="6.2" data-path="6-sim.html"><a href="6-sim.html#visualizing-sampling"><i class="fa fa-check"></i><b>6.2</b> Visualizing sampling</a><ul>
+<li class="chapter" data-level="6.2.1" data-path="6-sim.html"><a href="6-sim.html#sampling-distribution"><i class="fa fa-check"></i><b>6.2.1</b> Sampling distribution</a></li>
+<li class="chapter" data-level="6.2.2" data-path="6-sim.html"><a href="6-sim.html#repeated-sampling-via-do"><i class="fa fa-check"></i><b>6.2.2</b> Repeated sampling via <code>do</code></a></li>
 </ul></li>
-<li class="chapter" data-level="6.2" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#visualizing-sampling"><i class="fa fa-check"></i><b>6.2</b> Visualizing sampling</a><ul>
-<li class="chapter" data-level="6.2.1" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#sampling-distribution"><i class="fa fa-check"></i><b>6.2.1</b> Sampling distribution</a></li>
-<li class="chapter" data-level="6.2.2" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#repeated-sampling-via-do"><i class="fa fa-check"></i><b>6.2.2</b> Repeated sampling via <code>do</code></a></li>
+<li class="chapter" data-level="6.3" data-path="6-sim.html"><a href="6-sim.html#simulation"><i class="fa fa-check"></i><b>6.3</b> Simulation</a></li>
+<li class="chapter" data-level="6.4" data-path="6-sim.html"><a href="6-sim.html#review-of-mosaic-simulation-functions"><i class="fa fa-check"></i><b>6.4</b> Review of <code>mosaic</code> simulation functions</a></li>
+<li class="chapter" data-level="6.5" data-path="6-sim.html"><a href="6-sim.html#conclusion-2"><i class="fa fa-check"></i><b>6.5</b> Conclusion</a><ul>
+<li class="chapter" data-level="6.5.1" data-path="6-sim.html"><a href="6-sim.html#script-of-r-code-2"><i class="fa fa-check"></i><b>6.5.1</b> Script of R code</a></li>
+<li class="chapter" data-level="6.5.2" data-path="6-sim.html"><a href="6-sim.html#whats-to-come-3"><i class="fa fa-check"></i><b>6.5.2</b> What’s to come?</a></li>
 </ul></li>
-<li class="chapter" data-level="6.3" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#simulation"><i class="fa fa-check"></i><b>6.3</b> Simulation</a></li>
-<li class="chapter" data-level="6.4" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#review-of-mosaic-simulation-functions"><i class="fa fa-check"></i><b>6.4</b> Review of <code>mosaic</code> simulation functions</a></li>
-<li class="chapter" data-level="6.5" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#script-of-r-code-2"><i class="fa fa-check"></i><b>6.5</b> Script of R code</a></li>
-<li class="chapter" data-level="6.6" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#whats-to-come-3"><i class="fa fa-check"></i><b>6.6</b> What’s to come?</a></li>
 </ul></li>
 <li class="chapter" data-level="7" data-path="7-hypo.html"><a href="7-hypo.html"><i class="fa fa-check"></i><b>7</b> Hypothesis Testing</a><ul>
-<li class="chapter" data-level="" data-path="7-hypo.html"><a href="7-hypo.html#needed-packages-3"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="" data-path="7-hypo.html"><a href="7-hypo.html#needed-packages-4"><i class="fa fa-check"></i>Needed packages</a></li>
 <li class="chapter" data-level="7.1" data-path="7-hypo.html"><a href="7-hypo.html#when-inference-is-not-needed"><i class="fa fa-check"></i><b>7.1</b> When Inference Is Not Needed</a></li>
 <li class="chapter" data-level="7.2" data-path="7-hypo.html"><a href="7-hypo.html#basics-of-hypothesis-testing"><i class="fa fa-check"></i><b>7.2</b> Basics of Hypothesis Testing</a></li>
 <li class="chapter" data-level="7.3" data-path="7-hypo.html"><a href="7-hypo.html#trial"><i class="fa fa-check"></i><b>7.3</b> Criminal trial analogy</a><ul>
@@ -249,44 +262,50 @@
 <li class="chapter" data-level="7.8.1" data-path="7-hypo.html"><a href="7-hypo.html#example-t-test-for-two-independent-samples"><i class="fa fa-check"></i><b>7.8.1</b> EXAMPLE: <span class="math inline">\(t\)</span>-test for two independent samples</a></li>
 <li class="chapter" data-level="7.8.2" data-path="7-hypo.html"><a href="7-hypo.html#conditions-for-t-test"><i class="fa fa-check"></i><b>7.8.2</b> Conditions for t-test</a></li>
 </ul></li>
-<li class="chapter" data-level="7.9" data-path="7-hypo.html"><a href="7-hypo.html#whats-to-come-4"><i class="fa fa-check"></i><b>7.9</b> What’s to come?</a></li>
+<li class="chapter" data-level="7.9" data-path="7-hypo.html"><a href="7-hypo.html#conclusion-3"><i class="fa fa-check"></i><b>7.9</b> Conclusion</a><ul>
+<li class="chapter" data-level="7.9.1" data-path="7-hypo.html"><a href="7-hypo.html#script-of-r-code-3"><i class="fa fa-check"></i><b>7.9.1</b> Script of R code</a></li>
+<li class="chapter" data-level="7.9.2" data-path="7-hypo.html"><a href="7-hypo.html#whats-to-come-4"><i class="fa fa-check"></i><b>7.9.2</b> What’s to come?</a></li>
+</ul></li>
 </ul></li>
 <li class="chapter" data-level="8" data-path="8-ci.html"><a href="8-ci.html"><i class="fa fa-check"></i><b>8</b> Confidence Intervals</a><ul>
-<li class="chapter" data-level="" data-path="8-ci.html"><a href="8-ci.html#needed-packages-4"><i class="fa fa-check"></i>Needed packages</a></li>
-<li class="chapter" data-level="8.1" data-path="8-ci.html"><a href="8-ci.html#bootstrapping"><i class="fa fa-check"></i><b>8.1</b> Bootstrapping</a></li>
-<li class="chapter" data-level="8.2" data-path="8-ci.html"><a href="8-ci.html#mean_rating"><i class="fa fa-check"></i><b>8.2</b> mean_rating</a><ul>
-<li class="chapter" data-level="8.2.1" data-path="8-ci.html"><a href="8-ci.html#review-of-bootstrapping"><i class="fa fa-check"></i><b>8.2.1</b> Review of Bootstrapping</a></li>
+<li class="chapter" data-level="" data-path="8-ci.html"><a href="8-ci.html#needed-packages-5"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="8.1" data-path="8-ci.html"><a href="8-ci.html#bootstrapping"><i class="fa fa-check"></i><b>8.1</b> Bootstrapping</a><ul>
+<li class="chapter" data-level="8.1.1" data-path="8-ci.html"><a href="8-ci.html#review-of-bootstrapping"><i class="fa fa-check"></i><b>8.1.1</b> Review of Bootstrapping</a></li>
+</ul></li>
+<li class="chapter" data-level="8.2" data-path="8-ci.html"><a href="8-ci.html#relation-to-hypothesis-testing"><i class="fa fa-check"></i><b>8.2</b> Relation to hypothesis testing</a></li>
+<li class="chapter" data-level="8.3" data-path="8-ci.html"><a href="8-ci.html#effect-size"><i class="fa fa-check"></i><b>8.3</b> Effect size</a></li>
+<li class="chapter" data-level="8.4" data-path="8-ci.html"><a href="8-ci.html#conclusion-4"><i class="fa fa-check"></i><b>8.4</b> Conclusion</a><ul>
+<li class="chapter" data-level="8.4.1" data-path="8-ci.html"><a href="8-ci.html#script-of-r-code-4"><i class="fa fa-check"></i><b>8.4.1</b> Script of R code</a></li>
+<li class="chapter" data-level="8.4.2" data-path="8-ci.html"><a href="8-ci.html#whats-to-come-5"><i class="fa fa-check"></i><b>8.4.2</b> What’s to come?</a></li>
 </ul></li>
-<li class="chapter" data-level="8.3" data-path="8-ci.html"><a href="8-ci.html#relation-to-hypothesis-testing"><i class="fa fa-check"></i><b>8.3</b> Relation to hypothesis testing</a></li>
-<li class="chapter" data-level="8.4" data-path="8-ci.html"><a href="8-ci.html#effect-size"><i class="fa fa-check"></i><b>8.4</b> Effect size</a></li>
-<li class="chapter" data-level="8.5" data-path="8-ci.html"><a href="8-ci.html#script-of-r-code-3"><i class="fa fa-check"></i><b>8.5</b> Script of R code</a></li>
-<li class="chapter" data-level="8.6" data-path="8-ci.html"><a href="8-ci.html#whats-to-come-5"><i class="fa fa-check"></i><b>8.6</b> What’s to come?</a></li>
 </ul></li>
-<li class="chapter" data-level="9" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html"><i class="fa fa-check"></i><b>9</b> Regression via <code id="regress">broom</code></a><ul>
-<li class="chapter" data-level="" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#needed-packages-5"><i class="fa fa-check"></i>Needed packages</a></li>
-<li class="chapter" data-level="9.1" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#example-alaskan-airlines-delays"><i class="fa fa-check"></i><b>9.1</b> EXAMPLE: Alaskan Airlines delays</a></li>
-<li class="chapter" data-level="9.2" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#correlation"><i class="fa fa-check"></i><b>9.2</b> Correlation</a><ul>
-<li class="chapter" data-level="9.2.1" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#correlation-does-not-imply-causation"><i class="fa fa-check"></i><b>9.2.1</b> Correlation does not imply causation</a></li>
+<li class="chapter" data-level="9" data-path="9-regress.html"><a href="9-regress.html"><i class="fa fa-check"></i><b>9</b> Regression via broom</a><ul>
+<li class="chapter" data-level="" data-path="9-regress.html"><a href="9-regress.html#needed-packages-6"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="9.1" data-path="9-regress.html"><a href="9-regress.html#example-alaskan-airlines-delays"><i class="fa fa-check"></i><b>9.1</b> EXAMPLE: Alaskan Airlines delays</a></li>
+<li class="chapter" data-level="9.2" data-path="9-regress.html"><a href="9-regress.html#correlation"><i class="fa fa-check"></i><b>9.2</b> Correlation</a><ul>
+<li class="chapter" data-level="9.2.1" data-path="9-regress.html"><a href="9-regress.html#correlation-does-not-imply-causation"><i class="fa fa-check"></i><b>9.2.1</b> Correlation does not imply causation</a></li>
 </ul></li>
-<li class="chapter" data-level="9.3" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#linear-regression"><i class="fa fa-check"></i><b>9.3</b> Linear regression</a><ul>
-<li class="chapter" data-level="9.3.1" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#understanding-linear-regression-basics"><i class="fa fa-check"></i><b>9.3.1</b> Understanding linear regression basics</a></li>
-<li class="chapter" data-level="9.3.2" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#the-equation-of-the-line"><i class="fa fa-check"></i><b>9.3.2</b> The equation of the line</a></li>
-<li class="chapter" data-level="9.3.3" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#interpretting-the-slope"><i class="fa fa-check"></i><b>9.3.3</b> Interpretting the slope</a></li>
-<li class="chapter" data-level="9.3.4" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#predicting-values"><i class="fa fa-check"></i><b>9.3.4</b> Predicting values</a></li>
+<li class="chapter" data-level="9.3" data-path="9-regress.html"><a href="9-regress.html#linear-regression"><i class="fa fa-check"></i><b>9.3</b> Linear regression</a><ul>
+<li class="chapter" data-level="9.3.1" data-path="9-regress.html"><a href="9-regress.html#understanding-linear-regression-basics"><i class="fa fa-check"></i><b>9.3.1</b> Understanding linear regression basics</a></li>
+<li class="chapter" data-level="9.3.2" data-path="9-regress.html"><a href="9-regress.html#the-equation-of-the-line"><i class="fa fa-check"></i><b>9.3.2</b> The equation of the line</a></li>
+<li class="chapter" data-level="9.3.3" data-path="9-regress.html"><a href="9-regress.html#interpreting-the-slope"><i class="fa fa-check"></i><b>9.3.3</b> Interpreting the slope</a></li>
+<li class="chapter" data-level="9.3.4" data-path="9-regress.html"><a href="9-regress.html#predicting-values"><i class="fa fa-check"></i><b>9.3.4</b> Predicting values</a></li>
 </ul></li>
-<li class="chapter" data-level="9.4" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#inference-for-regression"><i class="fa fa-check"></i><b>9.4</b> Inference for regression</a><ul>
-<li class="chapter" data-level="9.4.1" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#data-2"><i class="fa fa-check"></i><b>9.4.1</b> Data</a></li>
-<li class="chapter" data-level="9.4.2" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#test-statistic-delta-2"><i class="fa fa-check"></i><b>9.4.2</b> Test Statistic <span class="math inline">\(\delta\)</span></a></li>
-<li class="chapter" data-level="9.4.3" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#observed-effect-delta-2"><i class="fa fa-check"></i><b>9.4.3</b> Observed effect <span class="math inline">\(\delta^*\)</span></a></li>
-<li class="chapter" data-level="9.4.4" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#model-of-h_0-2"><i class="fa fa-check"></i><b>9.4.4</b> Model of <span class="math inline">\(H_0\)</span></a></li>
-<li class="chapter" data-level="9.4.5" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#simulated-data-2"><i class="fa fa-check"></i><b>9.4.5</b> Simulated Data</a></li>
-<li class="chapter" data-level="9.4.6" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#distribution-of-delta-under-h_0-2"><i class="fa fa-check"></i><b>9.4.6</b> Distribution of <span class="math inline">\(\delta\)</span> under <span class="math inline">\(H_0\)</span></a></li>
-<li class="chapter" data-level="9.4.7" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#the-p-value-2"><i class="fa fa-check"></i><b>9.4.7</b> The p-value</a></li>
+<li class="chapter" data-level="9.4" data-path="9-regress.html"><a href="9-regress.html#inference-for-regression"><i class="fa fa-check"></i><b>9.4</b> Inference for regression</a><ul>
+<li class="chapter" data-level="9.4.1" data-path="9-regress.html"><a href="9-regress.html#data-2"><i class="fa fa-check"></i><b>9.4.1</b> Data</a></li>
+<li class="chapter" data-level="9.4.2" data-path="9-regress.html"><a href="9-regress.html#test-statistic-delta-2"><i class="fa fa-check"></i><b>9.4.2</b> Test Statistic <span class="math inline">\(\delta\)</span></a></li>
+<li class="chapter" data-level="9.4.3" data-path="9-regress.html"><a href="9-regress.html#observed-effect-delta-2"><i class="fa fa-check"></i><b>9.4.3</b> Observed effect <span class="math inline">\(\delta^*\)</span></a></li>
+<li class="chapter" data-level="9.4.4" data-path="9-regress.html"><a href="9-regress.html#model-of-h_0-2"><i class="fa fa-check"></i><b>9.4.4</b> Model of <span class="math inline">\(H_0\)</span></a></li>
+<li class="chapter" data-level="9.4.5" data-path="9-regress.html"><a href="9-regress.html#simulated-data-2"><i class="fa fa-check"></i><b>9.4.5</b> Simulated Data</a></li>
+<li class="chapter" data-level="9.4.6" data-path="9-regress.html"><a href="9-regress.html#distribution-of-delta-under-h_0-2"><i class="fa fa-check"></i><b>9.4.6</b> Distribution of <span class="math inline">\(\delta\)</span> under <span class="math inline">\(H_0\)</span></a></li>
+<li class="chapter" data-level="9.4.7" data-path="9-regress.html"><a href="9-regress.html#the-p-value-2"><i class="fa fa-check"></i><b>9.4.7</b> The p-value</a></li>
+</ul></li>
+<li class="chapter" data-level="9.5" data-path="9-regress.html"><a href="9-regress.html#resid"><i class="fa fa-check"></i><b>9.5</b> Residual analysis</a></li>
+<li class="chapter" data-level="9.6" data-path="9-regress.html"><a href="9-regress.html#conditions-for-regression"><i class="fa fa-check"></i><b>9.6</b> Conditions for regression</a></li>
+<li class="chapter" data-level="9.7" data-path="9-regress.html"><a href="9-regress.html#conclusion-5"><i class="fa fa-check"></i><b>9.7</b> Conclusion</a><ul>
+<li class="chapter" data-level="9.7.1" data-path="9-regress.html"><a href="9-regress.html#script-of-r-code-5"><i class="fa fa-check"></i><b>9.7.1</b> Script of R code</a></li>
+<li class="chapter" data-level="9.7.2" data-path="9-regress.html"><a href="9-regress.html#whats-to-come-6"><i class="fa fa-check"></i><b>9.7.2</b> What’s to come?</a></li>
 </ul></li>
-<li class="chapter" data-level="9.5" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#resid"><i class="fa fa-check"></i><b>9.5</b> Residual analysis</a></li>
-<li class="chapter" data-level="9.6" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#conditions-for-regression"><i class="fa fa-check"></i><b>9.6</b> Conditions for regression</a></li>
-<li class="chapter" data-level="9.7" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#script-of-r-code-4"><i class="fa fa-check"></i><b>9.7</b> Script of R code</a></li>
-<li class="chapter" data-level="9.8" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#whats-to-come-6"><i class="fa fa-check"></i><b>9.8</b> What’s to come?</a></li>
 </ul></li>
 <li class="part"><span><b>III Conclusion</b></span></li>
 <li class="chapter" data-level="10" data-path="10-effective-data-storytelling.html"><a href="10-effective-data-storytelling.html"><i class="fa fa-check"></i><b>10</b> Effective Data Storytelling</a><ul>
@@ -304,7 +323,8 @@
 </ul></li>
 </ul></li>
 <li class="chapter" data-level="B" data-path="B-appendixB.html"><a href="B-appendixB.html"><i class="fa fa-check"></i><b>B</b> Inference Examples</a><ul>
-<li class="chapter" data-level="B.1" data-path="B-appendixB.html"><a href="B-appendixB.html#needed-packages-6"><i class="fa fa-check"></i><b>B.1</b> Needed packages</a></li>
+<li class="chapter" data-level="" data-path="B-appendixB.html"><a href="B-appendixB.html#needed-packages-7"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="B.1" data-path="B-appendixB.html"><a href="B-appendixB.html#inference-mind-map"><i class="fa fa-check"></i><b>B.1</b> Inference Mind Map</a></li>
 <li class="chapter" data-level="B.2" data-path="B-appendixB.html"><a href="B-appendixB.html#one-mean"><i class="fa fa-check"></i><b>B.2</b> One Mean</a><ul>
 <li class="chapter" data-level="B.2.1" data-path="B-appendixB.html"><a href="B-appendixB.html#problem-statement"><i class="fa fa-check"></i><b>B.2.1</b> Problem Statement</a></li>
 <li class="chapter" data-level="B.2.2" data-path="B-appendixB.html"><a href="B-appendixB.html#competing-hypotheses"><i class="fa fa-check"></i><b>B.2.2</b> Competing Hypotheses</a></li>
@@ -352,6 +372,7 @@
 </ul></li>
 </ul></li>
 <li class="chapter" data-level="C" data-path="C-appendixC.html"><a href="C-appendixC.html"><i class="fa fa-check"></i><b>C</b> Reach for the Starts</a><ul>
+<li class="chapter" data-level="" data-path="C-appendixC.html"><a href="C-appendixC.html#needed-packages-8"><i class="fa fa-check"></i>Needed packages</a></li>
 <li class="chapter" data-level="C.1" data-path="C-appendixC.html"><a href="C-appendixC.html#sorted-barplots"><i class="fa fa-check"></i><b>C.1</b> Sorted barplots</a></li>
 <li class="chapter" data-level="C.2" data-path="C-appendixC.html"><a href="C-appendixC.html#interactive-graphics"><i class="fa fa-check"></i><b>C.2</b> Interactive graphics</a><ul>
 <li class="chapter" data-level="C.2.1" data-path="C-appendixC.html"><a href="C-appendixC.html#interactive-line-graphs"><i class="fa fa-check"></i><b>C.2.1</b> Interactive line-graphs</a></li>
@@ -375,21 +396,22 @@ <h1>
           <div class="page-inner">
 
             <section class="normal" id="section-">
-<div id="data-visualization-via-ggplot2" class="section level1">
-<h1><span class="header-section-number">4</span> Data Visualization via <code id="viz">ggplot2</code></h1>
+<div id="viz" class="section level1">
+<h1><span class="header-section-number">4</span> Data Visualization via ggplot2</h1>
 <p>In Chapter <a href="3-tidy.html#tidy">3</a>, we discussed the importance of datasets being <strong>tidy</strong>. You will see in examples here why having a tidy dataset helps us immensely when plotting our data. In plotting our data, we will be able to gain valuable insights from our data that we couldn’t initially see from just looking at the raw data. We will focus on using Hadley Wickham’s <code>ggplot2</code> package in doing so, which was developed to work specifically on datasets that are <strong>tidy</strong>. It provides an easy way to customize your plots and is based on data visualization theory given in <em>The Grammar of Graphics</em> <span class="citation">(Wilkinson <a href="#ref-wilkinson2005">2005</a>)</span>.</p>
-<p>At the most basic level, graphics/plots/charts provide a nice way for us to get a sense for how quantitative variables compare in terms of their center and their spread. The most important thing to know about graphics is that they should be created to make it obvious for your audience to see the findings you want to get across. This requires a balance of not including too much in your plots, but also including enough so that relationships and interesting findings can be easily seen. As we will see, plots/graphics also help us to identify patterns and outliers in our data. We will see that a common extension of these ideas is to compare the distribution of one quantitative variable (i.e., what the spread of a variable looks like) as we go across the levels of a different categorical variable.</p>
-<div id="needed-packages" class="section level2 unnumbered">
-<h2>Needed packages</h2>
-<p>Before we proceed with this chapter, let’s load all the necessary packages, in particular the <code>nycflights13</code> package introduced in Chapter <a href="3-tidy.html#tidy">3</a> containing various data sets.</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">library</span>(dplyr)
-<span class="kw">library</span>(ggplot2)
-<span class="kw">library</span>(nycflights13)</code></pre></div>
+<p>At the most basic level, graphics/plots/charts provide a nice way for us to get a sense for how quantitative variables compare in terms of their center and their spread. The most important thing to know about graphics is that they should be created to make it obvious for your audience to see the findings you want to get across. This requires a balance of not including too much in your plots, but also including enough so that relationships and interesting findings can be easily seen. As we will see, plots/graphics also help us to identify patterns and outliers in our data. We will see that a common extension of these ideas is to compare the <strong>distribution</strong> of one quantitative variable (i.e., what the spread of a variable looks like or how the variable is <em>distributed</em> in terms of its values) as we go across the levels of a different categorical variable.</p>
+<div id="needed-packages-1" class="section level3 unnumbered">
+<h3>Needed packages</h3>
+<p>Before we proceed with this chapter, let’s load all the necessary packages.</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">library</span>(ggplot2)
+<span class="kw">library</span>(nycflights13)
+<span class="kw">library</span>(knitr)
+<span class="kw">library</span>(dplyr)</code></pre></div>
 <!--Subsection on Grammar of Graphics -->
 </div>
 <div id="grammarofgraphics" class="section level2">
 <h2><span class="header-section-number">4.1</span> The Grammar of Graphics</h2>
-<p>We begin with a discussion of a theoretical framework for data visualization known as the “The Grammar of Graphics”, which serves as the basis for the <code>ggplot2</code> package. Much like the way we construct sentences in any language using a linguistic grammar (nouns, verbs, subjects, objects, etc.), the theoretical framework given by Leland Wilkinson <span class="citation">(Wilkinson <a href="#ref-wilkinson2005">2005</a>)</span> allows us to specify the components of a statistical graphic.</p>
+<p>We begin with a discussion of a theoretical framework for data visualization known as “The Grammar of Graphics,” which serves as the basis for the <code>ggplot2</code> package. Much like the way we construct sentences in any language using a linguistic grammar (nouns, verbs, subjects, objects, etc.), the theoretical framework given by Leland Wilkinson <span class="citation">(Wilkinson <a href="#ref-wilkinson2005">2005</a>)</span> allows us to specify the components of a statistical graphic.</p>
 <div id="components-of-grammar" class="section level3">
 <h3><span class="header-section-number">4.1.1</span> Components of Grammar</h3>
 <p>In short, the grammar tells us that:</p>
@@ -405,7 +427,7 @@ <h3><span class="header-section-number">4.1.1</span> Components of Grammar</h3>
 </div>
 <div id="napoleans-march-on-moscow" class="section level3">
 <h3><span class="header-section-number">4.1.2</span> Napolean’s March on Moscow</h3>
-<p>In 1812, Napoleon led a French invasion of Russia, marching on Moscow. It was one of the biggest military disasters due in large part to the Russian winter. In 1869, a French civil engineer named Charles Joseph Minard published arguably one of the greatest statistical visualizations of all time which summarized this march:</p>
+<p>In 1812, Napoleon led a French invasion of Russia, marching on Moscow. It was one of the biggest military disasters due in large part to the Russian winter. In 1869, a French civil engineer named Charles Joseph Minard published arguably one of the greatest statistical visualizations of all time, which summarized this march:</p>
 <div class="figure" style="text-align: center"><span id="fig:minard"></span>
 <img src="images/Minard.png" alt="Minard's Visualization of Napolean's March" width="\textwidth" />
 <p class="caption">
@@ -415,7 +437,7 @@ <h3><span class="header-section-number">4.1.2</span> Napolean’s March on Mosco
 <p>This was considered a revolution in statistical graphics because between the map on top and the line graph on the bottom, there are 6 dimensions of information (i.e. variables) being displayed on a 2-dimensional page. Let’s view this graphic through the lens of the Grammar of Graphics:</p>
 <table class="kable_wrapper">
 <caption>
-<span id="tab:unnamed-chunk-13">Table 4.1: </span>Grammar of Map (Top) and Line-Graph (Bottom) in Minard’s Graphic of Napolean’s March
+<span id="tab:unnamed-chunk-16">Table 4.1: </span>Grammar of Map (Top) and Line-Graph (Bottom) in Minard’s Graphic of Napolean’s March
 </caption>
 <tbody>
 <tr>
@@ -478,41 +500,36 @@ <h3><span class="header-section-number">4.1.2</span> Napolean’s March on Mosco
 </tr>
 </tbody>
 </table>
-<p>For example, the data variable <code>longitude</code> gets mapped to <code>x</code> <code>aes</code>thetic of the points <code>geom</code>etric objects on the map while the annotated line-graph displays <code>date</code> and <code>temperature</code> variable information via its mapping to the <code>x</code> and <code>y</code> aesthetic of the line <code>geom</code>etric object.</p>
+<p>For example, the data variable <code>longitude</code> gets mapped to the <code>x</code> <code>aes</code>thetic of the points <code>geom</code>etric objects on the map while the annotated line-graph displays <code>date</code> and <code>temperature</code> variable information via its mapping to the <code>x</code> and <code>y</code> <code>aes</code>thetic of the line <code>geom</code>etric object.</p>
 </div>
 <div id="other-components-of-the-grammar" class="section level3">
 <h3><span class="header-section-number">4.1.3</span> Other Components of the Grammar</h3>
 <p>There are other components of the Grammar of Graphics we can control:</p>
 <ul>
 <li><code>facet</code>: how to break up a plot into subsets</li>
-<li><code>stat</code>istical transformations: this includes smoothing, binning values into a histogram, or just itself untransformed <code>&quot;identity&quot;</code>.</li>
+<li><code>stat</code>istical transformations: this includes smoothing, binning values into a histogram, or just itself un-transformed as <code>&quot;identity&quot;</code>.</li>
 <li><code>scales</code> both
 <ul>
 <li>convert <strong>data units</strong> to <strong>physical units</strong> the computer can display</li>
 <li>draw a legend and/or axes, which provide an inverse mapping to make it possible to read the original data values from the graph.</li>
 </ul></li>
-<li><code>coord</code>inate system for x/y values: typically <code>cartesian</code>, but can also be <code>polar</code>, <code>map</code></li>
+<li><code>coord</code>inate system for x/y values: typically <code>cartesian</code>, but can also be <code>polar</code> or <code>map</code></li>
 <li><code>position</code> adjustments</li>
 </ul>
-<p>In this text, we will only focus on the first two: <code>facet</code>ing (introduced in Section <a href="4-data-visualization-via-ggplot2.html#facets">4.6</a>) and <code>stat</code>istical transformations (in a limited sense when consider Barplots in Section <a href="4-data-visualization-via-ggplot2.html#geombar">4.8</a>) ; the other components are left to a more advanced text. This is not a problem when producing a plot as each of these components have default settings.</p>
+<p>In this text, we will only focus on the first two: <code>facet</code>ing (introduced in Section <a href="4-viz.html#facets">4.6</a>) and <code>stat</code>istical transformations (in a limited sense, when we consider Barplots in Section <a href="4-viz.html#geombar">4.8</a>); the other components are left to a more advanced text. This is not a problem when producing a plot as each of these components has default settings.</p>
 <p>There are other extra attributes that can be tweaked as well including the plot title, axes labels, and over-arching themes for the plot. In general, the Grammar of Graphics allows for customization but also a consistent framework that allows the user to easily tweak their creations as needed in order to convey a message about their data.</p>
 </div>
 <div id="the-ggplot2-package" class="section level3">
-<h3><span class="header-section-number">4.1.4</span> The <code>ggplot2</code> Package</h3>
-<p>We introduce Hadley Wickham’s <code>ggplot2</code> package, which is an implementation of the Grammar of Graphics for R <span class="citation">(Wickham and Chang <a href="#ref-R-ggplot2">2016</a>)</span>. You may have noticed that a lot of previous text in this chapter is written in computer font. This is because the various components of the Grammar of Graphics are specified using the <code>ggplot</code> function, which expects at a bare minimal as arguments</p>
+<h3><span class="header-section-number">4.1.4</span> The ggplot2 Package</h3>
+<p>We next introduce Hadley Wickham’s <code>ggplot2</code> package, which is an implementation of the Grammar of Graphics for R <span class="citation">(Wickham and Chang <a href="#ref-R-ggplot2">2016</a>)</span>. You may have noticed that a lot of previous text in this chapter is written in computer font. This is because the various components of the Grammar of Graphics are specified using the <code>ggplot</code> function, which expects at a bare minimum as arguments</p>
 <ul>
 <li>the data frame where the variables exist (the <code>data</code> argument) and</li>
 <li>the names of the variables to be plotted (the <code>mapping</code> argument).</li>
 </ul>
 <p>The names of the variables will be entered into the <code>aes</code> function as arguments where <code>aes</code> stands for “aesthetics”.</p>
-<p>The plot given above is not a histogram, but the output does show us a bit of what is going on with <code>ggplot(data = weather, mapping = aes(x = temp))</code>. It is producing a backdrop onto which we will “paint” elements. We next proceed by adding a layer—hence, the use of the <code>+</code> symbol—to the plot to produce a histogram. (Note also here that we don’t have to specify the <code>data =</code> and <code>mapping =</code> text in our function calls. This is covered in more detail in Chapter 5 of the “Getting Used to R, RStudio, and R Markdown” book <span class="citation">(Ismay <a href="#ref-usedtor2016">2016</a>)</span>).</p>
-<p>You are encouraged to enter <strong>Return</strong> on your keyboard after entering the <code>+</code>. As we add more and more elements, it will be nice to keep them indented as you see below. Note that this will not work if you begin the line with the <code>+</code>.</p>
-<p>An excellent resource as you begin to create plots using the <code>ggplot2</code> package is a cheatsheet that RStudio has put together entitled “Data Visualization with ggplot2” available</p>
-<ul>
-<li>By clicking <a href="https://www.rstudio.com/wp-content/uploads/2015/12/ggplot2-cheatsheet-2.0.pdf">here</a></li>
-<li>or by clicking the RStudio Menu Bar -&gt; Help -&gt; Cheatsheets -&gt; “Data Visualization with ggplot2”</li>
-</ul>
-<p>This covers more than what we’ve discussed in this chapter but provides nice visual descriptions of what each function produces.</p>
+<!--
+The plot given above is not a histogram, but the output does show us a bit of what is going on with `ggplot(data = weather, mapping = aes(x = temp))`.  It is producing a backdrop onto which we will "paint" elements.  We next proceed by adding a layer---hence, the use of the `+` symbol---to the plot to produce a histogram.  (Note also here that we don't have to specify the `data = ` and `mapping = ` text in our function calls.  This is covered in more detail in Chapter 5 of the "Getting Used to R, RStudio, and R Markdown" book [@usedtor2016]).
+-->
 <!--
 <div class="review">
 <p><strong><em>Review questions</em></strong></p>
@@ -543,26 +560,26 @@ <h2><span class="header-section-number">4.2</span> Five Named Graphs - The 5NG</
 <h2><span class="header-section-number">4.3</span> 5NG#1: Scatter-plots</h2>
 <p>The simplest of the 5NG are <strong>scatter-plots</strong> (also called bivariate plots); they allow you to investigate the relationship between two continuous variables. While you may already be familiar with such plots, let’s view it through the lens of the Grammar of Graphics. Specifically, we will graphically investigate the relationship between the following two continuous variables in the <code>flights</code> data frame:</p>
 <ol style="list-style-type: decimal">
-<li><code>dep_delay</code>: departure delay on the horizontal “x” axis</li>
+<li><code>dep_delay</code>: departure delay on the horizontal “x” axis and</li>
 <li><code>arr_delay</code>: arrival delay on the vertical “y” axis</li>
 </ol>
-<p>for Alaska Airlines flights leaving NYC in 2013. This requires paring down the <code>flights</code> data frame to a smaller data frame <code>alaska_flights</code> consisting of only Alaska Airlines (carrier code “AS”) flights.</p>
+<p>for Alaska Airlines flights leaving NYC in 2013. This requires paring down the <code>flights</code> data frame to a smaller data frame <code>all_alaska_flights</code> consisting of only Alaska Airlines (carrier code “AS”) flights.</p>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">data</span>(flights)
-alaska_flights &lt;-<span class="st"> </span>flights %&gt;%<span class="st"> </span>
+all_alaska_flights &lt;-<span class="st"> </span>flights %&gt;%<span class="st"> </span>
 <span class="st">  </span><span class="kw">filter</span>(carrier ==<span class="st"> &quot;AS&quot;</span>)</code></pre></div>
-<p>This code snippet makes use of functions in the <code>dplyr</code> package for data manipulation to achieve our goal: it takes the <code>flights</code> data frame and <code>filter</code>s it to only return the rows which meet the condition <code>carrier == &quot;AS&quot;</code> (recall equality is specified with <code>==</code> and not <code>=</code>). You will see many more examples using this function in Chapter <a href="5-data-manipulation-via-dplyr.html#manip"><strong>??</strong></a>.</p>
+<p>This code snippet makes use of functions in the <code>dplyr</code> package for data manipulation to achieve our goal: it takes the <code>flights</code> data frame and <code>filter</code>s it to only return the rows which meet the condition <code>carrier == &quot;AS&quot;</code> (recall equality is specified with <code>==</code> and not <code>=</code>). You will see many more examples using this function in Chapter <a href="5-manip.html#manip">5</a>.</p>
 <hr />
 <div class="learncheck">
 <p>
 <strong><em>Learning check</em></strong>
 </p>
 </div>
-<p><strong>(LC3.1)</strong> Take a look at both the <code>flights</code> and <code>alaska_flights</code> data frames by running <code>View(flights)</code> and <code>View(alaska_flights)</code> in the console. In what respect do these data frames differ?</p>
+<p><strong>(LC4.1)</strong> Take a look at both the <code>flights</code> and <code>all_alaska_flights</code> data frames by running <code>View(flights)</code> and <code>View(all_alaska_flights)</code> in the console. In what respect do these data frames differ?</p>
 <hr />
-<div id="scatter-plots-via-geom_point" class="section level3">
-<h3><span class="header-section-number">4.3.1</span> Scatter-plots via <code id="geompoint">geom_point</code></h3>
+<div id="geompoint" class="section level3">
+<h3><span class="header-section-number">4.3.1</span> Scatter-plots via geom_point</h3>
 <p>We proceed to create the scatter-plot using the <code>ggplot()</code> function:</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">ggplot</span>(<span class="dt">data=</span>alaska_flights, <span class="kw">aes</span>(<span class="dt">x =</span> dep_delay, <span class="dt">y =</span> arr_delay)) +<span class="st"> </span>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">ggplot</span>(<span class="dt">data =</span> all_alaska_flights, <span class="kw">aes</span>(<span class="dt">x =</span> dep_delay, <span class="dt">y =</span> arr_delay)) +<span class="st"> </span>
 <span class="st">  </span><span class="kw">geom_point</span>()</code></pre></div>
 <div class="figure" style="text-align: center"><span id="fig:noalpha"></span>
 <img src="ismaykim_files/figure-html/noalpha-1.png" alt="Arrival Delays vs Departure Delays for Alaska Airlines flights from NYC in 2013" width="\textwidth" />
@@ -570,11 +587,12 @@ <h3><span class="header-section-number">4.3.1</span> Scatter-plots via <code id=
 Figure 4.2: Arrival Delays vs Departure Delays for Alaska Airlines flights from NYC in 2013
 </p>
 </div>
-<p>Let’s break down this keeping in mind our discussion in Section <a href="4-data-visualization-via-ggplot2.html#grammarofgraphics">4.1</a>:</p>
+<p>You are encouraged to press <strong>Return</strong> on your keyboard after typing the <code>+</code>. As we add more and more elements, it will be nice to keep them indented as you see below. Note that the code will not work if you begin a line with the <code>+</code> instead of ending the previous line with it.</p>
+<p>Let’s break down this keeping in mind our discussion in Section <a href="4-viz.html#grammarofgraphics">4.1</a>:</p>
 <ul>
 <li>Within the <code>ggplot()</code> function call, we specify two of the components of the grammar:
 <ol style="list-style-type: decimal">
-<li>The <code>data</code> frame to be <code>alaska_flights</code> by setting <code>data=alaska_flights</code></li>
+<li>The <code>data</code> frame to be <code>all_alaska_flights</code> by setting <code>data = all_alaska_flights</code></li>
 <li>The <code>aes</code>thetic mapping by setting <code>aes(x = dep_delay, y = arr_delay)</code>. Specifically
 <ul>
 <li><code>dep_delay</code> maps to the <code>x</code> position</li>
@@ -584,18 +602,18 @@ <h3><span class="header-section-number">4.3.1</span> Scatter-plots via <code id=
 <li>We add a <strong>layer</strong> to the <code>ggplot()</code> function call using the <code>+</code> sign</li>
 <li>The layer in question specifies the third component of the grammar: the <code>geom</code>etric object in question. In this case the geometric object are <code>point</code>s, set by specifying <code>geom_point()</code></li>
 </ul>
-<p>In Figure <a href="4-data-visualization-via-ggplot2.html#fig:noalpha">4.2</a> we see that a positive relationship exists between <code>dep_delay</code> and <code>arr_delay</code>: as departure delays increase, arrival delays tend to also increase. We also note that the majority of points fall near the point (0, 0). There is a large mass of points clustered there.</p>
+<p>In Figure <a href="4-viz.html#fig:noalpha">4.2</a> we see that a positive relationship exists between <code>dep_delay</code> and <code>arr_delay</code>: as departure delays increase, arrival delays tend to also increase. We also note that the majority of points fall near the point (0, 0). There is a large mass of points clustered there. (We will work more with this data set in Chapter <a href="9-regress.html#regress">9</a>, where we investigate correlation and linear regression.)</p>
 <hr />
 <div class="learncheck">
 <p>
 <strong><em>Learning check</em></strong>
 </p>
 </div>
-<p><strong>(LC3.2)</strong> What are some practical reasons why <code>dep_delay</code> and <code>arr_delay</code> have a positive relationship?</p>
-<p><strong>(LC3.3)</strong> What variables (not necessarily in the <code>flights</code> data frame) would you expect to have a negative correlation (i.e. a negative relationship) with <code>dep_delay</code>? Why? Remember that we are focusing on continuous variables here.</p>
-<p><strong>(LC3.4)</strong> Why do you believe there is a cluster of points near (0, 0)? What does (0, 0) correspond to in terms of the Alaskan flights?</p>
-<p><strong>(LC3.5)</strong> What are some other features of the plot that stand out to you?</p>
-<p><strong>(LC3.6)</strong> Create a new scatter-plot using different variables in the <code>alaska_flights</code> data frame by modifying the example above.</p>
+<p><strong>(LC4.2)</strong> What are some practical reasons why <code>dep_delay</code> and <code>arr_delay</code> have a positive relationship?</p>
+<p><strong>(LC4.3)</strong> What variables (not necessarily in the <code>flights</code> data frame) would you expect to have a negative correlation (i.e. a negative relationship) with <code>dep_delay</code>? Why? Remember that we are focusing on continuous variables here.</p>
+<p><strong>(LC4.4)</strong> Why do you believe there is a cluster of points near (0, 0)? What does (0, 0) correspond to in terms of the Alaskan flights?</p>
+<p><strong>(LC4.5)</strong> What are some other features of the plot that stand out to you?</p>
+<p><strong>(LC4.6)</strong> Create a new scatter-plot using different variables in the <code>all_alaska_flights</code> data frame by modifying the example above.</p>
 <hr />
 </div>
 <div id="over-plotting" class="section level3">
@@ -605,8 +623,8 @@ <h3><span class="header-section-number">4.3.2</span> Over-Plotting</h3>
 <li>By adjusting the transparency of the points via the <code>alpha</code> argument</li>
 <li>By jittering the points via <code>geom_jitter()</code></li>
 </ol>
-<p>The first way of relieving over-plotting is by changing the <code>alpha</code> argument to <code>geom_point()</code> which controls the transparency of the points. By default, this value is set to <code>1</code>. We can change this value to a smaller fraction to change the transparency of the points in the plot:</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">ggplot</span>(<span class="dt">data=</span>alaska_flights, <span class="kw">aes</span>(<span class="dt">x =</span> dep_delay, <span class="dt">y =</span> arr_delay)) +<span class="st"> </span>
+<p>The first way of relieving over-plotting is by changing the <code>alpha</code> argument to <code>geom_point()</code> which controls the transparency of the points. By default, this value is set to <code>1</code>. We can change this value to a smaller fraction (greater than 0) to change the transparency of the points in the plot:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">ggplot</span>(<span class="dt">data =</span> all_alaska_flights, <span class="kw">aes</span>(<span class="dt">x =</span> dep_delay, <span class="dt">y =</span> arr_delay)) +<span class="st"> </span>
 <span class="st">  </span><span class="kw">geom_point</span>(<span class="dt">alpha =</span> <span class="fl">0.2</span>)</code></pre></div>
 <div class="figure" style="text-align: center"><span id="fig:alpha"></span>
 <img src="ismaykim_files/figure-html/alpha-1.png" alt="Delay scatterplot with alpha=0.2" width="\textwidth" />
@@ -614,9 +632,9 @@ <h3><span class="header-section-number">4.3.2</span> Over-Plotting</h3>
 Figure 4.3: Delay scatterplot with alpha=0.2
 </p>
 </div>
-<p>Note how this function call is identical to the one in Section <a href="4-data-visualization-via-ggplot2.html#scatterplots">4.3</a>, but with <code>geom_point()</code> replaced with <code>alpha=0.2</code> added.</p>
+<p>Note how this function call is identical to the one in Section <a href="4-viz.html#scatterplots">4.3</a>, but with <code>geom_point()</code> replaced with <code>geom_point(alpha = 0.2)</code>, i.e. with the <code>alpha = 0.2</code> argument added.</p>
 <p>The second way of relieving over-plotting is to <strong>jitter</strong> the points a bit. In other words, we are going to add just a bit of random noise to the points to better see them and remove some of the over-plotting. You can think of “jittering” as shaking the points a bit on the plot. Instead of using <code>geom_point</code>, we use <code>geom_jitter</code> to perform this shaking and specify around how much jitter to add with the <code>width</code> and <code>height</code> arguments. This corresponds to how hard you’d like to shake the plot in units corresponding to those for both the horizontal and vertical variables (in this case minutes).</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">ggplot</span>(<span class="dt">data=</span>alaska_flights, <span class="kw">aes</span>(<span class="dt">x =</span> dep_delay, <span class="dt">y =</span> arr_delay)) +<span class="st"> </span>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">ggplot</span>(<span class="dt">data =</span> all_alaska_flights, <span class="kw">aes</span>(<span class="dt">x =</span> dep_delay, <span class="dt">y =</span> arr_delay)) +<span class="st"> </span>
 <span class="st">  </span><span class="kw">geom_jitter</span>(<span class="dt">width =</span> <span class="dv">30</span>, <span class="dt">height =</span> <span class="dv">30</span>)</code></pre></div>
 <div class="figure" style="text-align: center"><span id="fig:jitter"></span>
 <img src="ismaykim_files/figure-html/jitter-1.png" alt="Jittered delay scatterplot" width="\textwidth" />
@@ -624,15 +642,15 @@ <h3><span class="header-section-number">4.3.2</span> Over-Plotting</h3>
 Figure 4.4: Jittered delay scatterplot
 </p>
 </div>
-<p>Note how this function call is identical to the one in Section <a href="4-data-visualization-via-ggplot2.html#geompoint"><strong>??</strong></a>, but with <code>geom_point()</code> replaced with <code>geom_jitter()</code>. The plot in <a href="4-data-visualization-via-ggplot2.html#fig:jitter">4.4</a> helps us a little bit in getting a sense for the over-plotting, but with a relatively large dataset like this one (714 flights), it can be argued that changing the transparency of the points by setting <code>alpha</code> proved more effective.</p>
+<p>Note how this function call is identical to the one in Section <a href="4-viz.html#geompoint">4.3.1</a>, but with <code>geom_point()</code> replaced with <code>geom_jitter()</code>. The plot in Figure <a href="4-viz.html#fig:jitter">4.4</a> helps us a little bit in getting a sense for the over-plotting, but with a relatively large dataset like this one (714 flights), it can be argued that changing the transparency of the points by setting <code>alpha</code> proved more effective.</p>
 <hr />
 <div class="learncheck">
 <p>
 <strong><em>Learning check</em></strong>
 </p>
 </div>
-<p><strong>(LC3.7)</strong> Why is setting the <code>alpha</code> argument value useful with scatter-plots? What further information does it give you that a regular scatter-plot cannot?</p>
-<p><strong>(LC3.8)</strong> After viewing the Figure <a href="4-data-visualization-via-ggplot2.html#fig:alpha">4.3</a> above, give a range of arrival times and departure times that occur most frequently? How has that region changed compared to when you observed the same plot without the <code>alpha = 0.2</code> set in Figure <a href="4-data-visualization-via-ggplot2.html#fig:noalpha">4.2</a>?</p>
+<p><strong>(LC4.7)</strong> Why is setting the <code>alpha</code> argument value useful with scatter-plots? What further information does it give you that a regular scatter-plot cannot?</p>
+<p><strong>(LC4.8)</strong> After viewing Figure <a href="4-viz.html#fig:alpha">4.3</a> above, give a range of arrival delays and departure delays that occur most frequently. How has that region changed compared to when you observed the same plot without the <code>alpha = 0.2</code> set in Figure <a href="4-viz.html#fig:noalpha">4.2</a>?</p>
 <hr />
 <!--
 Maybe include a shading of the points by another variable example here for multivariate thinking?
@@ -647,30 +665,30 @@ <h3><span class="header-section-number">4.3.3</span> Summary</h3>
 </div>
 <div id="linegraphs" class="section level2">
 <h2><span class="header-section-number">4.4</span> 5NG#2: Line-graphs</h2>
-<p>The next of the 5NG is a line-graph. They are most frequently used when the x-axis represents time and the y-axis represents some other numerical variable; such plots are known as <strong>time series</strong>. Time represents a variable that is connected together by each day following the previous day. In other words, time has a natural ordering. Line-graphs should be avoided when there is not a clear sequential ordering to the explanatory variable i.e. the x-variable.</p>
+<p>The next of the 5NG is a line-graph. Line-graphs are most frequently used when the x-axis represents time and the y-axis represents some other numerical variable; such plots are known as <strong>time series</strong>. Time represents a variable that is connected together by each day following the previous day. In other words, time has a natural ordering. Line-graphs should be avoided when there is not a clear sequential ordering to the explanatory variable, i.e. the x-variable or the <em>predictor</em> variable.</p>
 <p>Our focus turns to the <code>temp</code> variable in this <code>weather</code> dataset. By</p>
 <ul>
 <li>Looking over the <code>weather</code> dataset by typing <code>View(weather)</code> in the console.</li>
 <li>Running <code>?weather</code> to bring up the help file.</li>
 </ul>
-<p>We can see that the <code>temp</code> variable corresponds to hourly temperature (in Fahrenheit) recordings at weather stations near airports in New York City. Instead of considering all hours in 2013 for all three airports in NYC, let’s focus in the hourly temperature at Newark airport (<code>origin</code> code “EWR”) for the first 15 days in January 2013. The <code>weather</code> data frame in the <code>nycflights13</code> package contains this data, but we first need to filter it to only include those rows that correspond to Newark in the first 15 days of January.</p>
+<p>We can see that the <code>temp</code> variable corresponds to hourly temperature (in Fahrenheit) recordings at weather stations near airports in New York City. Instead of considering all hours in 2013 for all three airports in NYC, let’s focus on the hourly temperature at Newark airport (<code>origin</code> code “EWR”) for the first 15 days in January 2013. The <code>weather</code> data frame in the <code>nycflights13</code> package contains this data, but we first need to filter it to only include those rows that correspond to Newark in the first 15 days of January.</p>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">data</span>(weather)
 early_january_weather &lt;-<span class="st"> </span>weather %&gt;%<span class="st"> </span>
-<span class="st">  </span><span class="kw">filter</span>(origin==<span class="st">&quot;EWR&quot;</span> &amp;<span class="st"> </span>month ==<span class="st"> </span><span class="dv">1</span> &amp;<span class="st"> </span>day &lt;=<span class="st"> </span><span class="dv">15</span>)</code></pre></div>
-<p>This is very similar to the previous use of the <code>filter</code> command in Section <a href="4-data-visualization-via-ggplot2.html#scatterplots">4.3</a>, however we now use the <code>&amp;</code> operator. The above selects only those rows in <code>weather</code> where <code>origin==&quot;EWR&quot; **and**</code>month=1<code>**and**</code>day &lt;= 15`.</p>
+<span class="st">  </span><span class="kw">filter</span>(origin ==<span class="st"> &quot;EWR&quot;</span> &amp;<span class="st"> </span>month ==<span class="st"> </span><span class="dv">1</span> &amp;<span class="st"> </span>day &lt;=<span class="st"> </span><span class="dv">15</span>)</code></pre></div>
+<p>This is similar to the previous use of the <code>filter</code> command in Section <a href="4-viz.html#scatterplots">4.3</a>; however, we now use the <code>&amp;</code> operator. The above selects only those rows in <code>weather</code> where <code>origin == &quot;EWR&quot;</code> <strong>and</strong> <code>month == 1</code> <strong>and</strong> <code>day &lt;= 15</code>.</p>
 <hr />
 <div class="learncheck">
 <p>
 <strong><em>Learning check</em></strong>
 </p>
 </div>
-<p><strong>(LC3.9)</strong> Take a look at both the <code>weather</code> and <code>early_january_weather</code> data frames by running <code>View(weather)</code> and <code>View(early_january_weather)</code> in the console. In what respect do these data frames differ?</p>
-<p><strong>(LC3.10)</strong> The weather data is recorded hourly. Why does the <code>time_hour</code> variable correctly identify the hour of the measurement and not the just the <code>hour</code> variable?</p>
+<p><strong>(LC4.9)</strong> Take a look at both the <code>weather</code> and <code>early_january_weather</code> data frames by running <code>View(weather)</code> and <code>View(early_january_weather)</code> in the console. In what respect do these data frames differ?</p>
+<p><strong>(LC4.10)</strong> The weather data is recorded hourly. Why does the <code>time_hour</code> variable correctly identify the hour of the measurement whereas the <code>hour</code> variable does not?</p>
 <hr />
-<div id="line-graphs-via-geom_line" class="section level3">
-<h3><span class="header-section-number">4.4.1</span> Line-graphs via <code id="geomline">geom_line</code></h3>
+<div id="geomline" class="section level3">
+<h3><span class="header-section-number">4.4.1</span> Line-graphs via geom_line</h3>
 <p>We plot a line-graph of hourly temperature using <code>geom_line()</code>:</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">ggplot</span>(<span class="dt">data=</span>early_january_weather, <span class="kw">aes</span>(<span class="dt">x=</span>time_hour, <span class="dt">y=</span>temp)) +
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">ggplot</span>(<span class="dt">data =</span> early_january_weather, <span class="kw">aes</span>(<span class="dt">x =</span> time_hour, <span class="dt">y =</span> temp)) +
 <span class="st">  </span><span class="kw">geom_line</span>()</code></pre></div>
 <div class="figure" style="text-align: center"><span id="fig:hourlytemp"></span>
 <img src="ismaykim_files/figure-html/hourlytemp-1.png" alt="Hourly Temperature in Newark for Jan 1-15 2013" width="\textwidth" />
@@ -678,11 +696,11 @@ <h3><span class="header-section-number">4.4.1</span> Line-graphs via <code id="g
 Figure 4.5: Hourly Temperature in Newark for Jan 1-15 2013
 </p>
 </div>
-<p>Much as with the <code>ggplot()</code> call in Section <a href="4-data-visualization-via-ggplot2.html#geompoint"><strong>??</strong></a>, we specify the components of the Grammar of Graphics:</p>
+<p>Much as with the <code>ggplot()</code> call in Section <a href="4-viz.html#geompoint">4.3.1</a>, we specify the components of the Grammar of Graphics:</p>
 <ul>
 <li>Within the <code>ggplot()</code> function call, we specify two of the components of the grammar:
 <ol style="list-style-type: decimal">
-<li>The <code>data</code> frame to be <code>early_january_weather</code> by setting <code>data=early_january_weather</code></li>
+<li>The <code>data</code> frame to be <code>early_january_weather</code> by setting <code>data = early_january_weather</code></li>
 <li>The <code>aes</code>thetic mapping by setting <code>aes(x = time_hour, y = temp)</code>. Specifically
 <ul>
 <li><code>time_hour</code> (i.e. the time variable) maps to the <code>x</code> position</li>
@@ -698,12 +716,9 @@ <h3><span class="header-section-number">4.4.1</span> Line-graphs via <code id="g
 <strong><em>Learning check</em></strong>
 </p>
 </div>
-<p><strong>(LC3.11)</strong> Why should line-graphs be avoided when there is not a clear ordering of the horizontal axis?</p>
-<p><strong>(LC3.12)</strong> Why are line-graphs frequently used when time is the explanatory variable?</p>
-<!--
-**`paste0("(LC", chap, ".", (lc <- lc + 1), ")")`** Why did we use the `flights_summarized` dataframe to produce the line-graph in Figure <a href="#fig:lineflights"><strong>??</strong></a> instead of `flights` or `flights_day`?
--->
-<p><strong>(LC3.13)</strong> Plot a time series of a variable other than <code>temp</code> for Newark Airport in the first 15 days of January 2013.</p>
+<p><strong>(LC4.11)</strong> Why should line-graphs be avoided when there is not a clear ordering of the horizontal axis?</p>
+<p><strong>(LC4.12)</strong> Why are line-graphs frequently used when time is the explanatory variable?</p>
+<p><strong>(LC4.13)</strong> Plot a time series of a variable other than <code>temp</code> for Newark Airport in the first 15 days of January 2013.</p>
 <hr />
 </div>
 <div id="summary-1" class="section level3">
@@ -714,23 +729,23 @@ <h3><span class="header-section-number">4.4.2</span> Summary</h3>
 </div>
 <div id="histograms" class="section level2">
 <h2><span class="header-section-number">4.5</span> 5NG#3: Histograms</h2>
-<p>Let’s consider the <code>temp</code> variable in the <code>weather</code> data frame once again, but now unlike with the line-graphs in Section <a href="4-data-visualization-via-ggplot2.html#linegraphs">4.4</a>, let’s say we don’t care about the relationship of temperature to time, but rather you care about the <strong>(statistical) distribution</strong> of temperatures. We could just produce points where each of the different values appear on something similar to a number line:</p>
-<div class="figure" style="text-align: center"><span id="fig:unnamed-chunk-16"></span>
-<img src="ismaykim_files/figure-html/unnamed-chunk-16-1.png" alt="Strip Plot of Hourly Temperature Recordings from NYC in 2013" width="\textwidth" />
+<p>Let’s consider the <code>temp</code> variable in the <code>weather</code> data frame once again, but now unlike with the line-graphs in Section <a href="4-viz.html#linegraphs">4.4</a>, let’s say we don’t care about the relationship of temperature to time, but rather we care about the <strong>(statistical) distribution</strong> of temperatures. We could just produce points where each of the different values appears on something similar to a number line:</p>
+<div class="figure" style="text-align: center"><span id="fig:unnamed-chunk-19"></span>
+<img src="ismaykim_files/figure-html/unnamed-chunk-19-1.png" alt="Strip Plot of Hourly Temperature Recordings from NYC in 2013" width="\textwidth" />
 <p class="caption">
 Figure 4.6: Strip Plot of Hourly Temperature Recordings from NYC in 2013
 </p>
 </div>
 <p>This gives us a general idea of how the values of <code>temp</code> differ. We see that temperatures vary from around 11 up to 100 degrees Fahrenheit. The area between 40 and 60 degrees appears to have more points plotted than outside that range.</p>
-<div id="histograms-via-geom_histogram" class="section level3">
-<h3><span class="header-section-number">4.5.1</span> Histograms via <code id="geomhistogram">geom_histogram</code></h3>
+<div id="geomhistogram" class="section level3">
+<h3><span class="header-section-number">4.5.1</span> Histograms via geom_histogram</h3>
 <p>What is commonly produced instead of this strip plot is a plot known as a <strong>histogram</strong>. The <strong>histogram</strong> shows how many elements of a single numerical variable fall in specified <strong>bins</strong>. In this case, these <strong>bins</strong> may correspond to between 0-10°F, 10-20°F, etc. We produce a histogram of the hour temperatures at all three NYC airports in 2013:</p>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">ggplot</span>(<span class="dt">data =</span> weather, <span class="dt">mapping =</span> <span class="kw">aes</span>(<span class="dt">x =</span> temp)) +
 <span class="st">  </span><span class="kw">geom_histogram</span>()</code></pre></div>
 <pre><code>## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.</code></pre>
 <pre><code>## Warning: Removed 1 rows containing non-finite values (stat_bin).</code></pre>
-<div class="figure" style="text-align: center"><span id="fig:unnamed-chunk-17"></span>
-<img src="ismaykim_files/figure-html/unnamed-chunk-17-1.png" alt="Histogram of Hourly Temperature Recordings from NYC in 2013" width="\textwidth" />
+<div class="figure" style="text-align: center"><span id="fig:unnamed-chunk-20"></span>
+<img src="ismaykim_files/figure-html/unnamed-chunk-20-1.png" alt="Histogram of Hourly Temperature Recordings from NYC in 2013" width="\textwidth" />
 <p class="caption">
 Figure 4.7: Histogram of Hourly Temperature Recordings from NYC in 2013
 </p>
@@ -749,20 +764,21 @@ <h3><span class="header-section-number">4.5.2</span> Adjusting the Bins</h3>
 <li>By adjusting the number of bins via the <code>bins</code> argument</li>
 <li>By adjusting the width of the bins via the <code>binwidth</code> argument</li>
 </ol>
-<p>First, we have the power to specify how many bins we would like to put the data into as an argument in the <code>geom_histogram</code> function. By default, this is chosen to be 30 somewhat arbitrarily we have received a warning above our plot that this was done.</p>
+<p>First, we have the power to specify how many bins we would like to put the data into as an argument in the <code>geom_histogram</code> function. By default, this is chosen to be 30 somewhat arbitrarily; we have received a warning above our plot that this was done.</p>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">ggplot</span>(<span class="dt">data =</span> weather, <span class="dt">mapping =</span> <span class="kw">aes</span>(<span class="dt">x =</span> temp)) +
-<span class="st">  </span><span class="kw">geom_histogram</span>(<span class="dt">bins =</span> <span class="dv">60</span>)</code></pre></div>
-<div class="figure" style="text-align: center"><span id="fig:unnamed-chunk-18"></span>
-<img src="ismaykim_files/figure-html/unnamed-chunk-18-1.png" alt="Histogram of Hourly Temperature Recordings from NYC in 2013 - 60 Bins" width="\textwidth" />
+<span class="st">  </span><span class="kw">geom_histogram</span>(<span class="dt">bins =</span> <span class="dv">60</span>, <span class="dt">color =</span> <span class="st">&quot;white&quot;</span>)</code></pre></div>
+<div class="figure" style="text-align: center"><span id="fig:unnamed-chunk-21"></span>
+<img src="ismaykim_files/figure-html/unnamed-chunk-21-1.png" alt="Histogram of Hourly Temperature Recordings from NYC in 2013 - 60 Bins" width="\textwidth" />
 <p class="caption">
 Figure 4.8: Histogram of Hourly Temperature Recordings from NYC in 2013 - 60 Bins
 </p>
 </div>
+<p>Note the addition of the <code>color</code> argument. If you’d like to be able to more easily differentiate each of the bins, you can specify the color of the outline as done above.</p>
 <p>Second, instead of specifying the number of bins, we can also specify the width of the bins by using the <code>binwidth</code> argument in the <code>geom_histogram</code> function.</p>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">ggplot</span>(<span class="dt">data =</span> weather, <span class="dt">mapping =</span> <span class="kw">aes</span>(<span class="dt">x =</span> temp)) +
-<span class="st">  </span><span class="kw">geom_histogram</span>(<span class="dt">binwidth =</span> <span class="dv">10</span>)</code></pre></div>
-<div class="figure" style="text-align: center"><span id="fig:unnamed-chunk-19"></span>
-<img src="ismaykim_files/figure-html/unnamed-chunk-19-1.png" alt="Histogram of Hourly Temperature Recordings from NYC in 2013 - Binwidth = 10" width="\textwidth" />
+<span class="st">  </span><span class="kw">geom_histogram</span>(<span class="dt">binwidth =</span> <span class="dv">10</span>, <span class="dt">color =</span> <span class="st">&quot;white&quot;</span>)</code></pre></div>
+<div class="figure" style="text-align: center"><span id="fig:unnamed-chunk-22"></span>
+<img src="ismaykim_files/figure-html/unnamed-chunk-22-1.png" alt="Histogram of Hourly Temperature Recordings from NYC in 2013 - Binwidth = 10" width="\textwidth" />
 <p class="caption">
 Figure 4.9: Histogram of Hourly Temperature Recordings from NYC in 2013 - Binwidth = 10
 </p>
@@ -773,10 +789,10 @@ <h3><span class="header-section-number">4.5.2</span> Adjusting the Bins</h3>
 <strong><em>Learning check</em></strong>
 </p>
 </div>
-<p><strong>(LC3.14)</strong> What does changing the number of bins from 30 to 60 tell us about the distribution of temperatures?</p>
-<p><strong>(LC3.15)</strong> Would you classify the distribution of temperatures as symmetric or skewed?</p>
-<p><strong>(LC3.16)</strong> What would you guess is the “center” value in this distribution? Why did you make that choice?</p>
-<p><strong>(LC3.17)</strong> Is this data spread out greatly from the center or is it close? Why?</p>
+<p><strong>(LC4.14)</strong> What does changing the number of bins from 30 to 60 tell us about the distribution of temperatures?</p>
+<p><strong>(LC4.15)</strong> Would you classify the distribution of temperatures as symmetric or skewed?</p>
+<p><strong>(LC4.16)</strong> What would you guess is the “center” value in this distribution? Why did you make that choice?</p>
+<p><strong>(LC4.17)</strong> Is this data spread out greatly from the center or is it close? Why?</p>
 <hr />
 </div>
 <div id="summary-2" class="section level3">
@@ -787,10 +803,10 @@ <h3><span class="header-section-number">4.5.3</span> Summary</h3>
 <div id="facets" class="section level2">
 <h2><span class="header-section-number">4.6</span> Facets</h2>
 <p>Before continuing the 5NG, we briefly introduce a new concept called <strong>faceting</strong>. Faceting is used when we’d like to create small multiples of the same plot over a different categorical variable. By default, all of the small multiples will have the same vertical axis.</p>
-<p>For example, suppose we were interested in looking at how the temperature histograms we saw in Chapter <a href="4-data-visualization-via-ggplot2.html#histograms">4.5</a> varied by month. This is what is meant by “the distribution of a variable over another variable”: <code>temp</code> is one variable and <code>month</code> is the other variable. In order to look at histograms of <code>temp</code> for each month, we add a layer <code>facet_wrap(~month)</code>.</p>
+<p>For example, suppose we were interested in looking at how the temperature histograms we saw in Section <a href="4-viz.html#histograms">4.5</a> varied by month. This is what is meant by “the distribution of a variable over another variable”: <code>temp</code> is one variable and <code>month</code> is the other variable. In order to look at histograms of <code>temp</code> for each month, we add a layer <code>facet_wrap(~month)</code>. You can also specify how many rows you’d like the small multiple plots to be in using <code>nrow</code> inside of <code>facet_wrap</code>.</p>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">ggplot</span>(<span class="dt">data =</span> weather, <span class="kw">aes</span>(<span class="dt">x =</span> temp)) +
-<span class="st">  </span><span class="kw">geom_histogram</span>(<span class="dt">binwidth =</span> <span class="dv">5</span>) +
-<span class="st">  </span><span class="kw">facet_wrap</span>(~month)</code></pre></div>
+<span class="st">  </span><span class="kw">geom_histogram</span>(<span class="dt">binwidth =</span> <span class="dv">5</span>, <span class="dt">color =</span> <span class="st">&quot;white&quot;</span>) +
+<span class="st">  </span><span class="kw">facet_wrap</span>(~<span class="st"> </span>month, <span class="dt">nrow =</span> <span class="dv">4</span>)</code></pre></div>
 <div class="figure" style="text-align: center"><span id="fig:facethistogram"></span>
 <img src="ismaykim_files/figure-html/facethistogram-1.png" alt="Faceted histogram" width="\textwidth" />
 <p class="caption">
@@ -804,21 +820,18 @@ <h2><span class="header-section-number">4.6</span> Facets</h2>
 <strong><em>Learning check</em></strong>
 </p>
 </div>
-<p><strong>(LC3.18)</strong> What other things do you notice about the faceted plot above? How does a faceted plot help us see how relationships between two variables?</p>
-<p><strong>(LC3.19)</strong> What do the numbers 1-12 correspond to in the plot above? What about 25, 50, 75, 100?</p>
-<!--
-**(LC3.20)** What could be done to make the faceted plot above more readable?  (Focus on tweaking the histograms and not on making a different type of plot here.)
--->
-<p><strong>(LC3.21)</strong> For which types of datasets would these types of faceted plots not work well in comparing relationships between variables? Give an example describing the variability of the variables and other important characteristics.</p>
-<p><strong>(LC3.22)</strong> Does the <code>temp</code> variable in the <code>weather</code> data set have a lot of variability? Why do you say that?</p>
+<p><strong>(LC4.18)</strong> What other things do you notice about the faceted plot above? How does a faceted plot help us see relationships between two variables?</p>
+<p><strong>(LC4.19)</strong> What do the numbers 1-12 correspond to in the plot above? What about 25, 50, 75, 100?</p>
+<p><strong>(LC4.20)</strong> For which types of datasets would these types of faceted plots not work well in comparing relationships between variables? Give an example describing the variability of the variables and other important characteristics.</p>
+<p><strong>(LC4.21)</strong> Does the <code>temp</code> variable in the <code>weather</code> data set have a lot of variability? Why do you say that?</p>
 <hr />
 <!--Subsection on boxplots -->
 </div>
 <div id="ng4-boxplots" class="section level2">
 <h2><span class="header-section-number">4.7</span> 5NG#4: Boxplots</h2>
-<p>While using faceted histograms can provide a way to compare distributions of a continuous variable split by groups of a categorical variable as in Chapter <a href="4-data-visualization-via-ggplot2.html#facets">4.6</a>, an alternative plot called a <strong>boxplot</strong> (also called a <strong>side-by-side boxplot</strong>) achieves the same task. The <strong>boxplot</strong> uses the information provided in the <strong>five-number summary</strong> referred to in Appendix <a href="A-appendixA.html#appendixA">A</a>. It gives a way to compare this summary information across the different levels of a categorical variable.</p>
-<div id="boxplots-via-geom_boxplot" class="section level3">
-<h3><span class="header-section-number">4.7.1</span> Boxplots via <code id="geomboxplot">geom_boxplot</code></h3>
+<p>While using faceted histograms can provide a way to compare distributions of a continuous variable split by groups of a categorical variable as in Section <a href="4-viz.html#facets">4.6</a>, an alternative plot called a <strong>boxplot</strong> (also called a <strong>side-by-side boxplot</strong>) achieves the same task and is frequently preferred. The <strong>boxplot</strong> uses the information provided in the <strong>five-number summary</strong> referred to in Appendix <a href="A-appendixA.html#appendixA">A</a>. It gives a way to compare this summary information across the different levels of a categorical variable.</p>
+<div id="geomboxplot" class="section level3">
+<h3><span class="header-section-number">4.7.1</span> Boxplots via geom_boxplot</h3>
 <p>Let’s create a boxplot to compare the monthly temperatures as we did above with the faceted histograms.</p>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">ggplot</span>(<span class="dt">data =</span> weather, <span class="kw">aes</span>(<span class="dt">x =</span> month, <span class="dt">y =</span> temp)) +
 <span class="st">  </span><span class="kw">geom_boxplot</span>()</code></pre></div>
@@ -844,15 +857,15 @@ <h3><span class="header-section-number">4.7.1</span> Boxplots via <code id="geom
 <strong><em>Learning check</em></strong>
 </p>
 </div>
-<p><strong>(LC3.23)</strong> What does the dot at the bottom of the plot for May correspond to? Explain what might have occurred in May to produce this point.</p>
-<p><strong>(LC3.24)</strong> Which months have the highest variability in temperature? What reasons do you think this is?</p>
-<p><strong>(LC3.25)</strong> We looked at the distribution of a continuous variable over a categorical variable here with this boxplot. Why can’t we look at the distribution of one continuous variable over the distribution of another continuous variable? Say temperature across pressure, for example?</p>
-<p><strong>(LC3.26)</strong> Boxplots provide a simple way to identify outliers. Why may outliers be easier to identify when looking at a boxplot instead of a faceted histogram?</p>
+<p><strong>(LC4.22)</strong> What does the dot at the bottom of the plot for May correspond to? Explain what might have occurred in May to produce this point.</p>
+<p><strong>(LC4.23)</strong> Which months have the highest variability in temperature? What do you think the reasons for this are?</p>
+<p><strong>(LC4.24)</strong> We looked at the distribution of a continuous variable over a categorical variable here with this boxplot. Why can’t we look at the distribution of one continuous variable over the distribution of another continuous variable? Say, temperature across pressure, for example?</p>
+<p><strong>(LC4.25)</strong> Boxplots provide a simple way to identify outliers. Why may outliers be easier to identify when looking at a boxplot instead of a faceted histogram?</p>
 <hr />
 </div>
 <div id="summary-3" class="section level3">
 <h3><span class="header-section-number">4.7.2</span> Summary</h3>
-<p>Boxplots provide a way to compare and contrast the distribution of one quantitative variable across multiple levels of one categorical variable. One can easily look to see where the median falls across the different groups by looking at the center line in the box. You can also see how spread out the variable is across the different groups by looking at the width of the box and also how far out the lines stretch from the box. If the lines stretch far from the box but the box has a small width, the variability of the values closer to the center is much smaller than the variable of the outer ends of the variable. Lastly, outliers are even more easily identified when looking at a boxplot than when looking at a histogram.</p>
+<p>Boxplots provide a way to compare and contrast the distribution of one quantitative variable across multiple levels of one categorical variable. One can easily look to see where the median falls across the different groups by looking at the center line in the box. You can also see how spread out the variable is across the different groups by looking at the width of the box and also how far out the lines stretch from the box. If the lines stretch far from the box but the box has a small width, the variability of the values closer to the center is much smaller than the variability of the outer ends of the variable. Lastly, outliers are even more easily identified when looking at a boxplot than when looking at a histogram.</p>
 <!--Subsection on barplots -->
 </div>
 </div>
@@ -860,7 +873,7 @@ <h3><span class="header-section-number">4.7.2</span> Summary</h3>
 <h2><span class="header-section-number">4.8</span> 5NG#5: Barplots</h2>
 <p>Both histograms and boxplots represent ways to visualize the variability of continuous variables. Another common task is to present the distribution of a categorical variable. This is a simpler task since we will be interested in how many elements from our data fall into the different categories of the categorical variable.</p>
 <div id="barplots-via-geom_bar" class="section level3">
-<h3><span class="header-section-number">4.8.1</span> Barplots via <code>geom_bar</code></h3>
+<h3><span class="header-section-number">4.8.1</span> Barplots via geom_bar</h3>
 <p>Frequently, the best way to visualize these different counts (also known as <strong>frequencies</strong>) is via a barplot. Consider the distribution of airlines that flew out of New York City in 2013. Here we explore the number of flights from each airline/<code>carrier</code>. This can be plotted by invoking the <code>geom_bar</code> function in <code>ggplot2</code>:</p>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">ggplot</span>(<span class="dt">data =</span> flights, <span class="dt">mapping =</span> <span class="kw">aes</span>(<span class="dt">x =</span> carrier)) +
 <span class="st">  </span><span class="kw">geom_bar</span>()</code></pre></div>
@@ -870,21 +883,171 @@ <h3><span class="header-section-number">4.8.1</span> Barplots via <code>geom_bar
 Figure 4.13: Number of flights departing NYC in 2013 by airline
 </p>
 </div>
-<p>We see that United Air Lines, JetBlue Airways, and ExpressJet Airlines had the most flights depart New York City in 2013. To get the actual number of flights by each airline we can use the <code>count</code> function in the <code>dplyr</code> package on the <code>carrier</code> variable in <code>flights</code>, which we will introduce formally in Chapter @ref{manip}.</p>
-<pre><code>## # A tibble: 1 × 1
-##    `1.n`
-##    &lt;int&gt;
-## 1 336776</code></pre>
+<p>To see which airline names correspond to these <code>carrier</code> codes, we can look at the <code>airlines</code> data frame in the <code>nycflights13</code> package. Note the use of the <code>kable</code> function from the <code>knitr</code> package here, which produces a nicely formatted table of the values in the <code>airlines</code> data frame.</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">data</span>(airlines)
+<span class="kw">kable</span>(airlines)</code></pre></div>
+<table>
+<thead>
+<tr class="header">
+<th align="left">carrier</th>
+<th align="left">name</th>
+</tr>
+</thead>
+<tbody>
+<tr class="odd">
+<td align="left">9E</td>
+<td align="left">Endeavor Air Inc.</td>
+</tr>
+<tr class="even">
+<td align="left">AA</td>
+<td align="left">American Airlines Inc.</td>
+</tr>
+<tr class="odd">
+<td align="left">AS</td>
+<td align="left">Alaska Airlines Inc.</td>
+</tr>
+<tr class="even">
+<td align="left">B6</td>
+<td align="left">JetBlue Airways</td>
+</tr>
+<tr class="odd">
+<td align="left">DL</td>
+<td align="left">Delta Air Lines Inc.</td>
+</tr>
+<tr class="even">
+<td align="left">EV</td>
+<td align="left">ExpressJet Airlines Inc.</td>
+</tr>
+<tr class="odd">
+<td align="left">F9</td>
+<td align="left">Frontier Airlines Inc.</td>
+</tr>
+<tr class="even">
+<td align="left">FL</td>
+<td align="left">AirTran Airways Corporation</td>
+</tr>
+<tr class="odd">
+<td align="left">HA</td>
+<td align="left">Hawaiian Airlines Inc.</td>
+</tr>
+<tr class="even">
+<td align="left">MQ</td>
+<td align="left">Envoy Air</td>
+</tr>
+<tr class="odd">
+<td align="left">OO</td>
+<td align="left">SkyWest Airlines Inc.</td>
+</tr>
+<tr class="even">
+<td align="left">UA</td>
+<td align="left">United Air Lines Inc.</td>
+</tr>
+<tr class="odd">
+<td align="left">US</td>
+<td align="left">US Airways Inc.</td>
+</tr>
+<tr class="even">
+<td align="left">VX</td>
+<td align="left">Virgin America</td>
+</tr>
+<tr class="odd">
+<td align="left">WN</td>
+<td align="left">Southwest Airlines Co.</td>
+</tr>
+<tr class="even">
+<td align="left">YV</td>
+<td align="left">Mesa Airlines Inc.</td>
+</tr>
+</tbody>
+</table>
+<p>Going back to our barplot, we see that United Air Lines, JetBlue Airways, and ExpressJet Airlines had the most flights depart New York City in 2013. To get the actual number of flights by each airline we can use the <code>count</code> function in the <code>dplyr</code> package on the <code>carrier</code> variable in <code>flights</code>, which we will introduce formally in Chapter <a href="5-manip.html#manip">5</a>.</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">flights_table &lt;-<span class="st"> </span>flights %&gt;%<span class="st"> </span>dplyr::<span class="kw">count</span>(carrier)
+knitr::<span class="kw">kable</span>(flights_table)</code></pre></div>
+<table>
+<thead>
+<tr class="header">
+<th align="left">carrier</th>
+<th align="right">n</th>
+</tr>
+</thead>
+<tbody>
+<tr class="odd">
+<td align="left">9E</td>
+<td align="right">18460</td>
+</tr>
+<tr class="even">
+<td align="left">AA</td>
+<td align="right">32729</td>
+</tr>
+<tr class="odd">
+<td align="left">AS</td>
+<td align="right">714</td>
+</tr>
+<tr class="even">
+<td align="left">B6</td>
+<td align="right">54635</td>
+</tr>
+<tr class="odd">
+<td align="left">DL</td>
+<td align="right">48110</td>
+</tr>
+<tr class="even">
+<td align="left">EV</td>
+<td align="right">54173</td>
+</tr>
+<tr class="odd">
+<td align="left">F9</td>
+<td align="right">685</td>
+</tr>
+<tr class="even">
+<td align="left">FL</td>
+<td align="right">3260</td>
+</tr>
+<tr class="odd">
+<td align="left">HA</td>
+<td align="right">342</td>
+</tr>
+<tr class="even">
+<td align="left">MQ</td>
+<td align="right">26397</td>
+</tr>
+<tr class="odd">
+<td align="left">OO</td>
+<td align="right">32</td>
+</tr>
+<tr class="even">
+<td align="left">UA</td>
+<td align="right">58665</td>
+</tr>
+<tr class="odd">
+<td align="left">US</td>
+<td align="right">20536</td>
+</tr>
+<tr class="even">
+<td align="left">VX</td>
+<td align="right">5162</td>
+</tr>
+<tr class="odd">
+<td align="left">WN</td>
+<td align="right">12275</td>
+</tr>
+<tr class="even">
+<td align="left">YV</td>
+<td align="right">601</td>
+</tr>
+</tbody>
+</table>
+<p><strong>Technical note</strong>: Refer to the use of <code>::</code> in both lines of code above. This is another way of ensuring the correct function is called. A <code>count</code> function exists in a couple of different packages and sometimes you’ll receive strange errors when a different instance of a function is used. This is a great way of telling R that “I want this one!”. You specify the name of the package directly before the <code>::</code> and then the name of the function immediately after <code>::</code>.</p>
 <hr />
 <div class="learncheck">
 <p>
 <strong><em>Learning check</em></strong>
 </p>
 </div>
-<p><strong>(LC3.27)</strong> Why are histograms inappropriate for visualizing categorical variables?</p>
-<p><strong>(LC3.28)</strong> What is the difference between histograms and barplots?</p>
-<p><strong>(LC3.29)</strong> How many Envoy Air flights departed NYC in 2013?</p>
-<p><strong>(LC3.30)</strong> What was the seventh highest airline in terms of departed flights from NYC in 2013? How can we better present the table to get this answer quickly.</p>
+<p><strong>(LC4.26)</strong> Why are histograms inappropriate for visualizing categorical variables?</p>
+<p><strong>(LC4.27)</strong> What is the difference between histograms and barplots?</p>
+<p><strong>(LC4.28)</strong> How many Envoy Air flights departed NYC in 2013?</p>
+<p><strong>(LC4.29)</strong> What was the seventh highest airline in terms of departed flights from NYC in 2013? How could we better present the table to get this answer quickly?</p>
 <hr />
 </div>
 <div id="must-avoid-pie-charts" class="section level3">
@@ -903,7 +1066,7 @@ <h3><span class="header-section-number">4.8.2</span> Must avoid pie charts!</h3>
 </p>
 </div>
 <p>While it is quite easy to look back at the barplot to get the answer to these questions, it’s quite difficult to get the answers correct when looking at the pie graph. Barplots can always present the information in a way that is easier for the eye to determine relative position. There may be one exception from Nathan Yau at <a href="https://flowingdata.com/2008/09/19/pie-i-have-eaten-and-pie-i-have-not-eaten/" title="Pie I Have Eaten and Pie I Have Not Eaten">FlowingData.com</a> but we will leave this for the reader to decide:</p>
-<div class="figure" style="text-align: center"><span id="fig:unnamed-chunk-21"></span>
+<div class="figure" style="text-align: center"><span id="fig:unnamed-chunk-25"></span>
 <img src="images/Pie-I-have-Eaten.jpg" alt="The only good pie chart" width="\textwidth" />
 <p class="caption">
 Figure 4.15: The only good pie chart
@@ -915,21 +1078,22 @@ <h3><span class="header-section-number">4.8.2</span> Must avoid pie charts!</h3>
 <strong><em>Learning check</em></strong>
 </p>
 </div>
-<p><strong>(LC3.31)</strong> Why should pie charts be avoided and replaced by barplots?</p>
-<p><strong>(LC3.32)</strong> What is your opinion as to why pie charts continue to be used?</p>
+<p><strong>(LC4.30)</strong> Why should pie charts be avoided and replaced by barplots?</p>
+<p><strong>(LC4.31)</strong> What is your opinion as to why pie charts continue to be used?</p>
 <hr />
 </div>
 <div id="using-barplots-to-compare-two-variables" class="section level3">
 <h3><span class="header-section-number">4.8.3</span> Using barplots to compare two variables</h3>
 <p>Barplots are the go-to way to visualize the frequency of different categories of a categorical variable. They make it easy to order the counts and to compare one group’s frequency to another. Another use of barplots (unfortunately, sometimes inappropriately and confusingly) is to compare two categorical variables together. Let’s examine the distribution of outgoing flights from NYC by <code>carrier</code> and <code>airport</code>.</p>
-<p>We begin by getting the names of the airports in NYC that were included in the <code>flights</code> dataset. Remember from Chapter <a href="3-tidy.html#tidy">3</a> that this can be done by using the <code>inner_join</code> function (more in Chapter <a href="5-data-manipulation-via-dplyr.html#manip"><strong>??</strong></a>).</p>
+<p>We begin by getting the names of the airports in NYC that were included in the <code>flights</code> dataset. Remember from Chapter <a href="3-tidy.html#tidy">3</a> that this can be done by using the <code>inner_join</code> function (more in Chapter <a href="5-manip.html#manip">5</a>).</p>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">flights_namedports &lt;-<span class="st"> </span>flights %&gt;%<span class="st"> </span>
 <span class="st">  </span><span class="kw">inner_join</span>(airports, <span class="dt">by =</span> <span class="kw">c</span>(<span class="st">&quot;origin&quot;</span> =<span class="st"> &quot;faa&quot;</span>))</code></pre></div>
 <p>After running <code>View(flights_namedports)</code>, we see that <code>name</code> now corresponds to the name of the airport as referenced by the <code>origin</code> variable. We will now plot <code>carrier</code> as the horizontal variable. When we specify <code>geom_bar</code>, it will specify <code>count</code> as being the vertical variable. A new addition here is <code>fill = name</code>. Look over what was produced from the plot to get an idea of what this argument gives.</p>
+<p>Note that <code>fill</code> is an <code>aes</code>thetic just like <code>x</code> is an <code>aes</code>thetic. We need to map the <code>name</code> variable to this <code>aes</code>thetic. Any time you use a variable like this, you need to make sure it is wrapped inside the <code>aes</code> function. <strong>This is a common error!</strong> Make note of this now so you don’t fall into this problem later.</p>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">ggplot</span>(<span class="dt">data =</span> flights_namedports, <span class="dt">mapping =</span> <span class="kw">aes</span>(<span class="dt">x =</span> carrier, <span class="dt">fill =</span> name)) +
 <span class="st">  </span><span class="kw">geom_bar</span>()</code></pre></div>
-<div class="figure" style="text-align: center"><span id="fig:unnamed-chunk-23"></span>
-<img src="ismaykim_files/figure-html/unnamed-chunk-23-1.png" alt="Stacked barplot comparing the number of flights by carrier and airport" width="\textwidth" />
+<div class="figure" style="text-align: center"><span id="fig:unnamed-chunk-27"></span>
+<img src="ismaykim_files/figure-html/unnamed-chunk-27-1.png" alt="Stacked barplot comparing the number of flights by carrier and airport" width="\textwidth" />
 <p class="caption">
 Figure 4.16: Stacked barplot comparing the number of flights by carrier and airport
 </p>
@@ -941,14 +1105,14 @@ <h3><span class="header-section-number">4.8.3</span> Using barplots to compare t
 <strong><em>Learning check</em></strong>
 </p>
 </div>
-<p><strong>(LC3.33)</strong> What kinds of questions are not easily answered by looking at the above figure?</p>
-<p><strong>(LC3.34)</strong> What can you say, if anything, about the relationship between airline and airport in NYC in 2013 in regards to the number of departing flights?</p>
+<p><strong>(LC4.32)</strong> What kinds of questions are not easily answered by looking at the above figure?</p>
+<p><strong>(LC4.33)</strong> What can you say, if anything, about the relationship between airline and airport in NYC in 2013 with regard to the number of departing flights?</p>
 <hr />
 <p>Another variation on the <strong>stacked barplot</strong> is the <strong>side-by-side barplot</strong>.</p>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">ggplot</span>(<span class="dt">data =</span> flights_namedports, <span class="dt">mapping =</span> <span class="kw">aes</span>(<span class="dt">x =</span> carrier, <span class="dt">fill =</span> name)) +
 <span class="st">  </span><span class="kw">geom_bar</span>(<span class="dt">position =</span> <span class="st">&quot;dodge&quot;</span>)</code></pre></div>
-<div class="figure" style="text-align: center"><span id="fig:unnamed-chunk-24"></span>
-<img src="ismaykim_files/figure-html/unnamed-chunk-24-1.png" alt="Side-by-side barplot comparing the number of flights by carrier and airport" width="\textwidth" />
+<div class="figure" style="text-align: center"><span id="fig:unnamed-chunk-28"></span>
+<img src="ismaykim_files/figure-html/unnamed-chunk-28-1.png" alt="Side-by-side barplot comparing the number of flights by carrier and airport" width="\textwidth" />
 <p class="caption">
 Figure 4.17: Side-by-side barplot comparing the number of flights by carrier and airport
 </p>
@@ -959,28 +1123,28 @@ <h3><span class="header-section-number">4.8.3</span> Using barplots to compare t
 <strong><em>Learning check</em></strong>
 </p>
 </div>
-<p><strong>(LC3.35)</strong> Why might the side-by-side barplot be preferable to a stacked barplot in this case?</p>
-<p><strong>(LC3.36)</strong> What are the disadvantages of using a side-by-side barplot, in general?</p>
+<p><strong>(LC4.34)</strong> Why might the side-by-side barplot be preferable to a stacked barplot in this case?</p>
+<p><strong>(LC4.35)</strong> What are the disadvantages of using a side-by-side barplot, in general?</p>
 <hr />
-<p>Lastly, an often preferred type of barplot is the <strong>faceted barplot</strong>. We already saw this concept of faceting and small multiples in Section <a href="4-data-visualization-via-ggplot2.html#facets">4.6</a>. This gives us a nicer way to compare the distributions across both <code>carrier</code> and airport/<code>name</code>.</p>
+<p>Lastly, an often preferred type of barplot is the <strong>faceted barplot</strong>. We already saw this concept of faceting and small multiples in Section <a href="4-viz.html#facets">4.6</a>. This gives us a nicer way to compare the distributions across both <code>carrier</code> and airport/<code>name</code>.</p>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">ggplot</span>(<span class="dt">data =</span> flights_namedports, <span class="dt">mapping =</span> <span class="kw">aes</span>(<span class="dt">x =</span> carrier, <span class="dt">fill =</span> name)) +
 <span class="st">  </span><span class="kw">geom_bar</span>() +
 <span class="st">  </span><span class="kw">facet_grid</span>(name ~<span class="st"> </span>.)</code></pre></div>
-<div class="figure" style="text-align: center"><span id="fig:unnamed-chunk-25"></span>
-<img src="ismaykim_files/figure-html/unnamed-chunk-25-1.png" alt="Faceted barplot comparing the number of flights by carrier and airport" width="\textwidth" />
+<div class="figure" style="text-align: center"><span id="fig:unnamed-chunk-29"></span>
+<img src="ismaykim_files/figure-html/unnamed-chunk-29-1.png" alt="Faceted barplot comparing the number of flights by carrier and airport" width="\textwidth" />
 <p class="caption">
 Figure 4.18: Faceted barplot comparing the number of flights by carrier and airport
 </p>
 </div>
-<p>Note how the <code>facet_grid</code> function arguments are written here. We are wanting the names of the airports vertically and the <code>carrier</code> listed horizontally. As you may have guessed, this argument and other <em>formulas</em> of this sort in R are in <code>y ~ x</code> order. We will see more examples of this in Chapter <a href="9-regression-via-broom.html#regress"><strong>??</strong></a>.</p>
+<p>Note how the <code>facet_grid</code> function arguments are written here. We want the names of the airports listed vertically and the <code>carrier</code> listed horizontally. As you may have guessed, this argument and other <em>formulas</em> of this sort in R are in <code>y ~ x</code> order. We will see more examples of this in Chapter <a href="9-regress.html#regress">9</a>.</p>
 <hr />
 <div class="learncheck">
 <p>
 <strong><em>Learning check</em></strong>
 </p>
 </div>
-<p><strong>(LC3.37)</strong> Why is the faceted barplot preferred to the side-by-side and stacked barplots in this case?</p>
-<p><strong>(LC3.38)</strong> What information about the different carriers at different airports is more easily seen in the faceted barplot?</p>
+<p><strong>(LC4.36)</strong> Why is the faceted barplot preferred to the side-by-side and stacked barplots in this case?</p>
+<p><strong>(LC4.37)</strong> What information about the different carriers at different airports is more easily seen in the faceted barplot?</p>
 <hr />
 </div>
 <div id="summary-4" class="section level3">
@@ -991,13 +1155,29 @@ <h3><span class="header-section-number">4.8.4</span> Summary</h3>
 </div>
 <div id="conclusion" class="section level2">
 <h2><span class="header-section-number">4.9</span> Conclusion</h2>
-<div id="whats-to-come-1" class="section level3">
-<h3><span class="header-section-number">4.9.1</span> What’s to come?</h3>
-<p>In Chapter <a href="5-data-manipulation-via-dplyr.html#manip"><strong>??</strong></a>, we’ll further explore data by grouping our data, creating summaries based on those groupings, filtering our data to match conditions, selecting specific columns of our data, and other manipulations with our data including defining new columns/variables. These data manipulation procedures will go hand-in-hand with the data visualizations you’ve produced here.</p>
+<div id="resources" class="section level3">
+<h3><span class="header-section-number">4.9.1</span> Resources</h3>
+<p>An excellent resource as you begin to create plots using the <code>ggplot2</code> package is a cheatsheet that RStudio has put together entitled “Data Visualization with ggplot2” available</p>
+<ul>
+<li>by clicking <a href="https://www.rstudio.com/wp-content/uploads/2015/12/ggplot2-cheatsheet-2.0.pdf">here</a> or</li>
+<li>by clicking the RStudio Menu Bar -&gt; Help -&gt; Cheatsheets -&gt; “Data Visualization with <code>ggplot2</code>”</li>
+</ul>
+<p>This covers more than what we’ve discussed in this chapter but provides nice visual descriptions of what each function produces.</p>
+<p>In addition, we’ve created a mind map to help you remember which types of plots are most appropriate in a given situation by identifying the types of variables involved in the problem. It is available <a href="https://coggle.it/diagram/V_G2gzukTDoQ-aZt-">here</a> and below.</p>
+<div class="figure" style="text-align: center"><span id="fig:viz-map"></span>
+<img src="images/coggleviz.png" alt="Mind map for Data Visualization" width="200%" />
+<p class="caption">
+Figure 4.19: Mind map for Data Visualization
+</p>
+</div>
 </div>
 <div id="script-of-r-code" class="section level3">
 <h3><span class="header-section-number">4.9.2</span> Script of R code</h3>
 <p>An R script file of all R code used in this chapter is available <a href="http://ismayc.github.io/moderndiver-book/04-viz.R">here</a>.</p>
+</div>
+<div id="whats-to-come-1" class="section level3">
+<h3><span class="header-section-number">4.9.3</span> What’s to come?</h3>
+<p>In Chapter <a href="5-manip.html#manip">5</a>, we’ll further explore data by grouping our data, creating summaries based on those groupings, filtering our data to match conditions, and other manipulations with our data including defining new columns/variables. These data manipulation procedures will go hand-in-hand with the data visualizations you’ve produced here.</p>
 
 </div>
 </div>
@@ -1010,9 +1190,6 @@ <h3>References</h3>
 <div id="ref-R-ggplot2">
 <p>Wickham, Hadley, and Winston Chang. 2016. <em>Ggplot2: Create Elegant Data Visualisations Using the Grammar of Graphics</em>. <a href="https://CRAN.R-project.org/package=ggplot2" class="uri">https://CRAN.R-project.org/package=ggplot2</a>.</p>
 </div>
-<div id="ref-usedtor2016">
-<p>Ismay, Chester. 2016. <em>Getting Used to R, RStudio, and R Markdown</em>. <a href="http://ismayc.github.io/rbasics-book" class="uri">http://ismayc.github.io/rbasics-book</a>.</p>
-</div>
 <div id="ref-robbins2013">
 <p>Robbins, Naomi. 2013. <em>Creating More Effective Graphs</em>. Chart House.</p>
 </div>
@@ -1023,7 +1200,7 @@ <h3>References</h3>
         </div>
       </div>
 <a href="3-tidy.html" class="navigation navigation-prev " aria-label="Previous page"><i class="fa fa-angle-left"></i></a>
-<a href="5-data-manipulation-via-dplyr.html" class="navigation navigation-next " aria-label="Next page""><i class="fa fa-angle-right"></i></a>
+<a href="5-manip.html" class="navigation navigation-next " aria-label="Next page"><i class="fa fa-angle-right"></i></a>
 
 <script src="libs/gitbook-2.6.7/js/app.min.js"></script>
 <script src="libs/gitbook-2.6.7/js/lunr.js"></script>
diff --git a/docs/5-data-manipulation-via-dplyr.html b/docs/5-data-manipulation-via-dplyr.html
deleted file mode 100644
index afedb7a76..000000000
--- a/docs/5-data-manipulation-via-dplyr.html
+++ /dev/null
@@ -1,1031 +0,0 @@
-<!DOCTYPE html>
-<html >
-
-<head>
-
-  <meta charset="UTF-8">
-  <meta http-equiv="X-UA-Compatible" content="IE=edge">
-  <title>ModernDive</title>
-  <meta content="text/html; charset=UTF-8" http-equiv="Content-Type">
-  <meta name="description" content="Combining statistical and computational thinking to make sense of data. An evolution of the traditional introductory statistics curriculum, more focused on reproducible research, data visualization, and modern data analysis techniques and tools including resampling and bootstrapping using R, RStudio, and R Markdown">
-  <meta name="generator" content="bookdown 0.3 and GitBook 2.6.7">
-
-  <meta property="og:title" content="ModernDive" />
-  <meta property="og:type" content="book" />
-  
-  
-  <meta property="og:description" content="Combining statistical and computational thinking to make sense of data. An evolution of the traditional introductory statistics curriculum, more focused on reproducible research, data visualization, and modern data analysis techniques and tools including resampling and bootstrapping using R, RStudio, and R Markdown" />
-  <meta name="github-repo" content="ismayc/moderndiver-book" />
-
-  <meta name="twitter:card" content="summary" />
-  <meta name="twitter:title" content="ModernDive" />
-  
-  <meta name="twitter:description" content="Combining statistical and computational thinking to make sense of data. An evolution of the traditional introductory statistics curriculum, more focused on reproducible research, data visualization, and modern data analysis techniques and tools including resampling and bootstrapping using R, RStudio, and R Markdown" />
-  
-
-<meta name="author" content="Chester Ismay and Albert Y. Kim">
-
-
-<meta name="date" content="2017-01-07">
-
-  <meta name="viewport" content="width=device-width, initial-scale=1">
-  <meta name="apple-mobile-web-app-capable" content="yes">
-  <meta name="apple-mobile-web-app-status-bar-style" content="black">
-  
-  
-<link rel="prev" href="4-data-visualization-via-ggplot2.html">
-<link rel="next" href="6-simulating-randomness-via-mosaic.html">
-
-<script src="libs/jquery-2.2.3/jquery.min.js"></script>
-<link href="libs/gitbook-2.6.7/css/style.css" rel="stylesheet" />
-<link href="libs/gitbook-2.6.7/css/plugin-bookdown.css" rel="stylesheet" />
-<link href="libs/gitbook-2.6.7/css/plugin-highlight.css" rel="stylesheet" />
-<link href="libs/gitbook-2.6.7/css/plugin-search.css" rel="stylesheet" />
-<link href="libs/gitbook-2.6.7/css/plugin-fontsettings.css" rel="stylesheet" />
-
-
-
-
-
-
-
-<script src="libs/htmlwidgets-0.8/htmlwidgets.js"></script>
-<link href="libs/dygraphs-1.1.1/dygraph.css" rel="stylesheet" />
-<script src="libs/dygraphs-1.1.1/dygraph-combined.js"></script>
-<script src="libs/moment-2.8.4/moment.js"></script>
-<script src="libs/moment-timezone-0.2.5/moment-timezone-with-data.js"></script>
-<script src="libs/moment-fquarter-1.0.0/moment-fquarter.min.js"></script>
-<script src="libs/dygraphs-binding-1.1.1.4/dygraphs.js"></script>
-<script>
-  (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
-  (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
-  m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
-  })(window,document,'script','https://www.google-analytics.com/analytics.js','ga');
-
-  ga('create', 'UA-89938436-1', 'auto');
-  ga('send', 'pageview');
-
-</script>
-
-
-<style type="text/css">
-div.sourceCode { overflow-x: auto; }
-table.sourceCode, tr.sourceCode, td.lineNumbers, td.sourceCode {
-  margin: 0; padding: 0; vertical-align: baseline; border: none; }
-table.sourceCode { width: 100%; line-height: 100%; }
-td.lineNumbers { text-align: right; padding-right: 4px; padding-left: 4px; color: #aaaaaa; border-right: 1px solid #aaaaaa; }
-td.sourceCode { padding-left: 5px; }
-code > span.kw { color: #007020; font-weight: bold; } /* Keyword */
-code > span.dt { color: #902000; } /* DataType */
-code > span.dv { color: #40a070; } /* DecVal */
-code > span.bn { color: #40a070; } /* BaseN */
-code > span.fl { color: #40a070; } /* Float */
-code > span.ch { color: #4070a0; } /* Char */
-code > span.st { color: #4070a0; } /* String */
-code > span.co { color: #60a0b0; font-style: italic; } /* Comment */
-code > span.ot { color: #007020; } /* Other */
-code > span.al { color: #ff0000; font-weight: bold; } /* Alert */
-code > span.fu { color: #06287e; } /* Function */
-code > span.er { color: #ff0000; font-weight: bold; } /* Error */
-code > span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warning */
-code > span.cn { color: #880000; } /* Constant */
-code > span.sc { color: #4070a0; } /* SpecialChar */
-code > span.vs { color: #4070a0; } /* VerbatimString */
-code > span.ss { color: #bb6688; } /* SpecialString */
-code > span.im { } /* Import */
-code > span.va { color: #19177c; } /* Variable */
-code > span.cf { color: #007020; font-weight: bold; } /* ControlFlow */
-code > span.op { color: #666666; } /* Operator */
-code > span.bu { } /* BuiltIn */
-code > span.ex { } /* Extension */
-code > span.pp { color: #bc7a00; } /* Preprocessor */
-code > span.at { color: #7d9029; } /* Attribute */
-code > span.do { color: #ba2121; font-style: italic; } /* Documentation */
-code > span.an { color: #60a0b0; font-weight: bold; font-style: italic; } /* Annotation */
-code > span.cv { color: #60a0b0; font-weight: bold; font-style: italic; } /* CommentVar */
-code > span.in { color: #60a0b0; font-weight: bold; font-style: italic; } /* Information */
-</style>
-
-<link rel="stylesheet" href="style.css" type="text/css" />
-</head>
-
-<body>
-
-
-  <div class="book without-animation with-summary font-size-2 font-family-1" data-basepath=".">
-
-    <div class="book-summary">
-      <nav role="navigation">
-
-<ul class="summary">
-<li class="chapter" data-level="1" data-path="index.html"><a href="index.html"><i class="fa fa-check"></i><b>1</b> Preamble</a><ul>
-<li class="chapter" data-level="1.1" data-path="index.html"><a href="index.html#principles-of-this-book"><i class="fa fa-check"></i><b>1.1</b> Principles of this Book</a></li>
-<li class="chapter" data-level="1.2" data-path="index.html"><a href="index.html#contribute"><i class="fa fa-check"></i><b>1.2</b> Contribute</a></li>
-<li class="chapter" data-level="1.3" data-path="index.html"><a href="index.html#getting-started"><i class="fa fa-check"></i><b>1.3</b> Getting Started</a></li>
-<li class="chapter" data-level="" data-path="index.html"><a href="index.html#colophon"><i class="fa fa-check"></i>Colophon</a></li>
-</ul></li>
-<li class="chapter" data-level="2" data-path="2-intro.html"><a href="2-intro.html"><i class="fa fa-check"></i><b>2</b> Introduction</a><ul>
-<li class="chapter" data-level="2.1" data-path="2-intro.html"><a href="2-intro.html#preamble-1"><i class="fa fa-check"></i><b>2.1</b> Preamble</a></li>
-<li class="chapter" data-level="2.2" data-path="2-intro.html"><a href="2-intro.html#three-driving-data-sources"><i class="fa fa-check"></i><b>2.2</b> Three driving data sources</a></li>
-<li class="chapter" data-level="2.3" data-path="2-intro.html"><a href="2-intro.html#datascience-pipeline"><i class="fa fa-check"></i><b>2.3</b> Data/science pipeline</a></li>
-<li class="chapter" data-level="2.4" data-path="2-intro.html"><a href="2-intro.html#reproducibility"><i class="fa fa-check"></i><b>2.4</b> Reproducibility</a></li>
-<li class="chapter" data-level="2.5" data-path="2-intro.html"><a href="2-intro.html#who-is-this-book-for"><i class="fa fa-check"></i><b>2.5</b> Who is this book for?</a></li>
-</ul></li>
-<li class="part"><span><b>I Data Exploration</b></span></li>
-<li class="chapter" data-level="3" data-path="3-tidy.html"><a href="3-tidy.html"><i class="fa fa-check"></i><b>3</b> Tidy Data</a><ul>
-<li class="chapter" data-level="3.1" data-path="3-tidy.html"><a href="3-tidy.html#what-is-tidy-data"><i class="fa fa-check"></i><b>3.1</b> What is tidy data?</a></li>
-<li class="chapter" data-level="3.2" data-path="3-tidy.html"><a href="3-tidy.html#the-nycflights13-datasets"><i class="fa fa-check"></i><b>3.2</b> The <code>nycflights13</code> datasets</a></li>
-<li class="chapter" data-level="3.3" data-path="3-tidy.html"><a href="3-tidy.html#how-is-flights-tidy"><i class="fa fa-check"></i><b>3.3</b> How is <code>flights</code> tidy?</a></li>
-<li class="chapter" data-level="3.4" data-path="3-tidy.html"><a href="3-tidy.html#normal-forms-of-data"><i class="fa fa-check"></i><b>3.4</b> Normal forms of data</a></li>
-<li class="chapter" data-level="3.5" data-path="3-tidy.html"><a href="3-tidy.html#whats-to-come"><i class="fa fa-check"></i><b>3.5</b> What’s to come?</a></li>
-</ul></li>
-<li class="chapter" data-level="4" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html"><i class="fa fa-check"></i><b>4</b> Data Visualization via <code id="viz">ggplot2</code></a><ul>
-<li class="chapter" data-level="" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#needed-packages"><i class="fa fa-check"></i>Needed packages</a></li>
-<li class="chapter" data-level="4.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#grammarofgraphics"><i class="fa fa-check"></i><b>4.1</b> The Grammar of Graphics</a><ul>
-<li class="chapter" data-level="4.1.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#components-of-grammar"><i class="fa fa-check"></i><b>4.1.1</b> Components of Grammar</a></li>
-<li class="chapter" data-level="4.1.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#napoleans-march-on-moscow"><i class="fa fa-check"></i><b>4.1.2</b> Napolean’s March on Moscow</a></li>
-<li class="chapter" data-level="4.1.3" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#other-components-of-the-grammar"><i class="fa fa-check"></i><b>4.1.3</b> Other Components of the Grammar</a></li>
-<li class="chapter" data-level="4.1.4" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#the-ggplot2-package"><i class="fa fa-check"></i><b>4.1.4</b> The <code>ggplot2</code> Package</a></li>
-</ul></li>
-<li class="chapter" data-level="4.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#five-named-graphs---the-5ng"><i class="fa fa-check"></i><b>4.2</b> Five Named Graphs - The 5NG</a></li>
-<li class="chapter" data-level="4.3" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#scatterplots"><i class="fa fa-check"></i><b>4.3</b> 5NG#1: Scatter-plots</a><ul>
-<li class="chapter" data-level="4.3.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#scatter-plots-via-geom_point"><i class="fa fa-check"></i><b>4.3.1</b> Scatter-plots via <code id="geompoint">geom_point</code></a></li>
-<li class="chapter" data-level="4.3.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#over-plotting"><i class="fa fa-check"></i><b>4.3.2</b> Over-Plotting</a></li>
-<li class="chapter" data-level="4.3.3" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#summary"><i class="fa fa-check"></i><b>4.3.3</b> Summary</a></li>
-</ul></li>
-<li class="chapter" data-level="4.4" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#linegraphs"><i class="fa fa-check"></i><b>4.4</b> 5NG#2: Line-graphs</a><ul>
-<li class="chapter" data-level="4.4.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#line-graphs-via-geom_line"><i class="fa fa-check"></i><b>4.4.1</b> Line-graphs via <code id="geomline">geom_line</code></a></li>
-<li class="chapter" data-level="4.4.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#summary-1"><i class="fa fa-check"></i><b>4.4.2</b> Summary</a></li>
-</ul></li>
-<li class="chapter" data-level="4.5" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#histograms"><i class="fa fa-check"></i><b>4.5</b> 5NG#3: Histograms</a><ul>
-<li class="chapter" data-level="4.5.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#histograms-via-geom_histogram"><i class="fa fa-check"></i><b>4.5.1</b> Histograms via <code id="geomhistogram">geom_histogram</code></a></li>
-<li class="chapter" data-level="4.5.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#adjustbins"><i class="fa fa-check"></i><b>4.5.2</b> Adjusting the Bins</a></li>
-<li class="chapter" data-level="4.5.3" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#summary-2"><i class="fa fa-check"></i><b>4.5.3</b> Summary</a></li>
-</ul></li>
-<li class="chapter" data-level="4.6" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#facets"><i class="fa fa-check"></i><b>4.6</b> Facets</a></li>
-<li class="chapter" data-level="4.7" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#ng4-boxplots"><i class="fa fa-check"></i><b>4.7</b> 5NG#4: Boxplots</a><ul>
-<li class="chapter" data-level="4.7.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#boxplots-via-geom_boxplot"><i class="fa fa-check"></i><b>4.7.1</b> Boxplots via <code id="geomboxplot">geom_boxplot</code></a></li>
-<li class="chapter" data-level="4.7.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#summary-3"><i class="fa fa-check"></i><b>4.7.2</b> Summary</a></li>
-</ul></li>
-<li class="chapter" data-level="4.8" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#geombar"><i class="fa fa-check"></i><b>4.8</b> 5NG#5: Barplots</a><ul>
-<li class="chapter" data-level="4.8.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#barplots-via-geom_bar"><i class="fa fa-check"></i><b>4.8.1</b> Barplots via <code>geom_bar</code></a></li>
-<li class="chapter" data-level="4.8.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#must-avoid-pie-charts"><i class="fa fa-check"></i><b>4.8.2</b> Must avoid pie charts!</a></li>
-<li class="chapter" data-level="4.8.3" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#using-barplots-to-compare-two-variables"><i class="fa fa-check"></i><b>4.8.3</b> Using barplots to compare two variables</a></li>
-<li class="chapter" data-level="4.8.4" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#summary-4"><i class="fa fa-check"></i><b>4.8.4</b> Summary</a></li>
-</ul></li>
-<li class="chapter" data-level="4.9" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#conclusion"><i class="fa fa-check"></i><b>4.9</b> Conclusion</a><ul>
-<li class="chapter" data-level="4.9.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#whats-to-come-1"><i class="fa fa-check"></i><b>4.9.1</b> What’s to come?</a></li>
-<li class="chapter" data-level="4.9.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#script-of-r-code"><i class="fa fa-check"></i><b>4.9.2</b> Script of R code</a></li>
-</ul></li>
-</ul></li>
-<li class="chapter" data-level="5" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html"><i class="fa fa-check"></i><b>5</b> Data Manipulation via <code id="manip">dplyr</code></a><ul>
-<li class="chapter" data-level="" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#needed-packages-1"><i class="fa fa-check"></i>Needed packages</a></li>
-<li class="chapter" data-level="5.1" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#the-pipe"><i class="fa fa-check"></i><b>5.1</b> The pipe <code>%&gt;%</code></a></li>
-<li class="chapter" data-level="5.2" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#four-main-verbs---the-4mv"><i class="fa fa-check"></i><b>5.2</b> Four Main Verbs - The 4MV</a><ul>
-<li class="chapter" data-level="5.2.1" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#filter-observations-using-filter"><i class="fa fa-check"></i><b>5.2.1</b> Filter observations using <code id="filter">filter</code></a></li>
-<li class="chapter" data-level="5.2.2" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#summarize-variables-using-summarize"><i class="fa fa-check"></i><b>5.2.2</b> Summarize variables using <code>summarize</code></a></li>
-<li class="chapter" data-level="5.2.3" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#create-new-variableschange-old-variables-using-mutate"><i class="fa fa-check"></i><b>5.2.3</b> Create new variables/change old variables using <code>mutate</code></a></li>
-<li class="chapter" data-level="5.2.4" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#reorder-the-data-frame-using-arrange"><i class="fa fa-check"></i><b>5.2.4</b> Reorder the data frame using <code id="arrange">arrange</code></a></li>
-</ul></li>
-<li class="chapter" data-level="5.3" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#other-verbs"><i class="fa fa-check"></i><b>5.3</b> Other verbs</a><ul>
-<li class="chapter" data-level="5.3.1" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#select-variables-using-select"><i class="fa fa-check"></i><b>5.3.1</b> Select variables using <code id="select">select</code></a></li>
-<li class="chapter" data-level="5.3.2" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#rename-variables-using-rename"><i class="fa fa-check"></i><b>5.3.2</b> Rename variables using <code id="rename">rename</code></a></li>
-<li class="chapter" data-level="5.3.3" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#find-the-top-number-of-values-using-top_n"><i class="fa fa-check"></i><b>5.3.3</b> Find the top number of values using <code>top_n</code></a></li>
-</ul></li>
-<li class="chapter" data-level="5.4" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#joiningmerging-data-frames"><i class="fa fa-check"></i><b>5.4</b> Joining/merging data frames</a></li>
-<li class="chapter" data-level="5.5" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#script-of-r-code-1"><i class="fa fa-check"></i><b>5.5</b> Script of R code</a></li>
-<li class="chapter" data-level="5.6" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#whats-to-come-2"><i class="fa fa-check"></i><b>5.6</b> What’s to come?</a></li>
-</ul></li>
-<li class="part"><span><b>II Inference</b></span></li>
-<li class="chapter" data-level="6" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html"><i class="fa fa-check"></i><b>6</b> Simulating Randomness via <code id="sim">mosaic</code></a><ul>
-<li class="chapter" data-level="" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#needed-packages-2"><i class="fa fa-check"></i>Needed packages</a></li>
-<li class="chapter" data-level="6.1" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#random-sampling"><i class="fa fa-check"></i><b>6.1</b> Random sampling</a><ul>
-<li class="chapter" data-level="6.1.1" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#tasting-soup"><i class="fa fa-check"></i><b>6.1.1</b> Tasting soup</a></li>
-<li class="chapter" data-level="6.1.2" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#common-terms"><i class="fa fa-check"></i><b>6.1.2</b> Common terms</a></li>
-</ul></li>
-<li class="chapter" data-level="6.2" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#visualizing-sampling"><i class="fa fa-check"></i><b>6.2</b> Visualizing sampling</a><ul>
-<li class="chapter" data-level="6.2.1" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#sampling-distribution"><i class="fa fa-check"></i><b>6.2.1</b> Sampling distribution</a></li>
-<li class="chapter" data-level="6.2.2" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#repeated-sampling-via-do"><i class="fa fa-check"></i><b>6.2.2</b> Repeated sampling via <code>do</code></a></li>
-</ul></li>
-<li class="chapter" data-level="6.3" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#simulation"><i class="fa fa-check"></i><b>6.3</b> Simulation</a></li>
-<li class="chapter" data-level="6.4" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#review-of-mosaic-simulation-functions"><i class="fa fa-check"></i><b>6.4</b> Review of <code>mosaic</code> simulation functions</a></li>
-<li class="chapter" data-level="6.5" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#script-of-r-code-2"><i class="fa fa-check"></i><b>6.5</b> Script of R code</a></li>
-<li class="chapter" data-level="6.6" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#whats-to-come-3"><i class="fa fa-check"></i><b>6.6</b> What’s to come?</a></li>
-</ul></li>
-<li class="chapter" data-level="7" data-path="7-hypo.html"><a href="7-hypo.html"><i class="fa fa-check"></i><b>7</b> Hypothesis Testing</a><ul>
-<li class="chapter" data-level="" data-path="7-hypo.html"><a href="7-hypo.html#needed-packages-3"><i class="fa fa-check"></i>Needed packages</a></li>
-<li class="chapter" data-level="7.1" data-path="7-hypo.html"><a href="7-hypo.html#when-inference-is-not-needed"><i class="fa fa-check"></i><b>7.1</b> When Inference Is Not Needed</a></li>
-<li class="chapter" data-level="7.2" data-path="7-hypo.html"><a href="7-hypo.html#basics-of-hypothesis-testing"><i class="fa fa-check"></i><b>7.2</b> Basics of Hypothesis Testing</a></li>
-<li class="chapter" data-level="7.3" data-path="7-hypo.html"><a href="7-hypo.html#trial"><i class="fa fa-check"></i><b>7.3</b> Criminal trial analogy</a><ul>
-<li class="chapter" data-level="7.3.1" data-path="7-hypo.html"><a href="7-hypo.html#two-possible-conclusions"><i class="fa fa-check"></i><b>7.3.1</b> Two possible conclusions</a></li>
-</ul></li>
-<li class="chapter" data-level="7.4" data-path="7-hypo.html"><a href="7-hypo.html#types-of-errors-in-hypothesis-testing"><i class="fa fa-check"></i><b>7.4</b> Types of Errors in Hypothesis Testing</a><ul>
-<li class="chapter" data-level="7.4.1" data-path="7-hypo.html"><a href="7-hypo.html#logic-of-hypothesis-testing"><i class="fa fa-check"></i><b>7.4.1</b> Logic of Hypothesis Testing</a></li>
-</ul></li>
-<li class="chapter" data-level="7.5" data-path="7-hypo.html"><a href="7-hypo.html#statistical-significance"><i class="fa fa-check"></i><b>7.5</b> Statistical Significance</a></li>
-<li class="chapter" data-level="7.6" data-path="7-hypo.html"><a href="7-hypo.html#example-revisiting-the-lady-tasting-tea"><i class="fa fa-check"></i><b>7.6</b> EXAMPLE: Revisiting the Lady Tasting Tea</a><ul>
-<li class="chapter" data-level="7.6.1" data-path="7-hypo.html"><a href="7-hypo.html#data"><i class="fa fa-check"></i><b>7.6.1</b> Data</a></li>
-<li class="chapter" data-level="7.6.2" data-path="7-hypo.html"><a href="7-hypo.html#test-statistic-delta"><i class="fa fa-check"></i><b>7.6.2</b> Test Statistic <span class="math inline">\(\delta\)</span></a></li>
-<li class="chapter" data-level="7.6.3" data-path="7-hypo.html"><a href="7-hypo.html#observed-effect-delta"><i class="fa fa-check"></i><b>7.6.3</b> Observed effect <span class="math inline">\(\delta^*\)</span></a></li>
-<li class="chapter" data-level="7.6.4" data-path="7-hypo.html"><a href="7-hypo.html#model-of-h_0"><i class="fa fa-check"></i><b>7.6.4</b> Model of <span class="math inline">\(H_0\)</span></a></li>
-<li class="chapter" data-level="7.6.5" data-path="7-hypo.html"><a href="7-hypo.html#simulated-data"><i class="fa fa-check"></i><b>7.6.5</b> Simulated Data</a></li>
-<li class="chapter" data-level="7.6.6" data-path="7-hypo.html"><a href="7-hypo.html#distribution-of-delta-under-h_0"><i class="fa fa-check"></i><b>7.6.6</b> Distribution of <span class="math inline">\(\delta\)</span> under <span class="math inline">\(H_0\)</span></a></li>
-<li class="chapter" data-level="7.6.7" data-path="7-hypo.html"><a href="7-hypo.html#the-p-value"><i class="fa fa-check"></i><b>7.6.7</b> The p-value</a></li>
-</ul></li>
-<li class="chapter" data-level="7.7" data-path="7-hypo.html"><a href="7-hypo.html#example-comparing-two-means"><i class="fa fa-check"></i><b>7.7</b> EXAMPLE: Comparing two means</a><ul>
-<li class="chapter" data-level="7.7.1" data-path="7-hypo.html"><a href="7-hypo.html#randomizationpermutation"><i class="fa fa-check"></i><b>7.7.1</b> Randomization/Permutation</a></li>
-<li class="chapter" data-level="7.7.2" data-path="7-hypo.html"><a href="7-hypo.html#comparing-action-and-romance-movies"><i class="fa fa-check"></i><b>7.7.2</b> Comparing Action and Romance Movies</a></li>
-<li class="chapter" data-level="7.7.3" data-path="7-hypo.html"><a href="7-hypo.html#sampling-rightarrow-randomization"><i class="fa fa-check"></i><b>7.7.3</b> Sampling <span class="math inline">\(\rightarrow\)</span> Randomization</a></li>
-<li class="chapter" data-level="7.7.4" data-path="7-hypo.html"><a href="7-hypo.html#data-1"><i class="fa fa-check"></i><b>7.7.4</b> Data</a></li>
-<li class="chapter" data-level="7.7.5" data-path="7-hypo.html"><a href="7-hypo.html#model-of-h_0-1"><i class="fa fa-check"></i><b>7.7.5</b> Model of <span class="math inline">\(H_0\)</span></a></li>
-<li class="chapter" data-level="7.7.6" data-path="7-hypo.html"><a href="7-hypo.html#test-statistic-delta-1"><i class="fa fa-check"></i><b>7.7.6</b> Test Statistic <span class="math inline">\(\delta\)</span></a></li>
-<li class="chapter" data-level="7.7.7" data-path="7-hypo.html"><a href="7-hypo.html#observed-effect-delta-1"><i class="fa fa-check"></i><b>7.7.7</b> Observed effect <span class="math inline">\(\delta^*\)</span></a></li>
-<li class="chapter" data-level="7.7.8" data-path="7-hypo.html"><a href="7-hypo.html#simulated-data-1"><i class="fa fa-check"></i><b>7.7.8</b> Simulated Data</a></li>
-<li class="chapter" data-level="7.7.9" data-path="7-hypo.html"><a href="7-hypo.html#distribution-of-delta-under-h_0-1"><i class="fa fa-check"></i><b>7.7.9</b> Distribution of <span class="math inline">\(\delta\)</span> under <span class="math inline">\(H_0\)</span></a></li>
-<li class="chapter" data-level="7.7.10" data-path="7-hypo.html"><a href="7-hypo.html#the-p-value-1"><i class="fa fa-check"></i><b>7.7.10</b> The p-value</a></li>
-<li class="chapter" data-level="7.7.11" data-path="7-hypo.html"><a href="7-hypo.html#summary-5"><i class="fa fa-check"></i><b>7.7.11</b> Summary</a></li>
-</ul></li>
-<li class="chapter" data-level="7.8" data-path="7-hypo.html"><a href="7-hypo.html#theory-hypo"><i class="fa fa-check"></i><b>7.8</b> Building theory-based methods using computation</a><ul>
-<li class="chapter" data-level="7.8.1" data-path="7-hypo.html"><a href="7-hypo.html#example-t-test-for-two-independent-samples"><i class="fa fa-check"></i><b>7.8.1</b> EXAMPLE: <span class="math inline">\(t\)</span>-test for two independent samples</a></li>
-<li class="chapter" data-level="7.8.2" data-path="7-hypo.html"><a href="7-hypo.html#conditions-for-t-test"><i class="fa fa-check"></i><b>7.8.2</b> Conditions for t-test</a></li>
-</ul></li>
-<li class="chapter" data-level="7.9" data-path="7-hypo.html"><a href="7-hypo.html#whats-to-come-4"><i class="fa fa-check"></i><b>7.9</b> What’s to come?</a></li>
-</ul></li>
-<li class="chapter" data-level="8" data-path="8-ci.html"><a href="8-ci.html"><i class="fa fa-check"></i><b>8</b> Confidence Intervals</a><ul>
-<li class="chapter" data-level="" data-path="8-ci.html"><a href="8-ci.html#needed-packages-4"><i class="fa fa-check"></i>Needed packages</a></li>
-<li class="chapter" data-level="8.1" data-path="8-ci.html"><a href="8-ci.html#bootstrapping"><i class="fa fa-check"></i><b>8.1</b> Bootstrapping</a></li>
-<li class="chapter" data-level="8.2" data-path="8-ci.html"><a href="8-ci.html#mean_rating"><i class="fa fa-check"></i><b>8.2</b> mean_rating</a><ul>
-<li class="chapter" data-level="8.2.1" data-path="8-ci.html"><a href="8-ci.html#review-of-bootstrapping"><i class="fa fa-check"></i><b>8.2.1</b> Review of Bootstrapping</a></li>
-</ul></li>
-<li class="chapter" data-level="8.3" data-path="8-ci.html"><a href="8-ci.html#relation-to-hypothesis-testing"><i class="fa fa-check"></i><b>8.3</b> Relation to hypothesis testing</a></li>
-<li class="chapter" data-level="8.4" data-path="8-ci.html"><a href="8-ci.html#effect-size"><i class="fa fa-check"></i><b>8.4</b> Effect size</a></li>
-<li class="chapter" data-level="8.5" data-path="8-ci.html"><a href="8-ci.html#script-of-r-code-3"><i class="fa fa-check"></i><b>8.5</b> Script of R code</a></li>
-<li class="chapter" data-level="8.6" data-path="8-ci.html"><a href="8-ci.html#whats-to-come-5"><i class="fa fa-check"></i><b>8.6</b> What’s to come?</a></li>
-</ul></li>
-<li class="chapter" data-level="9" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html"><i class="fa fa-check"></i><b>9</b> Regression via <code id="regress">broom</code></a><ul>
-<li class="chapter" data-level="" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#needed-packages-5"><i class="fa fa-check"></i>Needed packages</a></li>
-<li class="chapter" data-level="9.1" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#example-alaskan-airlines-delays"><i class="fa fa-check"></i><b>9.1</b> EXAMPLE: Alaskan Airlines delays</a></li>
-<li class="chapter" data-level="9.2" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#correlation"><i class="fa fa-check"></i><b>9.2</b> Correlation</a><ul>
-<li class="chapter" data-level="9.2.1" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#correlation-does-not-imply-causation"><i class="fa fa-check"></i><b>9.2.1</b> Correlation does not imply causation</a></li>
-</ul></li>
-<li class="chapter" data-level="9.3" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#linear-regression"><i class="fa fa-check"></i><b>9.3</b> Linear regression</a><ul>
-<li class="chapter" data-level="9.3.1" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#understanding-linear-regression-basics"><i class="fa fa-check"></i><b>9.3.1</b> Understanding linear regression basics</a></li>
-<li class="chapter" data-level="9.3.2" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#the-equation-of-the-line"><i class="fa fa-check"></i><b>9.3.2</b> The equation of the line</a></li>
-<li class="chapter" data-level="9.3.3" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#interpretting-the-slope"><i class="fa fa-check"></i><b>9.3.3</b> Interpreting the slope</a></li>
-<li class="chapter" data-level="9.3.4" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#predicting-values"><i class="fa fa-check"></i><b>9.3.4</b> Predicting values</a></li>
-</ul></li>
-<li class="chapter" data-level="9.4" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#inference-for-regression"><i class="fa fa-check"></i><b>9.4</b> Inference for regression</a><ul>
-<li class="chapter" data-level="9.4.1" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#data-2"><i class="fa fa-check"></i><b>9.4.1</b> Data</a></li>
-<li class="chapter" data-level="9.4.2" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#test-statistic-delta-2"><i class="fa fa-check"></i><b>9.4.2</b> Test Statistic <span class="math inline">\(\delta\)</span></a></li>
-<li class="chapter" data-level="9.4.3" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#observed-effect-delta-2"><i class="fa fa-check"></i><b>9.4.3</b> Observed effect <span class="math inline">\(\delta^*\)</span></a></li>
-<li class="chapter" data-level="9.4.4" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#model-of-h_0-2"><i class="fa fa-check"></i><b>9.4.4</b> Model of <span class="math inline">\(H_0\)</span></a></li>
-<li class="chapter" data-level="9.4.5" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#simulated-data-2"><i class="fa fa-check"></i><b>9.4.5</b> Simulated Data</a></li>
-<li class="chapter" data-level="9.4.6" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#distribution-of-delta-under-h_0-2"><i class="fa fa-check"></i><b>9.4.6</b> Distribution of <span class="math inline">\(\delta\)</span> under <span class="math inline">\(H_0\)</span></a></li>
-<li class="chapter" data-level="9.4.7" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#the-p-value-2"><i class="fa fa-check"></i><b>9.4.7</b> The p-value</a></li>
-</ul></li>
-<li class="chapter" data-level="9.5" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#resid"><i class="fa fa-check"></i><b>9.5</b> Residual analysis</a></li>
-<li class="chapter" data-level="9.6" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#conditions-for-regression"><i class="fa fa-check"></i><b>9.6</b> Conditions for regression</a></li>
-<li class="chapter" data-level="9.7" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#script-of-r-code-4"><i class="fa fa-check"></i><b>9.7</b> Script of R code</a></li>
-<li class="chapter" data-level="9.8" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#whats-to-come-6"><i class="fa fa-check"></i><b>9.8</b> What’s to come?</a></li>
-</ul></li>
-<li class="part"><span><b>III Conclusion</b></span></li>
-<li class="chapter" data-level="10" data-path="10-effective-data-storytelling.html"><a href="10-effective-data-storytelling.html"><i class="fa fa-check"></i><b>10</b> Effective Data Storytelling</a><ul>
-<li class="chapter" data-level="" data-path="10-effective-data-storytelling.html"><a href="10-effective-data-storytelling.html#concluding-remarks"><i class="fa fa-check"></i>Concluding Remarks</a></li>
-</ul></li>
-<li class="appendix"><span><b>Appendix</b></span></li>
-<li class="chapter" data-level="A" data-path="A-appendixA.html"><a href="A-appendixA.html"><i class="fa fa-check"></i><b>A</b> Statistical Background</a><ul>
-<li class="chapter" data-level="A.1" data-path="A-appendixA.html"><a href="A-appendixA.html#basic-statistical-terms"><i class="fa fa-check"></i><b>A.1</b> Basic statistical terms</a><ul>
-<li class="chapter" data-level="A.1.1" data-path="A-appendixA.html"><a href="A-appendixA.html#mean"><i class="fa fa-check"></i><b>A.1.1</b> Mean</a></li>
-<li class="chapter" data-level="A.1.2" data-path="A-appendixA.html"><a href="A-appendixA.html#median"><i class="fa fa-check"></i><b>A.1.2</b> Median</a></li>
-<li class="chapter" data-level="A.1.3" data-path="A-appendixA.html"><a href="A-appendixA.html#standard-deviation"><i class="fa fa-check"></i><b>A.1.3</b> Standard deviation</a></li>
-<li class="chapter" data-level="A.1.4" data-path="A-appendixA.html"><a href="A-appendixA.html#five-number-summary"><i class="fa fa-check"></i><b>A.1.4</b> Five-number summary</a></li>
-<li class="chapter" data-level="A.1.5" data-path="A-appendixA.html"><a href="A-appendixA.html#distribution"><i class="fa fa-check"></i><b>A.1.5</b> Distribution</a></li>
-<li class="chapter" data-level="A.1.6" data-path="A-appendixA.html"><a href="A-appendixA.html#outliers"><i class="fa fa-check"></i><b>A.1.6</b> Outliers</a></li>
-</ul></li>
-</ul></li>
-<li class="chapter" data-level="B" data-path="B-appendixB.html"><a href="B-appendixB.html"><i class="fa fa-check"></i><b>B</b> Inference Examples</a><ul>
-<li class="chapter" data-level="B.1" data-path="B-appendixB.html"><a href="B-appendixB.html#needed-packages-6"><i class="fa fa-check"></i><b>B.1</b> Needed packages</a></li>
-<li class="chapter" data-level="B.2" data-path="B-appendixB.html"><a href="B-appendixB.html#one-mean"><i class="fa fa-check"></i><b>B.2</b> One Mean</a><ul>
-<li class="chapter" data-level="B.2.1" data-path="B-appendixB.html"><a href="B-appendixB.html#problem-statement"><i class="fa fa-check"></i><b>B.2.1</b> Problem Statement</a></li>
-<li class="chapter" data-level="B.2.2" data-path="B-appendixB.html"><a href="B-appendixB.html#competing-hypotheses"><i class="fa fa-check"></i><b>B.2.2</b> Competing Hypotheses</a></li>
-<li class="chapter" data-level="B.2.3" data-path="B-appendixB.html"><a href="B-appendixB.html#exploring-the-sample-data"><i class="fa fa-check"></i><b>B.2.3</b> Exploring the sample data</a></li>
-<li class="chapter" data-level="B.2.4" data-path="B-appendixB.html"><a href="B-appendixB.html#non-traditional-methods"><i class="fa fa-check"></i><b>B.2.4</b> Non-traditional methods</a></li>
-<li class="chapter" data-level="B.2.5" data-path="B-appendixB.html"><a href="B-appendixB.html#traditional-methods"><i class="fa fa-check"></i><b>B.2.5</b> Traditional methods</a></li>
-<li class="chapter" data-level="B.2.6" data-path="B-appendixB.html"><a href="B-appendixB.html#comparing-results"><i class="fa fa-check"></i><b>B.2.6</b> Comparing results</a></li>
-</ul></li>
-<li class="chapter" data-level="B.3" data-path="B-appendixB.html"><a href="B-appendixB.html#one-proportion"><i class="fa fa-check"></i><b>B.3</b> One Proportion</a><ul>
-<li class="chapter" data-level="B.3.1" data-path="B-appendixB.html"><a href="B-appendixB.html#problem-statement-1"><i class="fa fa-check"></i><b>B.3.1</b> Problem Statement</a></li>
-<li class="chapter" data-level="B.3.2" data-path="B-appendixB.html"><a href="B-appendixB.html#competing-hypotheses-1"><i class="fa fa-check"></i><b>B.3.2</b> Competing Hypotheses</a></li>
-<li class="chapter" data-level="B.3.3" data-path="B-appendixB.html"><a href="B-appendixB.html#exploring-the-sample-data-1"><i class="fa fa-check"></i><b>B.3.3</b> Exploring the sample data</a></li>
-<li class="chapter" data-level="B.3.4" data-path="B-appendixB.html"><a href="B-appendixB.html#non-traditional-methods-1"><i class="fa fa-check"></i><b>B.3.4</b> Non-traditional methods</a></li>
-<li class="chapter" data-level="B.3.5" data-path="B-appendixB.html"><a href="B-appendixB.html#traditional-methods-1"><i class="fa fa-check"></i><b>B.3.5</b> Traditional methods</a></li>
-<li class="chapter" data-level="B.3.6" data-path="B-appendixB.html"><a href="B-appendixB.html#comparing-results-1"><i class="fa fa-check"></i><b>B.3.6</b> Comparing results</a></li>
-</ul></li>
-<li class="chapter" data-level="B.4" data-path="B-appendixB.html"><a href="B-appendixB.html#two-proportions"><i class="fa fa-check"></i><b>B.4</b> Two Proportions</a><ul>
-<li class="chapter" data-level="B.4.1" data-path="B-appendixB.html"><a href="B-appendixB.html#problem-statement-2"><i class="fa fa-check"></i><b>B.4.1</b> Problem Statement</a></li>
-<li class="chapter" data-level="B.4.2" data-path="B-appendixB.html"><a href="B-appendixB.html#competing-hypotheses-2"><i class="fa fa-check"></i><b>B.4.2</b> Competing Hypotheses</a></li>
-<li class="chapter" data-level="B.4.3" data-path="B-appendixB.html"><a href="B-appendixB.html#exploring-the-sample-data-2"><i class="fa fa-check"></i><b>B.4.3</b> Exploring the sample data</a></li>
-<li class="chapter" data-level="B.4.4" data-path="B-appendixB.html"><a href="B-appendixB.html#non-traditional-methods-2"><i class="fa fa-check"></i><b>B.4.4</b> Non-traditional methods</a></li>
-<li class="chapter" data-level="B.4.5" data-path="B-appendixB.html"><a href="B-appendixB.html#traditional-methods-2"><i class="fa fa-check"></i><b>B.4.5</b> Traditional methods</a></li>
-<li class="chapter" data-level="B.4.6" data-path="B-appendixB.html"><a href="B-appendixB.html#check-conditions-2"><i class="fa fa-check"></i><b>B.4.6</b> Check conditions</a></li>
-<li class="chapter" data-level="B.4.7" data-path="B-appendixB.html"><a href="B-appendixB.html#test-statistic-2"><i class="fa fa-check"></i><b>B.4.7</b> Test statistic</a></li>
-<li class="chapter" data-level="B.4.8" data-path="B-appendixB.html"><a href="B-appendixB.html#state-conclusion-2"><i class="fa fa-check"></i><b>B.4.8</b> State conclusion</a></li>
-<li class="chapter" data-level="B.4.9" data-path="B-appendixB.html"><a href="B-appendixB.html#comparing-results-2"><i class="fa fa-check"></i><b>B.4.9</b> Comparing results</a></li>
-</ul></li>
-<li class="chapter" data-level="B.5" data-path="B-appendixB.html"><a href="B-appendixB.html#two-means-independent-samples"><i class="fa fa-check"></i><b>B.5</b> Two Means (Independent Samples)</a><ul>
-<li class="chapter" data-level="B.5.1" data-path="B-appendixB.html"><a href="B-appendixB.html#problem-statement-3"><i class="fa fa-check"></i><b>B.5.1</b> Problem Statement</a></li>
-<li class="chapter" data-level="B.5.2" data-path="B-appendixB.html"><a href="B-appendixB.html#competing-hypotheses-3"><i class="fa fa-check"></i><b>B.5.2</b> Competing Hypotheses</a></li>
-<li class="chapter" data-level="B.5.3" data-path="B-appendixB.html"><a href="B-appendixB.html#exploring-the-sample-data-3"><i class="fa fa-check"></i><b>B.5.3</b> Exploring the sample data</a></li>
-<li class="chapter" data-level="B.5.4" data-path="B-appendixB.html"><a href="B-appendixB.html#non-traditional-methods-3"><i class="fa fa-check"></i><b>B.5.4</b> Non-traditional methods</a></li>
-<li class="chapter" data-level="B.5.5" data-path="B-appendixB.html"><a href="B-appendixB.html#traditional-methods-3"><i class="fa fa-check"></i><b>B.5.5</b> Traditional methods</a></li>
-<li class="chapter" data-level="B.5.6" data-path="B-appendixB.html"><a href="B-appendixB.html#test-statistic-3"><i class="fa fa-check"></i><b>B.5.6</b> Test statistic</a></li>
-<li class="chapter" data-level="B.5.7" data-path="B-appendixB.html"><a href="B-appendixB.html#compute-p-value-2"><i class="fa fa-check"></i><b>B.5.7</b> Compute <span class="math inline">\(p\)</span>-value</a></li>
-<li class="chapter" data-level="B.5.8" data-path="B-appendixB.html"><a href="B-appendixB.html#state-conclusion-3"><i class="fa fa-check"></i><b>B.5.8</b> State conclusion</a></li>
-<li class="chapter" data-level="B.5.9" data-path="B-appendixB.html"><a href="B-appendixB.html#comparing-results-3"><i class="fa fa-check"></i><b>B.5.9</b> Comparing results</a></li>
-</ul></li>
-<li class="chapter" data-level="B.6" data-path="B-appendixB.html"><a href="B-appendixB.html#two-means-paired-samples"><i class="fa fa-check"></i><b>B.6</b> Two Means (Paired Samples)</a><ul>
-<li class="chapter" data-level="B.6.1" data-path="B-appendixB.html"><a href="B-appendixB.html#competing-hypotheses-4"><i class="fa fa-check"></i><b>B.6.1</b> Competing Hypotheses</a></li>
-<li class="chapter" data-level="B.6.2" data-path="B-appendixB.html"><a href="B-appendixB.html#exploring-the-sample-data-4"><i class="fa fa-check"></i><b>B.6.2</b> Exploring the sample data</a></li>
-<li class="chapter" data-level="B.6.3" data-path="B-appendixB.html"><a href="B-appendixB.html#non-traditional-methods-4"><i class="fa fa-check"></i><b>B.6.3</b> Non-traditional methods</a></li>
-<li class="chapter" data-level="B.6.4" data-path="B-appendixB.html"><a href="B-appendixB.html#traditional-methods-4"><i class="fa fa-check"></i><b>B.6.4</b> Traditional methods</a></li>
-<li class="chapter" data-level="B.6.5" data-path="B-appendixB.html"><a href="B-appendixB.html#comparing-results-4"><i class="fa fa-check"></i><b>B.6.5</b> Comparing results</a></li>
-</ul></li>
-</ul></li>
-<li class="chapter" data-level="C" data-path="C-appendixC.html"><a href="C-appendixC.html"><i class="fa fa-check"></i><b>C</b> Reach for the Stars</a><ul>
-<li class="chapter" data-level="C.1" data-path="C-appendixC.html"><a href="C-appendixC.html#sorted-barplots"><i class="fa fa-check"></i><b>C.1</b> Sorted barplots</a></li>
-<li class="chapter" data-level="C.2" data-path="C-appendixC.html"><a href="C-appendixC.html#interactive-graphics"><i class="fa fa-check"></i><b>C.2</b> Interactive graphics</a><ul>
-<li class="chapter" data-level="C.2.1" data-path="C-appendixC.html"><a href="C-appendixC.html#interactive-line-graphs"><i class="fa fa-check"></i><b>C.2.1</b> Interactive line-graphs</a></li>
-</ul></li>
-</ul></li>
-<li class="chapter" data-level="" data-path="references.html"><a href="references.html"><i class="fa fa-check"></i>References</a></li>
-</ul>
-
-      </nav>
-    </div>
-
-    <div class="book-body">
-      <div class="body-inner">
-        <div class="book-header" role="navigation">
-          <h1>
-            <i class="fa fa-circle-o-notch fa-spin"></i><a href="./">ModernDive</a>
-          </h1>
-        </div>
-
-        <div class="page-wrapper" tabindex="-1" role="main">
-          <div class="page-inner">
-
-            <section class="normal" id="section-">
-<div id="data-manipulation-via-dplyr" class="section level1">
-<h1><span class="header-section-number">5</span> Data Manipulation via <code id="manip">dplyr</code></h1>
-<!--
-- Make sure to refer back to plots in the viz chapter and how the
-  material here relates to answering those questions
--->
-<p>Let’s briefly recap where we have been so far and where we are headed. In Chapter <a href="3-tidy.html#tidy">3</a>, we discussed what it means for data to be tidy. We saw that this refers to observational units corresponding to rows and variables being stored in columns. The entries in the data frame correspond to different combinations of observational units and variables. In the <code>flights</code> data frame, we saw that each row corresponded to a different flight leaving New York City. (In other words, the observational unit of that tidy data frame is a flight.) The variables are listed as columns and for <code>flights</code> they include both quantitative variables like <code>dep_delay</code> and <code>distance</code> but also categorical variables like <code>carrier</code> and <code>origin</code>. An entry in the table corresponds to a particular flight on a given day and a particular value of a given variable representing that flight.</p>
-<p>We saw in Chapter <a href="4-data-visualization-via-ggplot2.html#viz">4</a> that organizing data in this tidy way makes it easy for us to produce graphics. We can simply specify what variable/column we would like on one axis, what variable we’d like on the other axis, and what type of plot we’d like to make. We can also do things such as changing the color by another variable or changing the size of our points by a fourth variable given this tidy data set.</p>
-<p>In Chapter <a href="4-data-visualization-via-ggplot2.html#viz">4</a>, we also introduced some ways to summarize and manipulate data to suit your needs. This chapter focuses more on the details of this by giving a variety of examples using the four main verbs in the <code>dplyr</code> package <span class="citation">(Wickham and Francois <a href="#ref-R-dplyr">2016</a>)</span>. There are more advanced operations that can be done than these and you’ll see some examples of this near the end of the chapter.</p>
-<div id="needed-packages-1" class="section level2 unnumbered">
-<h2>Needed packages</h2>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">library</span>(dplyr)
-<span class="kw">library</span>(ggplot2)
-<span class="kw">library</span>(nycflights13)
-<span class="kw">library</span>(knitr)</code></pre></div>
-</div>
-<div id="the-pipe" class="section level2">
-<h2><span class="header-section-number">5.1</span> The pipe <code>%&gt;%</code></h2>
-<p>Just as the <code>+</code> sign was used to add layers to a plot created using <code>ggplot</code> we will use the pipe operator (<code>%&gt;%</code>) to chain together <code>dplyr</code> functions. We read the pipe operator as “and then”. The <code>%&gt;%</code> operator allows us to go from one step in <code>dplyr</code> to the next easily so we can <code>filter</code> our data frame to only focus on a few rows, and then take that filtered data set, and <code>group_by</code> another variable, and then lastly <code>summarize</code> this grouped data to calculate the mean for each level of the group.</p>
-<p>The piping syntax will be our major focus throughout the rest of this book and you’ll find that you’ll quickly be addicted to the chaining with some practice. If you’d like to see more examples on using <code>dplyr</code>, the 4MV (in addition to some other <code>dplyr</code> verbs), and <code>%&gt;%</code> with the <code>nycflights13</code> data set, you can check out Chapter 5 of Hadley and Garrett’s book <span class="citation">(Grolemund and Wickham <a href="#ref-rds2016">2016</a>)</span>.</p>
-</div>
-<div id="four-main-verbs---the-4mv" class="section level2">
-<h2><span class="header-section-number">5.2</span> Four Main Verbs - The 4MV</h2>
-<p>The <code>d</code> in <code>dplyr</code> stands for data frames so the functions here work when you are working with objects of the data frame type. It’s most important for you to focus on the four most commonly used functions that help us manipulate and summarize data. A description of these verbs follows with each subsection devoted to seeing an example of that verb in play (or a combination of a few verbs):</p>
-<ul>
-<li><code>filter</code>: Pick rows based on conditions about their values</li>
-<li><code>summarize</code>: Create summary measures of variables (or groups of observations on variables using <code>group_by</code>)</li>
-<li><code>mutate</code>: Make a new variable in the data frame</li>
-<li><code>arrange</code>: Sort the rows based on one or more variables</li>
-</ul>
-<p>Just as we had the 5NG (The Five Named Graphs in Chapter <a href="4-data-visualization-via-ggplot2.html#viz">4</a> using <code>ggplot2</code>), we have the 4MV here (The Four Main Verbs in <code>dplyr</code>):</p>
-<div id="filter-observations-using-filter" class="section level3">
-<h3><span class="header-section-number">5.2.1</span> Filter observations using <code id="filter">filter</code></h3>
-<div class="figure" style="text-align: center"><span id="fig:filter"></span>
-<img src="images/filter.png" alt="Filter diagram from Data Wrangling with dplyr and tidyr cheatsheet" width="\textwidth" />
-<p class="caption">
-Figure 5.1: Filter diagram from Data Wrangling with dplyr and tidyr cheatsheet
-</p>
-</div>
-<p>All of the 4MVs follow the same syntax with the argument before the pipe being the name of the data frame and then the name of the verb with other arguments specifying which criteria you’d like the verb to work with in parentheses.</p>
-<p>The <code>filter</code> function here works much like the “Filter” option in Microsoft Excel. It allows you to specify criteria about values of a variable in your data set and then chooses only those rows that match those criteria. We begin by focusing only on flights from New York City to Portland, Oregon. The <code>dest</code> code (or airport code) for Portland, Oregon is <code>&quot;PDX&quot;</code>:</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">portland_flights &lt;-<span class="st"> </span>flights %&gt;%<span class="st"> </span><span class="kw">filter</span>(dest ==<span class="st"> &quot;PDX&quot;</span>)
-portland_flights</code></pre></div>
-<pre><code>## # A tibble: 1,354 × 19
-##     year month   day dep_time sched_dep_time dep_delay arr_time
-##    &lt;int&gt; &lt;int&gt; &lt;int&gt;    &lt;int&gt;          &lt;int&gt;     &lt;dbl&gt;    &lt;int&gt;
-## 1   2013     1     1     1739           1740        -1     2051
-## 2   2013     1     1     1805           1757         8     2117
-## 3   2013     1     1     2052           2029        23     2349
-## 4   2013     1     2      804            805        -1     1039
-## 5   2013     1     2     1552           1550         2     1853
-## 6   2013     1     2     1727           1720         7     2042
-## 7   2013     1     2     1738           1740        -2     2028
-## 8   2013     1     2     2024           2029        -5     2314
-## 9   2013     1     3     1755           1745        10     2110
-## 10  2013     1     3     1814           1727        47     2108
-## # ... with 1,344 more rows, and 12 more variables:
-## #   sched_arr_time &lt;int&gt;, arr_delay &lt;dbl&gt;, carrier &lt;chr&gt;, flight &lt;int&gt;,
-## #   tailnum &lt;chr&gt;, origin &lt;chr&gt;, dest &lt;chr&gt;, air_time &lt;dbl&gt;,
-## #   distance &lt;dbl&gt;, hour &lt;dbl&gt;, minute &lt;dbl&gt;, time_hour &lt;dttm&gt;</code></pre>
-<p>Note the second equals sign here. You are almost guaranteed to make the mistake at least once of only including one equals sign. Let’s see what happens when we make this error:</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">portland_flights &lt;-<span class="st"> </span>flights %&gt;%<span class="st"> </span><span class="kw">filter</span>(<span class="dt">dest =</span> <span class="st">&quot;PDX&quot;</span>)</code></pre></div>
-<pre><code>Error: filter() takes unnamed arguments. Do you need `==`?</code></pre>
-<p>You should run <code>View(portland_flights)</code> to glance at the data in spreadsheet form and ensure that only flights heading to Portland are chosen here.</p>
-<p>You can combine multiple criteria together using operators that make comparisons:</p>
-<ul>
-<li><code>|</code> corresponds to “or”</li>
-<li><code>&amp;</code> corresponds to “and”</li>
-</ul>
-<p>We can often skip the use of <code>&amp;</code> and just separate our conditions with a comma. You’ll see this in the example below.</p>
-<p>In addition, you can use other mathematical checks (similar to <code>==</code>):</p>
-<ul>
-<li><code>&gt;</code> corresponds to “greater than”</li>
-<li><code>&lt;</code> corresponds to “less than”</li>
-<li><code>&gt;=</code> corresponds to “greater than or equal to”</li>
-<li><code>&lt;=</code> corresponds to “less than or equal to”</li>
-<li><code>!=</code> corresponds to “not equal to”</li>
-</ul>
-<p>To see many of these in action, let’s select all flights that left JFK airport heading to Burlington, Vermont (<code>&quot;BTV&quot;</code>) or Seattle, Washington (<code>&quot;SEA&quot;</code>) in the months of October, November, or December:</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">btv_sea_flights_fall &lt;-<span class="st"> </span>flights %&gt;%<span class="st"> </span><span class="kw">filter</span>(
-                               origin ==<span class="st"> &quot;JFK&quot;</span>, 
-                               (dest ==<span class="st"> &quot;BTV&quot;</span>) |<span class="st"> </span>(dest ==<span class="st"> &quot;SEA&quot;</span>),
-                               month &gt;=<span class="st"> </span><span class="dv">10</span>)</code></pre></div>
-<p>Another example uses the <code>!</code> to pick rows that <strong>DON’T</strong> match a condition. Here we are referring to excluding the Northern Hemisphere summer months of June, July, and August.</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">not_summer_flights &lt;-<span class="st"> </span>flights %&gt;%<span class="st"> </span><span class="kw">filter</span>(!<span class="kw">between</span>(month, <span class="dv">6</span>, <span class="dv">8</span>))
-not_summer_flights</code></pre></div>
-<pre><code>## # A tibble: 249,781 × 19
-##     year month   day dep_time sched_dep_time dep_delay arr_time
-##    &lt;int&gt; &lt;int&gt; &lt;int&gt;    &lt;int&gt;          &lt;int&gt;     &lt;dbl&gt;    &lt;int&gt;
-## 1   2013     1     1      517            515         2      830
-## 2   2013     1     1      533            529         4      850
-## 3   2013     1     1      542            540         2      923
-## 4   2013     1     1      544            545        -1     1004
-## 5   2013     1     1      554            600        -6      812
-## 6   2013     1     1      554            558        -4      740
-## 7   2013     1     1      555            600        -5      913
-## 8   2013     1     1      557            600        -3      709
-## 9   2013     1     1      557            600        -3      838
-## 10  2013     1     1      558            600        -2      753
-## # ... with 249,771 more rows, and 12 more variables:
-## #   sched_arr_time &lt;int&gt;, arr_delay &lt;dbl&gt;, carrier &lt;chr&gt;, flight &lt;int&gt;,
-## #   tailnum &lt;chr&gt;, origin &lt;chr&gt;, dest &lt;chr&gt;, air_time &lt;dbl&gt;,
-## #   distance &lt;dbl&gt;, hour &lt;dbl&gt;, minute &lt;dbl&gt;, time_hour &lt;dttm&gt;</code></pre>
-<p>To check that we are correct here we can use the <code>count</code> function in the <code>dplyr</code> package on the <code>month</code> variable in our <code>not_summer_flights</code> data frame to ensure June, July, and August are not selected:</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">not_summer_flights %&gt;%<span class="st"> </span><span class="kw">count</span>(month)</code></pre></div>
-<pre><code>## # A tibble: 1 × 1
-##    `1.n`
-##    &lt;int&gt;
-## 1 249781</code></pre>
-<p>The function <code>between</code> is a shortcut. We could also have written the following to get the same result:</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">not_summer2 &lt;-<span class="st"> </span>flights %&gt;%<span class="st"> </span><span class="kw">filter</span>(month &lt;=<span class="st"> </span><span class="dv">5</span> |<span class="st"> </span>month &gt;=<span class="st"> </span><span class="dv">9</span>)
-not_summer2 %&gt;%<span class="st"> </span><span class="kw">count</span>(month)</code></pre></div>
-<pre><code>## # A tibble: 1 × 1
-##    `1.n`
-##    &lt;int&gt;
-## 1 249781</code></pre>
-<div class="learncheck">
-<p>
-<strong><em>Learning check</em></strong>
-</p>
-</div>
-<p><strong>(LC5.1)</strong> What’s another way using <code>!</code> we could filter only the rows that are not summer months (June, July, or August) in the <code>flights</code> data frame?</p>
-<hr />
-</div>
-<div id="summarize-variables-using-summarize" class="section level3">
-<h3><span class="header-section-number">5.2.2</span> Summarize variables using <code>summarize</code></h3>
-<div class="figure" style="text-align: center"><span id="fig:sum1"></span>
-<img src="images/summarize1.png" alt="Summarize diagram from Data Wrangling with dplyr and tidyr cheatsheet" width="\textwidth" />
-<p class="caption">
-Figure 5.2: Summarize diagram from Data Wrangling with dplyr and tidyr cheatsheet
-</p>
-</div>
-<div class="figure" style="text-align: center"><span id="fig:sum2"></span>
-<img src="images/summary.png" alt="Another summarize diagram from Data Wrangling with dplyr and tidyr cheatsheet" width="\textwidth" />
-<p class="caption">
-Figure 5.3: Another summarize diagram from Data Wrangling with dplyr and tidyr cheatsheet
-</p>
-</div>
-<p>We saw in Subsection <a href="#contsum"><strong>??</strong></a> a way to calculate the standard deviation and mean of the temperature variable <code>temp</code> in the <code>weather</code> data frame of <code>nycflights13</code>. We can do so in one step using the <code>summarize</code> function in <code>dplyr</code>:</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">weather %&gt;%<span class="st"> </span><span class="kw">summarize</span>(<span class="dt">mean =</span> <span class="kw">mean</span>(temp), <span class="dt">std_dev =</span> <span class="kw">sd</span>(temp))</code></pre></div>
-<pre><code>## # A tibble: 1 × 2
-##    mean std_dev
-##   &lt;dbl&gt;   &lt;dbl&gt;
-## 1    NA      NA</code></pre>
-<p>What happened here? The mean and the standard deviation temperatures are missing? Remember that by default the <code>mean</code> and <code>sd</code> functions do not ignore missing values. We need to specify <code>TRUE</code> for the <code>na.rm</code> parameter:</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">summary_temp &lt;-<span class="st"> </span>weather %&gt;%<span class="st"> </span>
-<span class="st">  </span><span class="kw">summarize</span>(<span class="dt">mean =</span> <span class="kw">mean</span>(temp, <span class="dt">na.rm =</span> <span class="ot">TRUE</span>),
-          <span class="dt">std_dev =</span> <span class="kw">sd</span>(temp, <span class="dt">na.rm =</span> <span class="ot">TRUE</span>))
-summary_temp</code></pre></div>
-<pre><code>## # A tibble: 1 × 2
-##       mean  std_dev
-##      &lt;dbl&gt;    &lt;dbl&gt;
-## 1 55.20351 17.78212</code></pre>
-<!--
-Note that 
-  summarize(mean = mean(temp, na.rm = TRUE)) %>% 
-  summarize(std_dev = sd(temp, na.rm = TRUE))
-does not work
--->
-<p>We’ve created a small data frame here called <code>summary_temp</code> that includes both the <code>mean</code> and the <code>std_dev</code> of the <code>temp</code> variable in <code>weather</code>. If we’d like to access either of these values directly we can use the <code>$</code> to specify a column in a data frame:</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">summary_temp$mean</code></pre></div>
-<pre><code>## [1] 55.20351</code></pre>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">summary_temp$std_dev</code></pre></div>
-<pre><code>## [1] 17.78212</code></pre>
-<p>It’s often more useful to summarize a variable based on the groupings of another variable. Let’s say we were interested in the mean and standard deviation of temperatures for each month. We believe that you will be amazed at just how simple this is:</p>
-<div class="figure" style="text-align: center"><span id="fig:groupsummarize"></span>
-<img src="images/group_summary.png" alt="Group by and summarize diagram from Data Wrangling with dplyr and tidyr cheatsheet" width="\textwidth" />
-<p class="caption">
-Figure 5.4: Group by and summarize diagram from Data Wrangling with dplyr and tidyr cheatsheet
-</p>
-</div>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">summary_tempXmonth &lt;-<span class="st"> </span>weather %&gt;%<span class="st"> </span>
-<span class="st">  </span><span class="kw">group_by</span>(month) %&gt;%<span class="st"> </span>
-<span class="st">  </span><span class="kw">summarize</span>(<span class="dt">mean =</span> <span class="kw">mean</span>(temp, <span class="dt">na.rm =</span> <span class="ot">TRUE</span>),
-          <span class="dt">std_dev =</span> <span class="kw">sd</span>(temp, <span class="dt">na.rm =</span> <span class="ot">TRUE</span>))
-summary_tempXmonth</code></pre></div>
-<pre><code>## # A tibble: 12 × 3
-##    month     mean   std_dev
-##    &lt;dbl&gt;    &lt;dbl&gt;     &lt;dbl&gt;
-## 1      1 35.64127 10.185459
-## 2      2 34.15454  6.940228
-## 3      3 39.81404  6.224948
-## 4      4 51.67094  8.785250
-## 5      5 61.59185  9.608687
-## 6      6 72.14500  7.603356
-## 7      7 80.00967  7.147631
-## 8      8 74.40495  5.171365
-## 9      9 67.42582  8.475824
-## 10    10 60.03305  8.829652
-## 11    11 45.10893 10.502249
-## 12    12 38.36811  9.940822</code></pre>
-<p>By simply grouping the <code>weather</code> data set by <code>month</code> first and then passing this new data frame into <code>summarize</code> we get a resulting data frame that shows the mean and standard deviation temperature for each month in New York City.</p>
-<p>Another useful function is the <code>n</code> function which gives a count of how many entries appeared in the groupings. Suppose we’d like to get a sense for how many flights departed each of the three airports in New York City:</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">by_origin &lt;-<span class="st"> </span>flights %&gt;%<span class="st"> </span>
-<span class="st">  </span><span class="kw">group_by</span>(origin) %&gt;%<span class="st"> </span>
-<span class="st">  </span><span class="kw">summarize</span>(<span class="dt">count =</span> <span class="kw">n</span>())
-by_origin</code></pre></div>
-<pre><code>## # A tibble: 3 × 2
-##   origin  count
-##    &lt;chr&gt;  &lt;int&gt;
-## 1    EWR 120835
-## 2    JFK 111279
-## 3    LGA 104662</code></pre>
-<p>We see that Newark (<code>&quot;EWR&quot;</code>) had the most flights departing in 2013 followed by <code>&quot;JFK&quot;</code> and lastly by LaGuardia (<code>&quot;LGA&quot;</code>).</p>
-<div class="learncheck">
-<p>
-<strong><em>Learning check</em></strong>
-</p>
-</div>
-<p><strong>(LC5.2)</strong> Recall from Chapter <a href="4-data-visualization-via-ggplot2.html#viz">4</a> when we looked at plots of temperatures by months in NYC. What does the standard deviation column in the <code>summary_tempXmonth</code> data frame tell us about temperatures in New York City throughout the year?</p>
-<p><strong>(LC5.3)</strong> What code would be required to get the mean and standard deviation temperature for each day in 2013 for NYC?</p>
-<p><strong>(LC5.4)</strong> How could we identify how many flights left each of the three airports in each of the months of 2013?</p>
-<hr />
-</div>
-<div id="create-new-variableschange-old-variables-using-mutate" class="section level3">
-<h3><span class="header-section-number">5.2.3</span> Create new variables/change old variables using <code>mutate</code></h3>
-<div class="figure" style="text-align: center"><span id="fig:select"></span>
-<img src="images/mutate.png" alt="Mutate diagram from Data Wrangling with dplyr and tidyr cheatsheet" width="\textwidth" />
-<p class="caption">
-Figure 5.5: Mutate diagram from Data Wrangling with dplyr and tidyr cheatsheet
-</p>
-</div>
-<p>When looking at the <code>flights</code> data set, there are some clear additional variables that could be calculated based on the values of variables already in the data set. Passengers are often frustrated when their flight departs late, but change their mood a bit if pilots can make up some time during the flight to get them to their destination close to when they expected to land. This is commonly referred to as “gain” and we will create this variable using the <code>mutate</code> function. Note that we have also overwritten the <code>flights</code> data frame with what it was before as well as an additional variable <code>gain</code> here.</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">flights &lt;-<span class="st"> </span>flights %&gt;%<span class="st"> </span><span class="kw">mutate</span>(<span class="dt">gain =</span> arr_delay -<span class="st"> </span>dep_delay)</code></pre></div>
-<p>We can now look at summary measures of this <code>gain</code> variable and even plot it in the form of a histogram:</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">gain_summary &lt;-<span class="st"> </span>flights %&gt;%<span class="st"> </span>
-<span class="st">  </span><span class="kw">summarize</span>(
-          <span class="dt">min =</span> <span class="kw">min</span>(gain, <span class="dt">na.rm =</span> <span class="ot">TRUE</span>),
-          <span class="dt">q1 =</span> <span class="kw">quantile</span>(gain, <span class="fl">0.25</span>, <span class="dt">na.rm =</span> <span class="ot">TRUE</span>),
-          <span class="dt">median =</span> <span class="kw">quantile</span>(gain, <span class="fl">0.5</span>, <span class="dt">na.rm =</span> <span class="ot">TRUE</span>),
-          <span class="dt">q3 =</span> <span class="kw">quantile</span>(gain, <span class="fl">0.75</span>, <span class="dt">na.rm =</span> <span class="ot">TRUE</span>),
-          <span class="dt">max =</span> <span class="kw">max</span>(gain, <span class="dt">na.rm =</span> <span class="ot">TRUE</span>),
-          <span class="dt">mean =</span> <span class="kw">mean</span>(gain, <span class="dt">na.rm =</span> <span class="ot">TRUE</span>),
-          <span class="dt">sd =</span> <span class="kw">sd</span>(gain, <span class="dt">na.rm =</span> <span class="ot">TRUE</span>),
-          <span class="dt">missing =</span> <span class="kw">sum</span>(<span class="kw">is.na</span>(gain))
-)
-gain_summary</code></pre></div>
-<pre><code>## # A tibble: 1 × 8
-##     min    q1 median    q3   max      mean       sd missing
-##   &lt;dbl&gt; &lt;dbl&gt;  &lt;dbl&gt; &lt;dbl&gt; &lt;dbl&gt;     &lt;dbl&gt;    &lt;dbl&gt;   &lt;int&gt;
-## 1  -109   -17     -7     3   196 -5.659779 18.04365    9430</code></pre>
-<p>We’ve recreated the <code>summary</code> function we saw in Chapter <a href="4-data-visualization-via-ggplot2.html#viz">4</a> here using the <code>summarize</code> function in <code>dplyr</code>.</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">library</span>(ggplot2)
-<span class="kw">ggplot</span>(<span class="dt">data =</span> flights, <span class="dt">mapping =</span> <span class="kw">aes</span>(<span class="dt">x =</span> gain)) +
-<span class="st">  </span><span class="kw">geom_histogram</span>(<span class="dt">color =</span> <span class="st">&quot;white&quot;</span>, <span class="dt">bins =</span> <span class="dv">20</span>)</code></pre></div>
-<div class="figure" style="text-align: center"><span id="fig:unnamed-chunk-41"></span>
-<img src="ismaykim_files/figure-html/unnamed-chunk-41-1.png" alt="Histogram of gain variable" width="\textwidth" />
-<p class="caption">
-Figure 5.6: Histogram of gain variable
-</p>
-</div>
-<p>We can also create multiple columns at once and even refer to columns that were just created in a new column. Hadley produces one such example in Chapter 5 of “R for Data Science” <span class="citation">(Grolemund and Wickham <a href="#ref-rds2016">2016</a>)</span>:</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">flights_plus &lt;-<span class="st"> </span>flights %&gt;%<span class="st"> </span><span class="kw">mutate</span>(
-  <span class="dt">gain =</span> arr_delay -<span class="st"> </span>dep_delay,
-  <span class="dt">hours =</span> air_time /<span class="st"> </span><span class="dv">60</span>,
-  <span class="dt">gain_per_hour =</span> gain /<span class="st"> </span>hours
-)</code></pre></div>
-<hr />
-<div class="learncheck">
-<p>
-<strong><em>Learning check</em></strong>
-</p>
-</div>
-<p><strong>(LC5.5)</strong> What do positive values of the <code>gain</code> variable in <code>flights_plus</code> correspond to? What about negative values? And what about a zero value?</p>
-<p><strong>(LC5.6)</strong> Could we create the <code>dep_delay</code> and <code>arr_delay</code> columns by simply subtracting <code>dep_time</code> from <code>sched_dep_time</code> and similarly for arrivals? Try the code out and explain any differences between the result and what actually appears in <code>flights</code>.</p>
-<p><strong>(LC5.7)</strong> What can we say about the distribution of <code>gain</code>? Describe it in a few sentences using the plot and the <code>gain_summary</code> data frame values.</p>
-<hr />
-</div>
-<div id="reorder-the-data-frame-using-arrange" class="section level3">
-<h3><span class="header-section-number">5.2.4</span> Reorder the data frame using <code id="arrange">arrange</code></h3>
-<p>As you may have thought about with the data frames we’ve worked with so far in the book, one of the most common things you’d like to do is sort the data frames by a specific column. Have you ever been asked to calculate a median by hand? This requires you to put the data in order from smallest to highest in value. The <code>dplyr</code> package has a function called <code>arrange</code> that we will use to sort/reorder our data according to the values of the specified variable. This is most frequently used after we have used the <code>group_by</code> and <code>summarize</code> functions as we will see.</p>
-<p>Let’s suppose we were interested in determining the most frequent destination airports from New York City in 2013:</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">freq_dest &lt;-<span class="st"> </span>flights %&gt;%<span class="st"> </span>
-<span class="st">  </span><span class="kw">group_by</span>(dest) %&gt;%<span class="st"> </span>
-<span class="st">  </span><span class="kw">summarize</span>(<span class="dt">num_flights =</span> <span class="kw">n</span>())
-freq_dest</code></pre></div>
-<pre><code>## # A tibble: 105 × 2
-##     dest num_flights
-##    &lt;chr&gt;       &lt;int&gt;
-## 1    ABQ         254
-## 2    ACK         265
-## 3    ALB         439
-## 4    ANC           8
-## 5    ATL       17215
-## 6    AUS        2439
-## 7    AVL         275
-## 8    BDL         443
-## 9    BGR         375
-## 10   BHM         297
-## # ... with 95 more rows</code></pre>
-<p>You’ll see that by default the values of <code>dest</code> are displayed in alphabetical order here. Remember to use <code>View()</code> in the R Console to look at all the values of <code>freq_dest</code> in spreadsheet format. We are interested in finding those airports that appear most:</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">freq_dest %&gt;%<span class="st"> </span><span class="kw">arrange</span>(num_flights)</code></pre></div>
-<pre><code>## # A tibble: 105 × 2
-##     dest num_flights
-##    &lt;chr&gt;       &lt;int&gt;
-## 1    LEX           1
-## 2    LGA           1
-## 3    ANC           8
-## 4    SBN          10
-## 5    HDN          15
-## 6    MTJ          15
-## 7    EYW          17
-## 8    PSP          19
-## 9    JAC          25
-## 10   BZN          36
-## # ... with 95 more rows</code></pre>
-<p>This is actually giving us the opposite of what we are looking for. It tells us the least frequent destination airports first. To switch the ordering to be descending instead of ascending we use the <code>desc</code> function:</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">freq_dest %&gt;%<span class="st"> </span><span class="kw">arrange</span>(<span class="kw">desc</span>(num_flights))</code></pre></div>
-<pre><code>## # A tibble: 105 × 2
-##     dest num_flights
-##    &lt;chr&gt;       &lt;int&gt;
-## 1    ORD       17283
-## 2    ATL       17215
-## 3    LAX       16174
-## 4    BOS       15508
-## 5    MCO       14082
-## 6    CLT       14064
-## 7    SFO       13331
-## 8    FLL       12055
-## 9    MIA       11728
-## 10   DCA        9705
-## # ... with 95 more rows</code></pre>
-<hr />
-</div>
-</div>
-<div id="other-verbs" class="section level2">
-<h2><span class="header-section-number">5.3</span> Other verbs</h2>
-<div id="select-variables-using-select" class="section level3">
-<h3><span class="header-section-number">5.3.1</span> Select variables using <code id="select">select</code></h3>
-<div class="figure" style="text-align: center"><span id="fig:selectfig"></span>
-<img src="images/select.png" alt="Select diagram from Data Wrangling with dplyr and tidyr cheatsheet" width="\textwidth" />
-<p class="caption">
-Figure 5.7: Select diagram from Data Wrangling with dplyr and tidyr cheatsheet
-</p>
-</div>
-<p>We’ve seen that the <code>flights</code> data frame in the <code>nycflights13</code> package contains many different variables (19 in fact). You can identify this by running the <code>dim</code> function or the <code>ncol</code> function:</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">data</span>(flights)
-<span class="kw">dim</span>(flights)</code></pre></div>
-<pre><code>## [1] 336776     19</code></pre>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">ncol</span>(flights)</code></pre></div>
-<pre><code>## [1] 19</code></pre>
-<p>One of these variables is <code>year</code>. If you remember the original description of the <code>flights</code> data frame (or by running <code>?flights</code>), you’ll remember that this data correspond to flights in 2013 departing New York City. The <code>year</code> variable isn’t really a variable here in that it doesn’t vary… <code>flights</code> actually comes from a larger data set that covers many years. We may want to remove the <code>year</code> variable from our data set since it won’t be helpful for analysis in this case. To do so easily, we use the <code>select</code> function:</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">flights_small &lt;-<span class="st"> </span>flights %&gt;%<span class="st"> </span><span class="kw">select</span>( -year)
-<span class="kw">names</span>(flights_small)</code></pre></div>
-<pre><code>##  [1] &quot;month&quot;          &quot;day&quot;            &quot;dep_time&quot;       &quot;sched_dep_time&quot;
-##  [5] &quot;dep_delay&quot;      &quot;arr_time&quot;       &quot;sched_arr_time&quot; &quot;arr_delay&quot;     
-##  [9] &quot;carrier&quot;        &quot;flight&quot;         &quot;tailnum&quot;        &quot;origin&quot;        
-## [13] &quot;dest&quot;           &quot;air_time&quot;       &quot;distance&quot;       &quot;hour&quot;          
-## [17] &quot;minute&quot;         &quot;time_hour&quot;</code></pre>
-<p>The <code>names</code> function gives a listing of all the columns in a data frame. We see that <code>year</code> has been removed. This was done using a <code>-</code> in front of the name of the column we’d like to remove.</p>
-<p>We could also select specific columns (instead of deselecting columns) by listing them out:</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">flight_dep_times &lt;-<span class="st"> </span>flights %&gt;%<span class="st"> </span><span class="kw">select</span>(month, day, dep_time, sched_dep_time)
-flight_dep_times</code></pre></div>
-<pre><code>## # A tibble: 336,776 × 4
-##    month   day dep_time sched_dep_time
-##    &lt;int&gt; &lt;int&gt;    &lt;int&gt;          &lt;int&gt;
-## 1      1     1      517            515
-## 2      1     1      533            529
-## 3      1     1      542            540
-## 4      1     1      544            545
-## 5      1     1      554            600
-## 6      1     1      554            558
-## 7      1     1      555            600
-## 8      1     1      557            600
-## 9      1     1      557            600
-## 10     1     1      558            600
-## # ... with 336,766 more rows</code></pre>
-<p>Or we could specify a range of columns:</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">flight_arr_times &lt;-<span class="st"> </span>flights %&gt;%<span class="st"> </span><span class="kw">select</span>(month:day, arr_time:sched_arr_time)
-flight_arr_times</code></pre></div>
-<pre><code>## # A tibble: 336,776 × 4
-##    month   day arr_time sched_arr_time
-##    &lt;int&gt; &lt;int&gt;    &lt;int&gt;          &lt;int&gt;
-## 1      1     1      830            819
-## 2      1     1      850            830
-## 3      1     1      923            850
-## 4      1     1     1004           1022
-## 5      1     1      812            837
-## 6      1     1      740            728
-## 7      1     1      913            854
-## 8      1     1      709            723
-## 9      1     1      838            846
-## 10     1     1      753            745
-## # ... with 336,766 more rows</code></pre>
-<p>The <code>select</code> function can also be used to reorder columns in combination with the <code>everything</code> helper function. Let’s suppose we’d like the <code>hour</code>, <code>minute</code>, and <code>time_hour</code> variables, which appear at the end of the <code>flights</code> data set, to actually appear immediately after the <code>day</code> variable:</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">flights_reorder &lt;-<span class="st"> </span>flights %&gt;%<span class="st"> </span><span class="kw">select</span>(month:day, hour:time_hour, <span class="kw">everything</span>())
-<span class="kw">names</span>(flights_reorder)</code></pre></div>
-<pre><code>##  [1] &quot;month&quot;          &quot;day&quot;            &quot;hour&quot;           &quot;minute&quot;        
-##  [5] &quot;time_hour&quot;      &quot;year&quot;           &quot;dep_time&quot;       &quot;sched_dep_time&quot;
-##  [9] &quot;dep_delay&quot;      &quot;arr_time&quot;       &quot;sched_arr_time&quot; &quot;arr_delay&quot;     
-## [13] &quot;carrier&quot;        &quot;flight&quot;         &quot;tailnum&quot;        &quot;origin&quot;        
-## [17] &quot;dest&quot;           &quot;air_time&quot;       &quot;distance&quot;</code></pre>
-<p>Lastly, the helper functions <code>starts_with</code>, <code>ends_with</code>, and <code>contains</code> can be used to choose column names that match those conditions:</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">flights_begin_a &lt;-<span class="st"> </span>flights %&gt;%<span class="st"> </span><span class="kw">select</span>(<span class="kw">starts_with</span>(<span class="st">&quot;a&quot;</span>))
-flights_begin_a</code></pre></div>
-<pre><code>## # A tibble: 336,776 × 3
-##    arr_time arr_delay air_time
-##       &lt;int&gt;     &lt;dbl&gt;    &lt;dbl&gt;
-## 1       830        11      227
-## 2       850        20      227
-## 3       923        33      160
-## 4      1004       -18      183
-## 5       812       -25      116
-## 6       740        12      150
-## 7       913        19      158
-## 8       709       -14       53
-## 9       838        -8      140
-## 10      753         8      138
-## # ... with 336,766 more rows</code></pre>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">flights_delays &lt;-<span class="st"> </span>flights %&gt;%<span class="st"> </span><span class="kw">select</span>(<span class="kw">ends_with</span>(<span class="st">&quot;delay&quot;</span>))
-flights_delays</code></pre></div>
-<pre><code>## # A tibble: 336,776 × 2
-##    dep_delay arr_delay
-##        &lt;dbl&gt;     &lt;dbl&gt;
-## 1          2        11
-## 2          4        20
-## 3          2        33
-## 4         -1       -18
-## 5         -6       -25
-## 6         -4        12
-## 7         -5        19
-## 8         -3       -14
-## 9         -3        -8
-## 10        -2         8
-## # ... with 336,766 more rows</code></pre>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">flights_time &lt;-<span class="st"> </span>flights %&gt;%<span class="st"> </span><span class="kw">select</span>(<span class="kw">contains</span>(<span class="st">&quot;time&quot;</span>))
-flights_time</code></pre></div>
-<pre><code>## # A tibble: 336,776 × 6
-##    dep_time sched_dep_time arr_time sched_arr_time air_time
-##       &lt;int&gt;          &lt;int&gt;    &lt;int&gt;          &lt;int&gt;    &lt;dbl&gt;
-## 1       517            515      830            819      227
-## 2       533            529      850            830      227
-## 3       542            540      923            850      160
-## 4       544            545     1004           1022      183
-## 5       554            600      812            837      116
-## 6       554            558      740            728      150
-## 7       555            600      913            854      158
-## 8       557            600      709            723       53
-## 9       557            600      838            846      140
-## 10      558            600      753            745      138
-## # ... with 336,766 more rows, and 1 more variables: time_hour &lt;dttm&gt;</code></pre>
-</div>
-<div id="rename-variables-using-rename" class="section level3">
-<h3><span class="header-section-number">5.3.2</span> Rename variables using <code id="rename">rename</code></h3>
-<p>Another useful function is <code>rename</code>, which as you may suspect renames one column to another name. Suppose we wanted <code>dep_time</code> and <code>arr_time</code> to be <code>departure_time</code> and <code>arrival_time</code> instead in the <code>flights_time</code> data frame:</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">flights_time &lt;-<span class="st"> </span>flights_time %&gt;%<span class="st"> </span>
-<span class="st">  </span><span class="kw">rename</span>(<span class="dt">departure_time =</span> dep_time,
-         <span class="dt">arrival_time =</span> arr_time)
-<span class="kw">names</span>(flights_time)</code></pre></div>
-<pre><code>## [1] &quot;departure_time&quot; &quot;sched_dep_time&quot; &quot;arrival_time&quot;   &quot;sched_arr_time&quot;
-## [5] &quot;air_time&quot;       &quot;time_hour&quot;</code></pre>
-<p>It’s easy to forget if the new name comes before or after the equals sign. I usually remember this as “New Before, Old After” or NBOA.</p>
-<p>You’ll receive an error if you try to do it the other way:</p>
-<pre><code>Error: Unknown variables: departure_time, arrival_time.</code></pre>
-<hr />
-<div class="learncheck">
-<p>
-<strong><em>Learning check</em></strong>
-</p>
-</div>
-<p><strong>(LC5.8)</strong> What are some ways to select all three of the <code>dest</code>, <code>air_time</code>, and <code>distance</code> variables from <code>flights</code>? Give the code showing how to do this in at least three different ways.</p>
-<p><strong>(LC5.9)</strong> How could one use <code>starts_with</code>, <code>ends_with</code>, and <code>contains</code> to select columns from the <code>flights</code> data frame? Provide three different examples in total: one for <code>starts_with</code>, one for <code>ends_with</code>, and one for <code>contains</code>.</p>
-<p><strong>(LC5.10)</strong> Why might we want to use the <code>select</code> function on a data frame?</p>
-<hr />
-</div>
-<div id="find-the-top-number-of-values-using-top_n" class="section level3">
-<h3><span class="header-section-number">5.3.3</span> Find the top number of values using <code>top_n</code></h3>
-<p>We can also use the <code>top_n</code> function which automatically tells us the most frequent <code>num_flights</code>. We specify the top 10 airports here:</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">freq_dest %&gt;%<span class="st"> </span><span class="kw">top_n</span>(<span class="dt">n =</span> <span class="dv">10</span>, <span class="dt">wt =</span> num_flights)</code></pre></div>
-<pre><code>## # A tibble: 10 × 2
-##     dest num_flights
-##    &lt;chr&gt;       &lt;int&gt;
-## 1    ATL       17215
-## 2    BOS       15508
-## 3    CLT       14064
-## 4    DCA        9705
-## 5    FLL       12055
-## 6    LAX       16174
-## 7    MCO       14082
-## 8    MIA       11728
-## 9    ORD       17283
-## 10   SFO       13331</code></pre>
-<p>We’ll still need to arrange this by <code>num_flights</code> though:</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">freq_dest %&gt;%<span class="st"> </span><span class="kw">top_n</span>(<span class="dt">n =</span> <span class="dv">10</span>, <span class="dt">wt =</span> num_flights) %&gt;%<span class="st"> </span>
-<span class="st">  </span><span class="kw">arrange</span>(<span class="kw">desc</span>(num_flights))</code></pre></div>
-<pre><code>## # A tibble: 10 × 2
-##     dest num_flights
-##    &lt;chr&gt;       &lt;int&gt;
-## 1    ORD       17283
-## 2    ATL       17215
-## 3    LAX       16174
-## 4    BOS       15508
-## 5    MCO       14082
-## 6    CLT       14064
-## 7    SFO       13331
-## 8    FLL       12055
-## 9    MIA       11728
-## 10   DCA        9705</code></pre>
-<p><strong>Note:</strong> Remember that I didn’t pull the <code>n</code> and <code>wt</code> arguments out of thin air. They can be found by using the <code>?</code> function on <code>top_n</code>.</p>
-<p>We can go one step further and tie together the <code>group_by</code> and <code>summarize</code> functions we used to find the most frequent flights:</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">ten_freq_dests &lt;-<span class="st"> </span>flights %&gt;%
-<span class="st">  </span><span class="kw">group_by</span>(dest) %&gt;%
-<span class="st">  </span><span class="kw">summarize</span>(<span class="dt">num_flights =</span> <span class="kw">n</span>()) %&gt;%
-<span class="st">  </span><span class="kw">top_n</span>(<span class="dt">n =</span> <span class="dv">10</span>) %&gt;%
-<span class="st">  </span><span class="kw">arrange</span>(<span class="kw">desc</span>(num_flights))</code></pre></div>
-<pre><code>## Selecting by num_flights</code></pre>
-<div class="learncheck">
-<p>
-<strong><em>Learning check</em></strong>
-</p>
-</div>
-<p><strong><code>paste0(&quot;(LC&quot;, chap, &quot;.&quot;, (lc &lt;- lc + 1), &quot;)&quot;)</code></strong> Create a new data frame that shows the top 5 airports with the largest arrival delays from NYC in 2013.</p>
-</div>
-</div>
-<div id="joiningmerging-data-frames" class="section level2">
-<h2><span class="header-section-number">5.4</span> Joining/merging data frames</h2>
-<p>Something you may have thought to yourself as you looked at the most frequent destinations of flights from NYC in 2013 is:</p>
-<ul>
-<li>“What cities are these airports in?”</li>
-<li>“Is <code>&quot;ORD&quot;</code> Orlando?”</li>
-<li>“Where is <code>&quot;FLL&quot;</code>?”</li>
-</ul>
-<p>The <code>nycflights13</code> data package contains multiple data frames. Instead of having to manually look up different values of airport names corresponding to airport codes like <code>ORD</code>, we can have R automatically do this “looking up” for us. To do so, we’ll need to tell R how to match one data frame to another data frame. Let’s first check out the <code>airports</code> data frame inside of R:</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">View</span>(airports)</code></pre></div>
-<p>The first column <code>faa</code> corresponds to the airport codes that we saw in <code>dest</code> in our <code>flights</code> and subsequent <code>ten_freq_dests</code> data sets. Hadley and Garrett <span class="citation">(Grolemund and Wickham <a href="#ref-rds2016">2016</a>)</span> created the following diagram to help us understand how the different data sets are linked:</p>
-<div class="figure" style="text-align: center"><span id="fig:reldiagram"></span>
-<img src="images/relational-nycflights.png" alt="Data relationships in nycflights13 from R for Data Science" width="\textwidth" />
-<p class="caption">
-Figure 5.8: Data relationships in nycflights13 from R for Data Science
-</p>
-</div>
-<p>We see from <code>View(airports)</code> that <code>airports</code> contains a lot of other information about 1458 airports. We are only really interested here in the <code>faa</code> and <code>name</code> columns. Let’s use the <code>select</code> function to only use those variables:</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">airports_small &lt;-<span class="st"> </span>airports %&gt;%<span class="st"> </span><span class="kw">select</span>(faa, name)</code></pre></div>
-<p>So if we identify the names of the airports we can use the <code>inner_join</code> function to bring two different data frames together. Note that we will also rename the subsequent column <code>name</code> as <code>airport_name</code>:</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">named_freq_dests &lt;-<span class="st"> </span>ten_freq_dests %&gt;%
-<span class="st">  </span><span class="kw">inner_join</span>(airports_small, <span class="dt">by =</span> <span class="kw">c</span>(<span class="st">&quot;dest&quot;</span> =<span class="st"> &quot;faa&quot;</span>)) %&gt;%
-<span class="st">  </span><span class="kw">rename</span>(<span class="dt">airport_name =</span> name)
-named_freq_dests</code></pre></div>
-<pre><code>## # A tibble: 10 × 3
-##     dest num_flights                       airport_name
-##    &lt;chr&gt;       &lt;int&gt;                              &lt;chr&gt;
-## 1    ORD       17283                 Chicago Ohare Intl
-## 2    ATL       17215    Hartsfield Jackson Atlanta Intl
-## 3    LAX       16174                   Los Angeles Intl
-## 4    BOS       15508 General Edward Lawrence Logan Intl
-## 5    MCO       14082                       Orlando Intl
-## 6    CLT       14064             Charlotte Douglas Intl
-## 7    SFO       13331                 San Francisco Intl
-## 8    FLL       12055     Fort Lauderdale Hollywood Intl
-## 9    MIA       11728                         Miami Intl
-## 10   DCA        9705      Ronald Reagan Washington Natl</code></pre>
-<p>In case you didn’t know, <code>&quot;ORD&quot;</code> is the airport code of Chicago O’Hare airport and <code>&quot;FLL&quot;</code> is the main airport in Fort Lauderdale, Florida, which we can now see in our <code>named_freq_dests</code> data frame.</p>
-<p>A visual representation of the <code>inner_join</code> is given below <span class="citation">(Grolemund and Wickham <a href="#ref-rds2016">2016</a>)</span>:</p>
-<div class="figure" style="text-align: center"><span id="fig:ijdiagram"></span>
-<img src="images/join-inner.png" alt="Diagram of inner join from R for Data Science" width="\textwidth" />
-<p class="caption">
-Figure 5.9: Diagram of inner join from R for Data Science
-</p>
-</div>
-<p>There are more complex joins available, but the <code>inner_join</code> will solve nearly all of the problems you’ll face in our experience.</p>
-<hr />
-<div class="learncheck">
-<p>
-<strong><em>Learning check</em></strong>
-</p>
-</div>
-<p><strong>(LC5.11)</strong> What happens when you try to <code>inner_join</code> the <code>ten_freq_dests</code> data frame with <code>airports</code> instead of <code>airports_small</code>? How might one use this result to answer further questions about the top 10 destinations?</p>
-<p><strong>(LC5.12)</strong> What surprises you about the top 10 destinations from NYC in 2013?</p>
-<hr />
-<p>As we saw with the RStudio cheatsheet on <a href="https://www.rstudio.com/wp-content/uploads/2015/12/ggplot2-cheatsheet-2.0.pdf">data visualization</a>, RStudio has also created a cheatsheet for data manipulation entitled “Data Wrangling with dplyr and tidyr” available <a href="https://www.rstudio.com/wp-content/uploads/2015/02/data-wrangling-cheatsheet.pdf">here</a>. We will focus only on the <code>dplyr</code> functions in this book, but you are encouraged to also explore <code>tidyr</code> if you are presented with data that is not in the tidy format that we have specified as the preferred option for our purposes.</p>
-</div>
-<div id="script-of-r-code-1" class="section level2">
-<h2><span class="header-section-number">5.5</span> Script of R code</h2>
-<p>An R script file of all R code used in this chapter is available <a href="http://ismayc.github.io/moderndiver-book/05-manip.R">here</a>.</p>
-</div>
-<div id="whats-to-come-2" class="section level2">
-<h2><span class="header-section-number">5.6</span> What’s to come?</h2>
-<p>This concludes the <strong>Data Exploration</strong> unit of this book. You should be pretty proficient in both plotting variables (or multiple variables together) in various data sets and manipulating data as we’ve done in this chapter. You are encouraged to step back through the code in earlier chapters and make changes as you see fit based on your updated knowledge.</p>
-<p>In Chapter <a href="6-simulating-randomness-via-mosaic.html#sim"><strong>??</strong></a>, we’ll begin to build the pieces needed to understand how this unit of <strong>Data Exploration</strong> can tie into statistical inference in the <strong>Inference</strong> part of the book. Remember that the focus throughout is on data visualization and we’ll see that next when we discuss sampling, resampling, and bootstrapping. These ideas will lead us into hypothesis testing and confidence intervals.</p>
-
-</div>
-</div>
-
-
-
-<h3>References</h3>
-<div id="refs" class="references">
-<div id="ref-R-dplyr">
-<p>Wickham, Hadley, and Romain Francois. 2016. <em>Dplyr: A Grammar of Data Manipulation</em>. <a href="https://CRAN.R-project.org/package=dplyr" class="uri">https://CRAN.R-project.org/package=dplyr</a>.</p>
-</div>
-<div id="ref-rds2016">
-<p>Grolemund, Garrett, and Hadley Wickham. 2016. <em>R for Data Science</em>. <a href="http://r4ds.had.co.nz/" class="uri">http://r4ds.had.co.nz/</a>.</p>
-</div>
-</div>
-            </section>
-
-          </div>
-        </div>
-      </div>
-<a href="4-data-visualization-via-ggplot2.html" class="navigation navigation-prev " aria-label="Previous page"><i class="fa fa-angle-left"></i></a>
-<a href="6-simulating-randomness-via-mosaic.html" class="navigation navigation-next " aria-label="Next page""><i class="fa fa-angle-right"></i></a>
-
-<script src="libs/gitbook-2.6.7/js/app.min.js"></script>
-<script src="libs/gitbook-2.6.7/js/lunr.js"></script>
-<script src="libs/gitbook-2.6.7/js/plugin-search.js"></script>
-<script src="libs/gitbook-2.6.7/js/plugin-sharing.js"></script>
-<script src="libs/gitbook-2.6.7/js/plugin-fontsettings.js"></script>
-<script src="libs/gitbook-2.6.7/js/plugin-bookdown.js"></script>
-<script src="libs/gitbook-2.6.7/js/jquery.highlight.js"></script>
-<script>
-require(["gitbook"], function(gitbook) {
-gitbook.start({
-"sharing": {
-"github": false,
-"facebook": true,
-"twitter": true,
-"google": false,
-"weibo": false,
-"instapper": false,
-"vk": false,
-"all": ["facebook", "google", "twitter", "weibo", "instapaper"]
-},
-"fontsettings": {
-"theme": "white",
-"family": "sans",
-"size": 2
-},
-"edit": {
-"link": "https://github.com/ismayc/moderndiver-source/edit/master/05-manip.Rmd",
-"text": "Edit"
-},
-"download": ["ismaykim.pdf"],
-"toc": {
-"collapse": "section",
-"scroll_highlight": true
-}
-});
-});
-</script>
-
-<!-- dynamically load mathjax for compatibility with self-contained -->
-<script>
-  (function () {
-    var script = document.createElement("script");
-    script.type = "text/javascript";
-    script.src  = "https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML";
-    if (location.protocol !== "file:" && /^https?:/.test(script.src))
-      script.src  = script.src.replace(/^https?:/, '');
-    document.getElementsByTagName("head")[0].appendChild(script);
-  })();
-</script>
-</body>
-
-</html>
diff --git a/docs/5-manip.html b/docs/5-manip.html
new file mode 100644
index 000000000..0d3e2a542
--- /dev/null
+++ b/docs/5-manip.html
@@ -0,0 +1,982 @@
+<!DOCTYPE html>
+<html lang="en">
+
+<head>
+
+  <meta charset="UTF-8">
+  <meta http-equiv="X-UA-Compatible" content="IE=edge">
+  <title>ModernDive</title>
+  <meta content="text/html; charset=UTF-8" http-equiv="Content-Type">
+  <meta name="description" content="Combining statistical and computational thinking to make sense of data. An evolution of the traditional introductory statistics curriculum, more focused on reproducible research, data visualization, and modern data analysis techniques and tools including resampling and bootstrapping using R, RStudio, and R Markdown">
+  <meta name="generator" content="bookdown 0.3 and GitBook 2.6.7">
+
+  <meta property="og:title" content="ModernDive" />
+  <meta property="og:type" content="book" />
+  
+  
+  <meta property="og:description" content="Combining statistical and computational thinking to make sense of data. An evolution of the traditional introductory statistics curriculum, more focused on reproducible research, data visualization, and modern data analysis techniques and tools including resampling and bootstrapping using R, RStudio, and R Markdown" />
+  <meta name="github-repo" content="ismayc/moderndiver-book" />
+
+  <meta name="twitter:card" content="summary" />
+  <meta name="twitter:title" content="ModernDive" />
+  
+  <meta name="twitter:description" content="Combining statistical and computational thinking to make sense of data. An evolution of the traditional introductory statistics curriculum, more focused on reproducible research, data visualization, and modern data analysis techniques and tools including resampling and bootstrapping using R, RStudio, and R Markdown" />
+  
+
+<meta name="author" content="Chester Ismay and Albert Y. Kim">
+
+
+<meta name="date" content="2017-01-10">
+
+  <meta name="viewport" content="width=device-width, initial-scale=1">
+  <meta name="apple-mobile-web-app-capable" content="yes">
+  <meta name="apple-mobile-web-app-status-bar-style" content="black">
+  
+  
+<link rel="prev" href="4-viz.html">
+<link rel="next" href="6-sim.html">
+
+<script src="libs/jquery-2.2.3/jquery.min.js"></script>
+<link href="libs/gitbook-2.6.7/css/style.css" rel="stylesheet" />
+<link href="libs/gitbook-2.6.7/css/plugin-bookdown.css" rel="stylesheet" />
+<link href="libs/gitbook-2.6.7/css/plugin-highlight.css" rel="stylesheet" />
+<link href="libs/gitbook-2.6.7/css/plugin-search.css" rel="stylesheet" />
+<link href="libs/gitbook-2.6.7/css/plugin-fontsettings.css" rel="stylesheet" />
+
+
+
+
+
+
+
+<script src="libs/htmlwidgets-0.8/htmlwidgets.js"></script>
+<link href="libs/dygraphs-1.1.1/dygraph.css" rel="stylesheet" />
+<script src="libs/dygraphs-1.1.1/dygraph-combined.js"></script>
+<script src="libs/moment-2.8.4/moment.js"></script>
+<script src="libs/moment-timezone-0.2.5/moment-timezone-with-data.js"></script>
+<script src="libs/moment-fquarter-1.0.0/moment-fquarter.min.js"></script>
+<script src="libs/dygraphs-binding-1.1.1.4/dygraphs.js"></script>
+<script>
+  (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
+  (i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
+  m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
+  })(window,document,'script','https://www.google-analytics.com/analytics.js','ga');
+
+  ga('create', 'UA-89938436-1', 'auto');
+  ga('send', 'pageview');
+
+</script>
+
+
+<style type="text/css">
+div.sourceCode { overflow-x: auto; }
+table.sourceCode, tr.sourceCode, td.lineNumbers, td.sourceCode {
+  margin: 0; padding: 0; vertical-align: baseline; border: none; }
+table.sourceCode { width: 100%; line-height: 100%; }
+td.lineNumbers { text-align: right; padding-right: 4px; padding-left: 4px; color: #aaaaaa; border-right: 1px solid #aaaaaa; }
+td.sourceCode { padding-left: 5px; }
+code > span.kw { color: #007020; font-weight: bold; } /* Keyword */
+code > span.dt { color: #902000; } /* DataType */
+code > span.dv { color: #40a070; } /* DecVal */
+code > span.bn { color: #40a070; } /* BaseN */
+code > span.fl { color: #40a070; } /* Float */
+code > span.ch { color: #4070a0; } /* Char */
+code > span.st { color: #4070a0; } /* String */
+code > span.co { color: #60a0b0; font-style: italic; } /* Comment */
+code > span.ot { color: #007020; } /* Other */
+code > span.al { color: #ff0000; font-weight: bold; } /* Alert */
+code > span.fu { color: #06287e; } /* Function */
+code > span.er { color: #ff0000; font-weight: bold; } /* Error */
+code > span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } /* Warning */
+code > span.cn { color: #880000; } /* Constant */
+code > span.sc { color: #4070a0; } /* SpecialChar */
+code > span.vs { color: #4070a0; } /* VerbatimString */
+code > span.ss { color: #bb6688; } /* SpecialString */
+code > span.im { } /* Import */
+code > span.va { color: #19177c; } /* Variable */
+code > span.cf { color: #007020; font-weight: bold; } /* ControlFlow */
+code > span.op { color: #666666; } /* Operator */
+code > span.bu { } /* BuiltIn */
+code > span.ex { } /* Extension */
+code > span.pp { color: #bc7a00; } /* Preprocessor */
+code > span.at { color: #7d9029; } /* Attribute */
+code > span.do { color: #ba2121; font-style: italic; } /* Documentation */
+code > span.an { color: #60a0b0; font-weight: bold; font-style: italic; } /* Annotation */
+code > span.cv { color: #60a0b0; font-weight: bold; font-style: italic; } /* CommentVar */
+code > span.in { color: #60a0b0; font-weight: bold; font-style: italic; } /* Information */
+</style>
+
+<link rel="stylesheet" href="style.css" type="text/css" />
+</head>
+
+<body>
+
+
+  <div class="book without-animation with-summary font-size-2 font-family-1" data-basepath=".">
+
+    <div class="book-summary">
+      <nav role="navigation">
+
+<ul class="summary">
+<li class="chapter" data-level="1" data-path="index.html"><a href="index.html"><i class="fa fa-check"></i><b>1</b> Preamble</a><ul>
+<li class="chapter" data-level="1.1" data-path="index.html"><a href="index.html#principles-of-this-book---for-instructors"><i class="fa fa-check"></i><b>1.1</b> Principles of this Book - For Instructors</a></li>
+<li class="chapter" data-level="1.2" data-path="index.html"><a href="index.html#contribute"><i class="fa fa-check"></i><b>1.2</b> Contribute</a></li>
+<li class="chapter" data-level="1.3" data-path="index.html"><a href="index.html#getting-started---for-students"><i class="fa fa-check"></i><b>1.3</b> Getting Started - For Students</a></li>
+<li class="chapter" data-level="" data-path="index.html"><a href="index.html#colophon"><i class="fa fa-check"></i>Colophon</a></li>
+</ul></li>
+<li class="chapter" data-level="2" data-path="2-intro.html"><a href="2-intro.html"><i class="fa fa-check"></i><b>2</b> Introduction</a><ul>
+<li class="chapter" data-level="2.1" data-path="2-intro.html"><a href="2-intro.html#preamble-1"><i class="fa fa-check"></i><b>2.1</b> Preamble</a></li>
+<li class="chapter" data-level="2.2" data-path="2-intro.html"><a href="2-intro.html#three-driving-data-sources"><i class="fa fa-check"></i><b>2.2</b> Three driving data sources</a></li>
+<li class="chapter" data-level="2.3" data-path="2-intro.html"><a href="2-intro.html#datascience-pipeline"><i class="fa fa-check"></i><b>2.3</b> Data/science pipeline</a></li>
+<li class="chapter" data-level="2.4" data-path="2-intro.html"><a href="2-intro.html#reproducibility"><i class="fa fa-check"></i><b>2.4</b> Reproducibility</a></li>
+<li class="chapter" data-level="2.5" data-path="2-intro.html"><a href="2-intro.html#who-is-this-book-for"><i class="fa fa-check"></i><b>2.5</b> Who is this book for?</a></li>
+</ul></li>
+<li class="part"><span><b>I Data Exploration</b></span></li>
+<li class="chapter" data-level="3" data-path="3-tidy.html"><a href="3-tidy.html"><i class="fa fa-check"></i><b>3</b> Tidy Data</a><ul>
+<li class="chapter" data-level="" data-path="3-tidy.html"><a href="3-tidy.html#needed-packages"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="3.1" data-path="3-tidy.html"><a href="3-tidy.html#what-is-tidy-data"><i class="fa fa-check"></i><b>3.1</b> What is tidy data?</a></li>
+<li class="chapter" data-level="3.2" data-path="3-tidy.html"><a href="3-tidy.html#datasets-in-the-nycflights13-package"><i class="fa fa-check"></i><b>3.2</b> Datasets in the <code>nycflights13</code> package</a></li>
+<li class="chapter" data-level="3.3" data-path="3-tidy.html"><a href="3-tidy.html#how-is-flights-tidy"><i class="fa fa-check"></i><b>3.3</b> How is <code>flights</code> tidy?</a><ul>
+<li class="chapter" data-level="3.3.1" data-path="3-tidy.html"><a href="3-tidy.html#identification-variables"><i class="fa fa-check"></i><b>3.3.1</b> Identification variables</a></li>
+</ul></li>
+<li class="chapter" data-level="3.4" data-path="3-tidy.html"><a href="3-tidy.html#normal-forms-of-data"><i class="fa fa-check"></i><b>3.4</b> Normal forms of data</a></li>
+<li class="chapter" data-level="3.5" data-path="3-tidy.html"><a href="3-tidy.html#whats-to-come"><i class="fa fa-check"></i><b>3.5</b> What’s to come?</a></li>
+</ul></li>
+<li class="chapter" data-level="4" data-path="4-viz.html"><a href="4-viz.html"><i class="fa fa-check"></i><b>4</b> Data Visualization via ggplot2</a><ul>
+<li class="chapter" data-level="" data-path="4-viz.html"><a href="4-viz.html#needed-packages-1"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="4.1" data-path="4-viz.html"><a href="4-viz.html#grammarofgraphics"><i class="fa fa-check"></i><b>4.1</b> The Grammar of Graphics</a><ul>
+<li class="chapter" data-level="4.1.1" data-path="4-viz.html"><a href="4-viz.html#components-of-grammar"><i class="fa fa-check"></i><b>4.1.1</b> Components of Grammar</a></li>
+<li class="chapter" data-level="4.1.2" data-path="4-viz.html"><a href="4-viz.html#napoleans-march-on-moscow"><i class="fa fa-check"></i><b>4.1.2</b> Napoleon’s March on Moscow</a></li>
+<li class="chapter" data-level="4.1.3" data-path="4-viz.html"><a href="4-viz.html#other-components-of-the-grammar"><i class="fa fa-check"></i><b>4.1.3</b> Other Components of the Grammar</a></li>
+<li class="chapter" data-level="4.1.4" data-path="4-viz.html"><a href="4-viz.html#the-ggplot2-package"><i class="fa fa-check"></i><b>4.1.4</b> The ggplot2 Package</a></li>
+</ul></li>
+<li class="chapter" data-level="4.2" data-path="4-viz.html"><a href="4-viz.html#five-named-graphs---the-5ng"><i class="fa fa-check"></i><b>4.2</b> Five Named Graphs - The 5NG</a></li>
+<li class="chapter" data-level="4.3" data-path="4-viz.html"><a href="4-viz.html#scatterplots"><i class="fa fa-check"></i><b>4.3</b> 5NG#1: Scatter-plots</a><ul>
+<li class="chapter" data-level="4.3.1" data-path="4-viz.html"><a href="4-viz.html#geompoint"><i class="fa fa-check"></i><b>4.3.1</b> Scatter-plots via geom_point</a></li>
+<li class="chapter" data-level="4.3.2" data-path="4-viz.html"><a href="4-viz.html#over-plotting"><i class="fa fa-check"></i><b>4.3.2</b> Over-Plotting</a></li>
+<li class="chapter" data-level="4.3.3" data-path="4-viz.html"><a href="4-viz.html#summary"><i class="fa fa-check"></i><b>4.3.3</b> Summary</a></li>
+</ul></li>
+<li class="chapter" data-level="4.4" data-path="4-viz.html"><a href="4-viz.html#linegraphs"><i class="fa fa-check"></i><b>4.4</b> 5NG#2: Line-graphs</a><ul>
+<li class="chapter" data-level="4.4.1" data-path="4-viz.html"><a href="4-viz.html#geomline"><i class="fa fa-check"></i><b>4.4.1</b> Line-graphs via geom_line</a></li>
+<li class="chapter" data-level="4.4.2" data-path="4-viz.html"><a href="4-viz.html#summary-1"><i class="fa fa-check"></i><b>4.4.2</b> Summary</a></li>
+</ul></li>
+<li class="chapter" data-level="4.5" data-path="4-viz.html"><a href="4-viz.html#histograms"><i class="fa fa-check"></i><b>4.5</b> 5NG#3: Histograms</a><ul>
+<li class="chapter" data-level="4.5.1" data-path="4-viz.html"><a href="4-viz.html#geomhistogram"><i class="fa fa-check"></i><b>4.5.1</b> Histograms via geom_histogram</a></li>
+<li class="chapter" data-level="4.5.2" data-path="4-viz.html"><a href="4-viz.html#adjustbins"><i class="fa fa-check"></i><b>4.5.2</b> Adjusting the Bins</a></li>
+<li class="chapter" data-level="4.5.3" data-path="4-viz.html"><a href="4-viz.html#summary-2"><i class="fa fa-check"></i><b>4.5.3</b> Summary</a></li>
+</ul></li>
+<li class="chapter" data-level="4.6" data-path="4-viz.html"><a href="4-viz.html#facets"><i class="fa fa-check"></i><b>4.6</b> Facets</a></li>
+<li class="chapter" data-level="4.7" data-path="4-viz.html"><a href="4-viz.html#ng4-boxplots"><i class="fa fa-check"></i><b>4.7</b> 5NG#4: Boxplots</a><ul>
+<li class="chapter" data-level="4.7.1" data-path="4-viz.html"><a href="4-viz.html#geomboxplot"><i class="fa fa-check"></i><b>4.7.1</b> Boxplots via geom_boxplot</a></li>
+<li class="chapter" data-level="4.7.2" data-path="4-viz.html"><a href="4-viz.html#summary-3"><i class="fa fa-check"></i><b>4.7.2</b> Summary</a></li>
+</ul></li>
+<li class="chapter" data-level="4.8" data-path="4-viz.html"><a href="4-viz.html#geombar"><i class="fa fa-check"></i><b>4.8</b> 5NG#5: Barplots</a><ul>
+<li class="chapter" data-level="4.8.1" data-path="4-viz.html"><a href="4-viz.html#barplots-via-geom_bar"><i class="fa fa-check"></i><b>4.8.1</b> Barplots via geom_bar</a></li>
+<li class="chapter" data-level="4.8.2" data-path="4-viz.html"><a href="4-viz.html#must-avoid-pie-charts"><i class="fa fa-check"></i><b>4.8.2</b> Must avoid pie charts!</a></li>
+<li class="chapter" data-level="4.8.3" data-path="4-viz.html"><a href="4-viz.html#using-barplots-to-compare-two-variables"><i class="fa fa-check"></i><b>4.8.3</b> Using barplots to compare two variables</a></li>
+<li class="chapter" data-level="4.8.4" data-path="4-viz.html"><a href="4-viz.html#summary-4"><i class="fa fa-check"></i><b>4.8.4</b> Summary</a></li>
+</ul></li>
+<li class="chapter" data-level="4.9" data-path="4-viz.html"><a href="4-viz.html#conclusion"><i class="fa fa-check"></i><b>4.9</b> Conclusion</a><ul>
+<li class="chapter" data-level="4.9.1" data-path="4-viz.html"><a href="4-viz.html#resources"><i class="fa fa-check"></i><b>4.9.1</b> Resources</a></li>
+<li class="chapter" data-level="4.9.2" data-path="4-viz.html"><a href="4-viz.html#script-of-r-code"><i class="fa fa-check"></i><b>4.9.2</b> Script of R code</a></li>
+<li class="chapter" data-level="4.9.3" data-path="4-viz.html"><a href="4-viz.html#whats-to-come-1"><i class="fa fa-check"></i><b>4.9.3</b> What’s to come?</a></li>
+</ul></li>
+</ul></li>
+<li class="chapter" data-level="5" data-path="5-manip.html"><a href="5-manip.html"><i class="fa fa-check"></i><b>5</b> Data Manipulation via dplyr</a><ul>
+<li class="chapter" data-level="" data-path="5-manip.html"><a href="5-manip.html#needed-packages-2"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="5.1" data-path="5-manip.html"><a href="5-manip.html#the-pipe"><i class="fa fa-check"></i><b>5.1</b> The pipe <code>%&gt;%</code></a></li>
+<li class="chapter" data-level="5.2" data-path="5-manip.html"><a href="5-manip.html#five-main-verbs---the-5mv"><i class="fa fa-check"></i><b>5.2</b> Five Main Verbs - The 5MV</a><ul>
+<li class="chapter" data-level="5.2.1" data-path="5-manip.html"><a href="5-manip.html#filter"><i class="fa fa-check"></i><b>5.2.1</b> 5MV#1: Filter observations using filter</a></li>
+<li class="chapter" data-level="5.2.2" data-path="5-manip.html"><a href="5-manip.html#mv2-summarize-variables-using-summarize"><i class="fa fa-check"></i><b>5.2.2</b> 5MV#2: Summarize variables using summarize</a></li>
+<li class="chapter" data-level="5.2.3" data-path="5-manip.html"><a href="5-manip.html#mv3-group-rows-using-group_by"><i class="fa fa-check"></i><b>5.2.3</b> 5MV#3: Group rows using group_by</a></li>
+<li class="chapter" data-level="5.2.4" data-path="5-manip.html"><a href="5-manip.html#mv4-create-new-variableschange-old-variables-using-mutate"><i class="fa fa-check"></i><b>5.2.4</b> 5MV#4: Create new variables/change old variables using mutate</a></li>
+<li class="chapter" data-level="5.2.5" data-path="5-manip.html"><a href="5-manip.html#arrange"><i class="fa fa-check"></i><b>5.2.5</b> 5MV#5: Reorder the data frame using arrange</a></li>
+</ul></li>
+<li class="chapter" data-level="5.3" data-path="5-manip.html"><a href="5-manip.html#joining-data-frames"><i class="fa fa-check"></i><b>5.3</b> Joining data frames</a><ul>
+<li class="chapter" data-level="5.3.1" data-path="5-manip.html"><a href="5-manip.html#joining-by-key-variables"><i class="fa fa-check"></i><b>5.3.1</b> Joining by Key Variables</a></li>
+<li class="chapter" data-level="5.3.2" data-path="5-manip.html"><a href="5-manip.html#joining-by-key-variables-with-different-names"><i class="fa fa-check"></i><b>5.3.2</b> Joining by Key Variables with Different Names</a></li>
+</ul></li>
+<li class="chapter" data-level="5.4" data-path="5-manip.html"><a href="5-manip.html#optional-other-verbs"><i class="fa fa-check"></i><b>5.4</b> Optional: Other verbs</a><ul>
+<li class="chapter" data-level="5.4.1" data-path="5-manip.html"><a href="5-manip.html#select"><i class="fa fa-check"></i><b>5.4.1</b> Select variables using select</a></li>
+<li class="chapter" data-level="5.4.2" data-path="5-manip.html"><a href="5-manip.html#rename"><i class="fa fa-check"></i><b>5.4.2</b> Rename variables using rename</a></li>
+<li class="chapter" data-level="5.4.3" data-path="5-manip.html"><a href="5-manip.html#find-the-top-number-of-values-using-top_n"><i class="fa fa-check"></i><b>5.4.3</b> Find the top number of values using top_n</a></li>
+</ul></li>
+<li class="chapter" data-level="5.5" data-path="5-manip.html"><a href="5-manip.html#conclusion-1"><i class="fa fa-check"></i><b>5.5</b> Conclusion</a><ul>
+<li class="chapter" data-level="5.5.1" data-path="5-manip.html"><a href="5-manip.html#resources-1"><i class="fa fa-check"></i><b>5.5.1</b> Resources</a></li>
+<li class="chapter" data-level="5.5.2" data-path="5-manip.html"><a href="5-manip.html#script-of-r-code-1"><i class="fa fa-check"></i><b>5.5.2</b> Script of R code</a></li>
+<li class="chapter" data-level="5.5.3" data-path="5-manip.html"><a href="5-manip.html#whats-to-come-2"><i class="fa fa-check"></i><b>5.5.3</b> What’s to come?</a></li>
+</ul></li>
+</ul></li>
+<li class="part"><span><b>II Inference</b></span></li>
+<li class="chapter" data-level="6" data-path="6-sim.html"><a href="6-sim.html"><i class="fa fa-check"></i><b>6</b> Simulating Randomness via mosaic</a><ul>
+<li class="chapter" data-level="" data-path="6-sim.html"><a href="6-sim.html#needed-packages-3"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="6.1" data-path="6-sim.html"><a href="6-sim.html#random-sampling"><i class="fa fa-check"></i><b>6.1</b> Random sampling</a><ul>
+<li class="chapter" data-level="6.1.1" data-path="6-sim.html"><a href="6-sim.html#tasting-soup"><i class="fa fa-check"></i><b>6.1.1</b> Tasting soup</a></li>
+<li class="chapter" data-level="6.1.2" data-path="6-sim.html"><a href="6-sim.html#common-terms"><i class="fa fa-check"></i><b>6.1.2</b> Common terms</a></li>
+</ul></li>
+<li class="chapter" data-level="6.2" data-path="6-sim.html"><a href="6-sim.html#visualizing-sampling"><i class="fa fa-check"></i><b>6.2</b> Visualizing sampling</a><ul>
+<li class="chapter" data-level="6.2.1" data-path="6-sim.html"><a href="6-sim.html#sampling-distribution"><i class="fa fa-check"></i><b>6.2.1</b> Sampling distribution</a></li>
+<li class="chapter" data-level="6.2.2" data-path="6-sim.html"><a href="6-sim.html#repeated-sampling-via-do"><i class="fa fa-check"></i><b>6.2.2</b> Repeated sampling via <code>do</code></a></li>
+</ul></li>
+<li class="chapter" data-level="6.3" data-path="6-sim.html"><a href="6-sim.html#simulation"><i class="fa fa-check"></i><b>6.3</b> Simulation</a></li>
+<li class="chapter" data-level="6.4" data-path="6-sim.html"><a href="6-sim.html#review-of-mosaic-simulation-functions"><i class="fa fa-check"></i><b>6.4</b> Review of <code>mosaic</code> simulation functions</a></li>
+<li class="chapter" data-level="6.5" data-path="6-sim.html"><a href="6-sim.html#conclusion-2"><i class="fa fa-check"></i><b>6.5</b> Conclusion</a><ul>
+<li class="chapter" data-level="6.5.1" data-path="6-sim.html"><a href="6-sim.html#script-of-r-code-2"><i class="fa fa-check"></i><b>6.5.1</b> Script of R code</a></li>
+<li class="chapter" data-level="6.5.2" data-path="6-sim.html"><a href="6-sim.html#whats-to-come-3"><i class="fa fa-check"></i><b>6.5.2</b> What’s to come?</a></li>
+</ul></li>
+</ul></li>
+<li class="chapter" data-level="7" data-path="7-hypo.html"><a href="7-hypo.html"><i class="fa fa-check"></i><b>7</b> Hypothesis Testing</a><ul>
+<li class="chapter" data-level="" data-path="7-hypo.html"><a href="7-hypo.html#needed-packages-4"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="7.1" data-path="7-hypo.html"><a href="7-hypo.html#when-inference-is-not-needed"><i class="fa fa-check"></i><b>7.1</b> When Inference Is Not Needed</a></li>
+<li class="chapter" data-level="7.2" data-path="7-hypo.html"><a href="7-hypo.html#basics-of-hypothesis-testing"><i class="fa fa-check"></i><b>7.2</b> Basics of Hypothesis Testing</a></li>
+<li class="chapter" data-level="7.3" data-path="7-hypo.html"><a href="7-hypo.html#trial"><i class="fa fa-check"></i><b>7.3</b> Criminal trial analogy</a><ul>
+<li class="chapter" data-level="7.3.1" data-path="7-hypo.html"><a href="7-hypo.html#two-possible-conclusions"><i class="fa fa-check"></i><b>7.3.1</b> Two possible conclusions</a></li>
+</ul></li>
+<li class="chapter" data-level="7.4" data-path="7-hypo.html"><a href="7-hypo.html#types-of-errors-in-hypothesis-testing"><i class="fa fa-check"></i><b>7.4</b> Types of Errors in Hypothesis Testing</a><ul>
+<li class="chapter" data-level="7.4.1" data-path="7-hypo.html"><a href="7-hypo.html#logic-of-hypothesis-testing"><i class="fa fa-check"></i><b>7.4.1</b> Logic of Hypothesis Testing</a></li>
+</ul></li>
+<li class="chapter" data-level="7.5" data-path="7-hypo.html"><a href="7-hypo.html#statistical-significance"><i class="fa fa-check"></i><b>7.5</b> Statistical Significance</a></li>
+<li class="chapter" data-level="7.6" data-path="7-hypo.html"><a href="7-hypo.html#example-revisiting-the-lady-tasting-tea"><i class="fa fa-check"></i><b>7.6</b> EXAMPLE: Revisiting the Lady Tasting Tea</a><ul>
+<li class="chapter" data-level="7.6.1" data-path="7-hypo.html"><a href="7-hypo.html#data"><i class="fa fa-check"></i><b>7.6.1</b> Data</a></li>
+<li class="chapter" data-level="7.6.2" data-path="7-hypo.html"><a href="7-hypo.html#test-statistic-delta"><i class="fa fa-check"></i><b>7.6.2</b> Test Statistic <span class="math inline">\(\delta\)</span></a></li>
+<li class="chapter" data-level="7.6.3" data-path="7-hypo.html"><a href="7-hypo.html#observed-effect-delta"><i class="fa fa-check"></i><b>7.6.3</b> Observed effect <span class="math inline">\(\delta^*\)</span></a></li>
+<li class="chapter" data-level="7.6.4" data-path="7-hypo.html"><a href="7-hypo.html#model-of-h_0"><i class="fa fa-check"></i><b>7.6.4</b> Model of <span class="math inline">\(H_0\)</span></a></li>
+<li class="chapter" data-level="7.6.5" data-path="7-hypo.html"><a href="7-hypo.html#simulated-data"><i class="fa fa-check"></i><b>7.6.5</b> Simulated Data</a></li>
+<li class="chapter" data-level="7.6.6" data-path="7-hypo.html"><a href="7-hypo.html#distribution-of-delta-under-h_0"><i class="fa fa-check"></i><b>7.6.6</b> Distribution of <span class="math inline">\(\delta\)</span> under <span class="math inline">\(H_0\)</span></a></li>
+<li class="chapter" data-level="7.6.7" data-path="7-hypo.html"><a href="7-hypo.html#the-p-value"><i class="fa fa-check"></i><b>7.6.7</b> The p-value</a></li>
+</ul></li>
+<li class="chapter" data-level="7.7" data-path="7-hypo.html"><a href="7-hypo.html#example-comparing-two-means"><i class="fa fa-check"></i><b>7.7</b> EXAMPLE: Comparing two means</a><ul>
+<li class="chapter" data-level="7.7.1" data-path="7-hypo.html"><a href="7-hypo.html#randomizationpermutation"><i class="fa fa-check"></i><b>7.7.1</b> Randomization/Permutation</a></li>
+<li class="chapter" data-level="7.7.2" data-path="7-hypo.html"><a href="7-hypo.html#comparing-action-and-romance-movies"><i class="fa fa-check"></i><b>7.7.2</b> Comparing Action and Romance Movies</a></li>
+<li class="chapter" data-level="7.7.3" data-path="7-hypo.html"><a href="7-hypo.html#sampling-rightarrow-randomization"><i class="fa fa-check"></i><b>7.7.3</b> Sampling <span class="math inline">\(\rightarrow\)</span> Randomization</a></li>
+<li class="chapter" data-level="7.7.4" data-path="7-hypo.html"><a href="7-hypo.html#data-1"><i class="fa fa-check"></i><b>7.7.4</b> Data</a></li>
+<li class="chapter" data-level="7.7.5" data-path="7-hypo.html"><a href="7-hypo.html#model-of-h_0-1"><i class="fa fa-check"></i><b>7.7.5</b> Model of <span class="math inline">\(H_0\)</span></a></li>
+<li class="chapter" data-level="7.7.6" data-path="7-hypo.html"><a href="7-hypo.html#test-statistic-delta-1"><i class="fa fa-check"></i><b>7.7.6</b> Test Statistic <span class="math inline">\(\delta\)</span></a></li>
+<li class="chapter" data-level="7.7.7" data-path="7-hypo.html"><a href="7-hypo.html#observed-effect-delta-1"><i class="fa fa-check"></i><b>7.7.7</b> Observed effect <span class="math inline">\(\delta^*\)</span></a></li>
+<li class="chapter" data-level="7.7.8" data-path="7-hypo.html"><a href="7-hypo.html#simulated-data-1"><i class="fa fa-check"></i><b>7.7.8</b> Simulated Data</a></li>
+<li class="chapter" data-level="7.7.9" data-path="7-hypo.html"><a href="7-hypo.html#distribution-of-delta-under-h_0-1"><i class="fa fa-check"></i><b>7.7.9</b> Distribution of <span class="math inline">\(\delta\)</span> under <span class="math inline">\(H_0\)</span></a></li>
+<li class="chapter" data-level="7.7.10" data-path="7-hypo.html"><a href="7-hypo.html#the-p-value-1"><i class="fa fa-check"></i><b>7.7.10</b> The p-value</a></li>
+<li class="chapter" data-level="7.7.11" data-path="7-hypo.html"><a href="7-hypo.html#summary-5"><i class="fa fa-check"></i><b>7.7.11</b> Summary</a></li>
+</ul></li>
+<li class="chapter" data-level="7.8" data-path="7-hypo.html"><a href="7-hypo.html#theory-hypo"><i class="fa fa-check"></i><b>7.8</b> Building theory-based methods using computation</a><ul>
+<li class="chapter" data-level="7.8.1" data-path="7-hypo.html"><a href="7-hypo.html#example-t-test-for-two-independent-samples"><i class="fa fa-check"></i><b>7.8.1</b> EXAMPLE: <span class="math inline">\(t\)</span>-test for two independent samples</a></li>
+<li class="chapter" data-level="7.8.2" data-path="7-hypo.html"><a href="7-hypo.html#conditions-for-t-test"><i class="fa fa-check"></i><b>7.8.2</b> Conditions for t-test</a></li>
+</ul></li>
+<li class="chapter" data-level="7.9" data-path="7-hypo.html"><a href="7-hypo.html#conclusion-3"><i class="fa fa-check"></i><b>7.9</b> Conclusion</a><ul>
+<li class="chapter" data-level="7.9.1" data-path="7-hypo.html"><a href="7-hypo.html#script-of-r-code-3"><i class="fa fa-check"></i><b>7.9.1</b> Script of R code</a></li>
+<li class="chapter" data-level="7.9.2" data-path="7-hypo.html"><a href="7-hypo.html#whats-to-come-4"><i class="fa fa-check"></i><b>7.9.2</b> What’s to come?</a></li>
+</ul></li>
+</ul></li>
+<li class="chapter" data-level="8" data-path="8-ci.html"><a href="8-ci.html"><i class="fa fa-check"></i><b>8</b> Confidence Intervals</a><ul>
+<li class="chapter" data-level="" data-path="8-ci.html"><a href="8-ci.html#needed-packages-5"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="8.1" data-path="8-ci.html"><a href="8-ci.html#bootstrapping"><i class="fa fa-check"></i><b>8.1</b> Bootstrapping</a><ul>
+<li class="chapter" data-level="8.1.1" data-path="8-ci.html"><a href="8-ci.html#review-of-bootstrapping"><i class="fa fa-check"></i><b>8.1.1</b> Review of Bootstrapping</a></li>
+</ul></li>
+<li class="chapter" data-level="8.2" data-path="8-ci.html"><a href="8-ci.html#relation-to-hypothesis-testing"><i class="fa fa-check"></i><b>8.2</b> Relation to hypothesis testing</a></li>
+<li class="chapter" data-level="8.3" data-path="8-ci.html"><a href="8-ci.html#effect-size"><i class="fa fa-check"></i><b>8.3</b> Effect size</a></li>
+<li class="chapter" data-level="8.4" data-path="8-ci.html"><a href="8-ci.html#conclusion-4"><i class="fa fa-check"></i><b>8.4</b> Conclusion</a><ul>
+<li class="chapter" data-level="8.4.1" data-path="8-ci.html"><a href="8-ci.html#script-of-r-code-4"><i class="fa fa-check"></i><b>8.4.1</b> Script of R code</a></li>
+<li class="chapter" data-level="8.4.2" data-path="8-ci.html"><a href="8-ci.html#whats-to-come-5"><i class="fa fa-check"></i><b>8.4.2</b> What’s to come?</a></li>
+</ul></li>
+</ul></li>
+<li class="chapter" data-level="9" data-path="9-regress.html"><a href="9-regress.html"><i class="fa fa-check"></i><b>9</b> Regression via broom</a><ul>
+<li class="chapter" data-level="" data-path="9-regress.html"><a href="9-regress.html#needed-packages-6"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="9.1" data-path="9-regress.html"><a href="9-regress.html#example-alaskan-airlines-delays"><i class="fa fa-check"></i><b>9.1</b> EXAMPLE: Alaskan Airlines delays</a></li>
+<li class="chapter" data-level="9.2" data-path="9-regress.html"><a href="9-regress.html#correlation"><i class="fa fa-check"></i><b>9.2</b> Correlation</a><ul>
+<li class="chapter" data-level="9.2.1" data-path="9-regress.html"><a href="9-regress.html#correlation-does-not-imply-causation"><i class="fa fa-check"></i><b>9.2.1</b> Correlation does not imply causation</a></li>
+</ul></li>
+<li class="chapter" data-level="9.3" data-path="9-regress.html"><a href="9-regress.html#linear-regression"><i class="fa fa-check"></i><b>9.3</b> Linear regression</a><ul>
+<li class="chapter" data-level="9.3.1" data-path="9-regress.html"><a href="9-regress.html#understanding-linear-regression-basics"><i class="fa fa-check"></i><b>9.3.1</b> Understanding linear regression basics</a></li>
+<li class="chapter" data-level="9.3.2" data-path="9-regress.html"><a href="9-regress.html#the-equation-of-the-line"><i class="fa fa-check"></i><b>9.3.2</b> The equation of the line</a></li>
+<li class="chapter" data-level="9.3.3" data-path="9-regress.html"><a href="9-regress.html#interpreting-the-slope"><i class="fa fa-check"></i><b>9.3.3</b> Interpreting the slope</a></li>
+<li class="chapter" data-level="9.3.4" data-path="9-regress.html"><a href="9-regress.html#predicting-values"><i class="fa fa-check"></i><b>9.3.4</b> Predicting values</a></li>
+</ul></li>
+<li class="chapter" data-level="9.4" data-path="9-regress.html"><a href="9-regress.html#inference-for-regression"><i class="fa fa-check"></i><b>9.4</b> Inference for regression</a><ul>
+<li class="chapter" data-level="9.4.1" data-path="9-regress.html"><a href="9-regress.html#data-2"><i class="fa fa-check"></i><b>9.4.1</b> Data</a></li>
+<li class="chapter" data-level="9.4.2" data-path="9-regress.html"><a href="9-regress.html#test-statistic-delta-2"><i class="fa fa-check"></i><b>9.4.2</b> Test Statistic <span class="math inline">\(\delta\)</span></a></li>
+<li class="chapter" data-level="9.4.3" data-path="9-regress.html"><a href="9-regress.html#observed-effect-delta-2"><i class="fa fa-check"></i><b>9.4.3</b> Observed effect <span class="math inline">\(\delta^*\)</span></a></li>
+<li class="chapter" data-level="9.4.4" data-path="9-regress.html"><a href="9-regress.html#model-of-h_0-2"><i class="fa fa-check"></i><b>9.4.4</b> Model of <span class="math inline">\(H_0\)</span></a></li>
+<li class="chapter" data-level="9.4.5" data-path="9-regress.html"><a href="9-regress.html#simulated-data-2"><i class="fa fa-check"></i><b>9.4.5</b> Simulated Data</a></li>
+<li class="chapter" data-level="9.4.6" data-path="9-regress.html"><a href="9-regress.html#distribution-of-delta-under-h_0-2"><i class="fa fa-check"></i><b>9.4.6</b> Distribution of <span class="math inline">\(\delta\)</span> under <span class="math inline">\(H_0\)</span></a></li>
+<li class="chapter" data-level="9.4.7" data-path="9-regress.html"><a href="9-regress.html#the-p-value-2"><i class="fa fa-check"></i><b>9.4.7</b> The p-value</a></li>
+</ul></li>
+<li class="chapter" data-level="9.5" data-path="9-regress.html"><a href="9-regress.html#resid"><i class="fa fa-check"></i><b>9.5</b> Residual analysis</a></li>
+<li class="chapter" data-level="9.6" data-path="9-regress.html"><a href="9-regress.html#conditions-for-regression"><i class="fa fa-check"></i><b>9.6</b> Conditions for regression</a></li>
+<li class="chapter" data-level="9.7" data-path="9-regress.html"><a href="9-regress.html#conclusion-5"><i class="fa fa-check"></i><b>9.7</b> Conclusion</a><ul>
+<li class="chapter" data-level="9.7.1" data-path="9-regress.html"><a href="9-regress.html#script-of-r-code-5"><i class="fa fa-check"></i><b>9.7.1</b> Script of R code</a></li>
+<li class="chapter" data-level="9.7.2" data-path="9-regress.html"><a href="9-regress.html#whats-to-come-6"><i class="fa fa-check"></i><b>9.7.2</b> What’s to come?</a></li>
+</ul></li>
+</ul></li>
+<li class="part"><span><b>III Conclusion</b></span></li>
+<li class="chapter" data-level="10" data-path="10-effective-data-storytelling.html"><a href="10-effective-data-storytelling.html"><i class="fa fa-check"></i><b>10</b> Effective Data Storytelling</a><ul>
+<li class="chapter" data-level="" data-path="10-effective-data-storytelling.html"><a href="10-effective-data-storytelling.html#concluding-remarks"><i class="fa fa-check"></i>Concluding Remarks</a></li>
+</ul></li>
+<li class="appendix"><span><b>Appendix</b></span></li>
+<li class="chapter" data-level="A" data-path="A-appendixA.html"><a href="A-appendixA.html"><i class="fa fa-check"></i><b>A</b> Statistical Background</a><ul>
+<li class="chapter" data-level="A.1" data-path="A-appendixA.html"><a href="A-appendixA.html#basic-statistical-terms"><i class="fa fa-check"></i><b>A.1</b> Basic statistical terms</a><ul>
+<li class="chapter" data-level="A.1.1" data-path="A-appendixA.html"><a href="A-appendixA.html#mean"><i class="fa fa-check"></i><b>A.1.1</b> Mean</a></li>
+<li class="chapter" data-level="A.1.2" data-path="A-appendixA.html"><a href="A-appendixA.html#median"><i class="fa fa-check"></i><b>A.1.2</b> Median</a></li>
+<li class="chapter" data-level="A.1.3" data-path="A-appendixA.html"><a href="A-appendixA.html#standard-deviation"><i class="fa fa-check"></i><b>A.1.3</b> Standard deviation</a></li>
+<li class="chapter" data-level="A.1.4" data-path="A-appendixA.html"><a href="A-appendixA.html#five-number-summary"><i class="fa fa-check"></i><b>A.1.4</b> Five-number summary</a></li>
+<li class="chapter" data-level="A.1.5" data-path="A-appendixA.html"><a href="A-appendixA.html#distribution"><i class="fa fa-check"></i><b>A.1.5</b> Distribution</a></li>
+<li class="chapter" data-level="A.1.6" data-path="A-appendixA.html"><a href="A-appendixA.html#outliers"><i class="fa fa-check"></i><b>A.1.6</b> Outliers</a></li>
+</ul></li>
+</ul></li>
+<li class="chapter" data-level="B" data-path="B-appendixB.html"><a href="B-appendixB.html"><i class="fa fa-check"></i><b>B</b> Inference Examples</a><ul>
+<li class="chapter" data-level="" data-path="B-appendixB.html"><a href="B-appendixB.html#needed-packages-7"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="B.1" data-path="B-appendixB.html"><a href="B-appendixB.html#inference-mind-map"><i class="fa fa-check"></i><b>B.1</b> Inference Mind Map</a></li>
+<li class="chapter" data-level="B.2" data-path="B-appendixB.html"><a href="B-appendixB.html#one-mean"><i class="fa fa-check"></i><b>B.2</b> One Mean</a><ul>
+<li class="chapter" data-level="B.2.1" data-path="B-appendixB.html"><a href="B-appendixB.html#problem-statement"><i class="fa fa-check"></i><b>B.2.1</b> Problem Statement</a></li>
+<li class="chapter" data-level="B.2.2" data-path="B-appendixB.html"><a href="B-appendixB.html#competing-hypotheses"><i class="fa fa-check"></i><b>B.2.2</b> Competing Hypotheses</a></li>
+<li class="chapter" data-level="B.2.3" data-path="B-appendixB.html"><a href="B-appendixB.html#exploring-the-sample-data"><i class="fa fa-check"></i><b>B.2.3</b> Exploring the sample data</a></li>
+<li class="chapter" data-level="B.2.4" data-path="B-appendixB.html"><a href="B-appendixB.html#non-traditional-methods"><i class="fa fa-check"></i><b>B.2.4</b> Non-traditional methods</a></li>
+<li class="chapter" data-level="B.2.5" data-path="B-appendixB.html"><a href="B-appendixB.html#traditional-methods"><i class="fa fa-check"></i><b>B.2.5</b> Traditional methods</a></li>
+<li class="chapter" data-level="B.2.6" data-path="B-appendixB.html"><a href="B-appendixB.html#comparing-results"><i class="fa fa-check"></i><b>B.2.6</b> Comparing results</a></li>
+</ul></li>
+<li class="chapter" data-level="B.3" data-path="B-appendixB.html"><a href="B-appendixB.html#one-proportion"><i class="fa fa-check"></i><b>B.3</b> One Proportion</a><ul>
+<li class="chapter" data-level="B.3.1" data-path="B-appendixB.html"><a href="B-appendixB.html#problem-statement-1"><i class="fa fa-check"></i><b>B.3.1</b> Problem Statement</a></li>
+<li class="chapter" data-level="B.3.2" data-path="B-appendixB.html"><a href="B-appendixB.html#competing-hypotheses-1"><i class="fa fa-check"></i><b>B.3.2</b> Competing Hypotheses</a></li>
+<li class="chapter" data-level="B.3.3" data-path="B-appendixB.html"><a href="B-appendixB.html#exploring-the-sample-data-1"><i class="fa fa-check"></i><b>B.3.3</b> Exploring the sample data</a></li>
+<li class="chapter" data-level="B.3.4" data-path="B-appendixB.html"><a href="B-appendixB.html#non-traditional-methods-1"><i class="fa fa-check"></i><b>B.3.4</b> Non-traditional methods</a></li>
+<li class="chapter" data-level="B.3.5" data-path="B-appendixB.html"><a href="B-appendixB.html#traditional-methods-1"><i class="fa fa-check"></i><b>B.3.5</b> Traditional methods</a></li>
+<li class="chapter" data-level="B.3.6" data-path="B-appendixB.html"><a href="B-appendixB.html#comparing-results-1"><i class="fa fa-check"></i><b>B.3.6</b> Comparing results</a></li>
+</ul></li>
+<li class="chapter" data-level="B.4" data-path="B-appendixB.html"><a href="B-appendixB.html#two-proportions"><i class="fa fa-check"></i><b>B.4</b> Two Proportions</a><ul>
+<li class="chapter" data-level="B.4.1" data-path="B-appendixB.html"><a href="B-appendixB.html#problem-statement-2"><i class="fa fa-check"></i><b>B.4.1</b> Problem Statement</a></li>
+<li class="chapter" data-level="B.4.2" data-path="B-appendixB.html"><a href="B-appendixB.html#competing-hypotheses-2"><i class="fa fa-check"></i><b>B.4.2</b> Competing Hypotheses</a></li>
+<li class="chapter" data-level="B.4.3" data-path="B-appendixB.html"><a href="B-appendixB.html#exploring-the-sample-data-2"><i class="fa fa-check"></i><b>B.4.3</b> Exploring the sample data</a></li>
+<li class="chapter" data-level="B.4.4" data-path="B-appendixB.html"><a href="B-appendixB.html#non-traditional-methods-2"><i class="fa fa-check"></i><b>B.4.4</b> Non-traditional methods</a></li>
+<li class="chapter" data-level="B.4.5" data-path="B-appendixB.html"><a href="B-appendixB.html#traditional-methods-2"><i class="fa fa-check"></i><b>B.4.5</b> Traditional methods</a></li>
+<li class="chapter" data-level="B.4.6" data-path="B-appendixB.html"><a href="B-appendixB.html#check-conditions-2"><i class="fa fa-check"></i><b>B.4.6</b> Check conditions</a></li>
+<li class="chapter" data-level="B.4.7" data-path="B-appendixB.html"><a href="B-appendixB.html#test-statistic-2"><i class="fa fa-check"></i><b>B.4.7</b> Test statistic</a></li>
+<li class="chapter" data-level="B.4.8" data-path="B-appendixB.html"><a href="B-appendixB.html#state-conclusion-2"><i class="fa fa-check"></i><b>B.4.8</b> State conclusion</a></li>
+<li class="chapter" data-level="B.4.9" data-path="B-appendixB.html"><a href="B-appendixB.html#comparing-results-2"><i class="fa fa-check"></i><b>B.4.9</b> Comparing results</a></li>
+</ul></li>
+<li class="chapter" data-level="B.5" data-path="B-appendixB.html"><a href="B-appendixB.html#two-means-independent-samples"><i class="fa fa-check"></i><b>B.5</b> Two Means (Independent Samples)</a><ul>
+<li class="chapter" data-level="B.5.1" data-path="B-appendixB.html"><a href="B-appendixB.html#problem-statement-3"><i class="fa fa-check"></i><b>B.5.1</b> Problem Statement</a></li>
+<li class="chapter" data-level="B.5.2" data-path="B-appendixB.html"><a href="B-appendixB.html#competing-hypotheses-3"><i class="fa fa-check"></i><b>B.5.2</b> Competing Hypotheses</a></li>
+<li class="chapter" data-level="B.5.3" data-path="B-appendixB.html"><a href="B-appendixB.html#exploring-the-sample-data-3"><i class="fa fa-check"></i><b>B.5.3</b> Exploring the sample data</a></li>
+<li class="chapter" data-level="B.5.4" data-path="B-appendixB.html"><a href="B-appendixB.html#non-traditional-methods-3"><i class="fa fa-check"></i><b>B.5.4</b> Non-traditional methods</a></li>
+<li class="chapter" data-level="B.5.5" data-path="B-appendixB.html"><a href="B-appendixB.html#traditional-methods-3"><i class="fa fa-check"></i><b>B.5.5</b> Traditional methods</a></li>
+<li class="chapter" data-level="B.5.6" data-path="B-appendixB.html"><a href="B-appendixB.html#test-statistic-3"><i class="fa fa-check"></i><b>B.5.6</b> Test statistic</a></li>
+<li class="chapter" data-level="B.5.7" data-path="B-appendixB.html"><a href="B-appendixB.html#compute-p-value-2"><i class="fa fa-check"></i><b>B.5.7</b> Compute <span class="math inline">\(p\)</span>-value</a></li>
+<li class="chapter" data-level="B.5.8" data-path="B-appendixB.html"><a href="B-appendixB.html#state-conclusion-3"><i class="fa fa-check"></i><b>B.5.8</b> State conclusion</a></li>
+<li class="chapter" data-level="B.5.9" data-path="B-appendixB.html"><a href="B-appendixB.html#comparing-results-3"><i class="fa fa-check"></i><b>B.5.9</b> Comparing results</a></li>
+</ul></li>
+<li class="chapter" data-level="B.6" data-path="B-appendixB.html"><a href="B-appendixB.html#two-means-paired-samples"><i class="fa fa-check"></i><b>B.6</b> Two Means (Paired Samples)</a><ul>
+<li class="chapter" data-level="B.6.1" data-path="B-appendixB.html"><a href="B-appendixB.html#competing-hypotheses-4"><i class="fa fa-check"></i><b>B.6.1</b> Competing Hypotheses</a></li>
+<li class="chapter" data-level="B.6.2" data-path="B-appendixB.html"><a href="B-appendixB.html#exploring-the-sample-data-4"><i class="fa fa-check"></i><b>B.6.2</b> Exploring the sample data</a></li>
+<li class="chapter" data-level="B.6.3" data-path="B-appendixB.html"><a href="B-appendixB.html#non-traditional-methods-4"><i class="fa fa-check"></i><b>B.6.3</b> Non-traditional methods</a></li>
+<li class="chapter" data-level="B.6.4" data-path="B-appendixB.html"><a href="B-appendixB.html#traditional-methods-4"><i class="fa fa-check"></i><b>B.6.4</b> Traditional methods</a></li>
+<li class="chapter" data-level="B.6.5" data-path="B-appendixB.html"><a href="B-appendixB.html#comparing-results-4"><i class="fa fa-check"></i><b>B.6.5</b> Comparing results</a></li>
+</ul></li>
+</ul></li>
+<li class="chapter" data-level="C" data-path="C-appendixC.html"><a href="C-appendixC.html"><i class="fa fa-check"></i><b>C</b> Reach for the Stars</a><ul>
+<li class="chapter" data-level="" data-path="C-appendixC.html"><a href="C-appendixC.html#needed-packages-8"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="C.1" data-path="C-appendixC.html"><a href="C-appendixC.html#sorted-barplots"><i class="fa fa-check"></i><b>C.1</b> Sorted barplots</a></li>
+<li class="chapter" data-level="C.2" data-path="C-appendixC.html"><a href="C-appendixC.html#interactive-graphics"><i class="fa fa-check"></i><b>C.2</b> Interactive graphics</a><ul>
+<li class="chapter" data-level="C.2.1" data-path="C-appendixC.html"><a href="C-appendixC.html#interactive-line-graphs"><i class="fa fa-check"></i><b>C.2.1</b> Interactive line-graphs</a></li>
+</ul></li>
+</ul></li>
+<li class="chapter" data-level="" data-path="references.html"><a href="references.html"><i class="fa fa-check"></i>References</a></li>
+</ul>
+
+      </nav>
+    </div>
+
+    <div class="book-body">
+      <div class="body-inner">
+        <div class="book-header" role="navigation">
+          <h1>
+            <i class="fa fa-circle-o-notch fa-spin"></i><a href="./">ModernDive</a>
+          </h1>
+        </div>
+
+        <div class="page-wrapper" tabindex="-1" role="main">
+          <div class="page-inner">
+
+            <section class="normal" id="section-">
+<div id="manip" class="section level1">
+<h1><span class="header-section-number">5</span> Data Manipulation via dplyr</h1>
+<!--
+- Make sure to refer back to plots in the viz chapter and how the
+  material here relates to answering those questions
+-->
+<p>Let’s briefly recap where we have been so far and where we are headed. In Chapter <a href="3-tidy.html#tidy">3</a>, we discussed what it means for data to be tidy. We saw that this refers to observational units corresponding to rows and variables being stored in columns (one variable for every column). The entries in the data frame correspond to different combinations of observational units and variables. In the <code>flights</code> data frame, we saw that each row corresponds to a different flight leaving New York City. In other words, the observational unit of that tidy data frame is a flight. The variables are listed as columns and for <code>flights</code> they include both quantitative variables like <code>dep_delay</code> and <code>distance</code> but also categorical variables like <code>carrier</code> and <code>origin</code>. An entry in the table corresponds to a particular flight on a given day and a particular value of a given variable representing that flight.</p>
+<p>We saw in Chapter <a href="4-viz.html#viz">4</a> that organizing data in this tidy way makes it easy for us to produce graphics. We can simply specify what variable/column we would like on one axis, what variable we’d like on the other axis, and what type of plot we’d like to make. We can also do things such as changing the color by another variable or changing the size of our points by a fourth variable given this tidy data set.</p>
+<p>Furthermore, in Chapter <a href="4-viz.html#viz">4</a>, we hinted at some ways to summarize and manipulate data to suit your needs. This chapter expands on this by giving a variety of examples using what we call the <em>Five Main Verbs</em> in the <code>dplyr</code> package <span class="citation">(Wickham and Francois <a href="#ref-R-dplyr">2016</a>)</span>. There are more advanced operations than just these and you’ll see some examples of this near the end of the chapter.</p>
+<p>While at various points we specifically mention using the <code>View()</code> command to inspect a particular data frame, feel free to do so at any time. In fact, you should get into the habit of doing this for <em>any</em> data frame you work with.</p>
+<div id="needed-packages-2" class="section level3 unnumbered">
+<h3>Needed packages</h3>
+<p>Before we proceed with this chapter, let’s load all the necessary packages.</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">library</span>(dplyr)
+<span class="kw">library</span>(ggplot2)
+<span class="kw">library</span>(nycflights13)
+<span class="kw">library</span>(knitr)</code></pre></div>
+<!--Subsection on Pipe -->
+</div>
+<div id="the-pipe" class="section level2">
+<h2><span class="header-section-number">5.1</span> The pipe <code>%&gt;%</code></h2>
+<p>Before we introduce the five main verbs, we first introduce the pipe operator (<code>%&gt;%</code>). Just as the <code>+</code> sign was used to add layers to a plot created using <code>ggplot()</code>, the pipe operator allows us to chain together <code>dplyr</code> data manipulation functions. The pipe operator can be read as “<em>then</em>”. The <code>%&gt;%</code> operator allows us to go from one step in <code>dplyr</code> to the next easily so we can, for example:</p>
+<ul>
+<li><code>filter</code> our data frame to only focus on a few rows <em>then</em></li>
+<li><code>group_by</code> another variable to create groups <em>then</em></li>
+<li><code>summarize</code> this grouped data to calculate the mean for each level of the group.</li>
+</ul>
+<p>The piping syntax will be our major focus throughout the rest of this book and you’ll find that, with some practice, you’ll quickly become addicted to chaining. If you’d like to see more examples on using <code>dplyr</code>, the 5MV (in addition to some other <code>dplyr</code> verbs), and <code>%&gt;%</code> with the <code>nycflights13</code> data set, you can check out Chapter 5 of Hadley and Garrett’s book <span class="citation">(Grolemund and Wickham <a href="#ref-rds2016">2016</a>)</span>.</p>
+<!--Subsection on Verbs -->
+</div>
+<div id="five-main-verbs---the-5mv" class="section level2">
+<h2><span class="header-section-number">5.2</span> Five Main Verbs - The 5MV</h2>
+<p>The <code>d</code> in <code>dplyr</code> stands for data frames, so the functions here work when you are working with objects of the data frame type. It’s most important for you to focus on the 5MV: the five most commonly used functions that help us manipulate and summarize data. A description of these verbs follows with each subsection devoted to seeing an example of that verb in play (or a combination of a few verbs):</p>
+<ul>
+<li><code>filter</code>: Pick rows based on conditions about their values</li>
+<li><code>summarize</code>: Create summary measures of variables either
+<ul>
+<li>over the entire data frame</li>
+<li>or over groups of observations on variables using <code>group_by</code></li>
+</ul></li>
+<li><code>mutate</code>: Create a new variable in the data frame by mutating existing ones</li>
+<li><code>arrange</code>: Arrange/sort the rows based on one or more variables</li>
+</ul>
+<p>Just as we had the 5NG (The Five Named Graphs in Chapter <a href="4-viz.html#viz">4</a> using <code>ggplot2</code>) for data visualization, we also have the 5MV here (The Five Main Verbs in <code>dplyr</code>) for data manipulation. All of the 5MVs follow the same syntax with the argument before the pipe <code>%&gt;%</code> being the name of the data frame and then the name of the verb with other arguments specifying which criteria you’d like the verb to work with in parentheses.</p>
+<div id="filter" class="section level3">
+<h3><span class="header-section-number">5.2.1</span> 5MV#1: Filter observations using filter</h3>
+<div class="figure" style="text-align: center"><span id="fig:filter"></span>
+<img src="images/filter.png" alt="Filter diagram from Data Wrangling with dplyr and tidyr cheatsheet" width="\textwidth" />
+<p class="caption">
+Figure 5.1: Filter diagram from Data Wrangling with dplyr and tidyr cheatsheet
+</p>
+</div>
+<p>The <code>filter</code> function here works much like the “Filter” option in Microsoft Excel; it allows you to specify criteria about values of a variable in your data set and then chooses only those rows that match those criteria. We begin by focusing only on flights from New York City to Portland, Oregon. The <code>dest</code> code (or airport code) for Portland, Oregon is <code>&quot;PDX&quot;</code>. Run the following and look at the resulting spreadsheet to ensure that only flights heading to Portland are chosen here:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">portland_flights &lt;-<span class="st"> </span>flights %&gt;%<span class="st"> </span>
+<span class="st">  </span><span class="kw">filter</span>(dest ==<span class="st"> &quot;PDX&quot;</span>)
+<span class="kw">View</span>(portland_flights)</code></pre></div>
+<p>Note the following:</p>
+<ul>
+<li>The ordering of the commands:
+<ul>
+<li>Take the data frame <code>flights</code> <em>then</em></li>
+<li><code>filter</code> the data frame so that only those where the <code>dest</code> equals <code>&quot;PDX&quot;</code> are included.</li>
+</ul></li>
+<li>The double equal sign <code>==</code>: you are almost guaranteed to make the mistake, at least once, of including only one equals sign. Let’s see what happens when we make this error:</li>
+</ul>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">portland_flights &lt;-<span class="st"> </span>flights %&gt;%<span class="st"> </span>
+<span class="st">  </span><span class="kw">filter</span>(<span class="dt">dest =</span> <span class="st">&quot;PDX&quot;</span>)</code></pre></div>
+<pre><code>Error: filter() takes unnamed arguments. Do you need `==`?</code></pre>
+<p>You can combine multiple criteria together using operators that make comparisons:</p>
+<ul>
+<li><code>|</code> corresponds to “or”</li>
+<li><code>&amp;</code> corresponds to “and”</li>
+</ul>
+<p>We can often skip the use of <code>&amp;</code> and just separate our conditions with a comma. You’ll see this in the example below.</p>
+<p>In addition, you can use other mathematical checks (similar to <code>==</code>):</p>
+<ul>
+<li><code>&gt;</code> corresponds to “greater than”</li>
+<li><code>&lt;</code> corresponds to “less than”</li>
+<li><code>&gt;=</code> corresponds to “greater than or equal to”</li>
+<li><code>&lt;=</code> corresponds to “less than or equal to”</li>
+<li><code>!=</code> corresponds to “not equal to”</li>
+</ul>
+<p>To see many of these in action, let’s select all flights that left JFK airport heading to Burlington, Vermont (<code>&quot;BTV&quot;</code>) or Seattle, Washington (<code>&quot;SEA&quot;</code>) in the months of October, November, or December. Run the following:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">btv_sea_flights_fall &lt;-<span class="st"> </span>flights %&gt;%<span class="st"> </span>
+<span class="st">  </span><span class="kw">filter</span>(origin ==<span class="st"> &quot;JFK&quot;</span>, (dest ==<span class="st"> &quot;BTV&quot;</span> |<span class="st"> </span>dest ==<span class="st"> &quot;SEA&quot;</span>), month &gt;=<span class="st"> </span><span class="dv">10</span>)
+<span class="kw">View</span>(btv_sea_flights_fall)</code></pre></div>
+<p>Note how even though colloquially speaking one might say “all flights heading to Burlington, Vermont <em>and</em> Seattle, Washington”, in terms of computer operations, we really mean “all flights heading to Burlington, Vermont <em>or</em> Seattle, Washington”, because for a given row in the data, <code>dest</code> can either be: “BTV”, “SEA”, or something else, but not “BTV” and “SEA” at the same time.</p>
+<p>Another example uses the <code>!</code> to pick rows that <strong>DON’T</strong> match a condition. Here we are selecting rows corresponding to flights that didn’t go to Burlington, VT or Seattle, WA.</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">not_BTV_SEA &lt;-<span class="st"> </span>flights %&gt;%<span class="st"> </span>
+<span class="st">  </span><span class="kw">filter</span>(!(dest ==<span class="st"> &quot;BTV&quot;</span> |<span class="st"> </span>dest ==<span class="st"> &quot;SEA&quot;</span>))
+<span class="kw">View</span>(not_BTV_SEA)</code></pre></div>
+<p>As a final note we point out that <code>filter()</code> should often be the first verb you’ll apply to your data. This cleans your data set to only those rows you care about, or put differently, it narrows down the scope to just the observational units you care about.</p>
+<hr />
+<div class="learncheck">
+<p>
+<strong><em>Learning check</em></strong>
+</p>
+</div>
+<p><strong>(LC5.1)</strong> What’s another way using <code>!</code> we could filter only the rows that are not going to Burlington, VT nor Seattle, WA in the <code>flights</code> data frame? Test this out using the code above.</p>
+<hr />
+</div>
+<div id="mv2-summarize-variables-using-summarize" class="section level3">
+<h3><span class="header-section-number">5.2.2</span> 5MV#2: Summarize variables using summarize</h3>
+<div class="figure" style="text-align: center"><span id="fig:sum1"></span>
+<img src="images/summarize1.png" alt="Summarize diagram from Data Wrangling with dplyr and tidyr cheatsheet" width="\textwidth" />
+<p class="caption">
+Figure 5.2: Summarize diagram from Data Wrangling with dplyr and tidyr cheatsheet
+</p>
+</div>
+<div class="figure" style="text-align: center"><span id="fig:sum2"></span>
+<img src="images/summary.png" alt="Another summarize diagram from Data Wrangling with dplyr and tidyr cheatsheet" width="\textwidth" />
+<p class="caption">
+Figure 5.3: Another summarize diagram from Data Wrangling with dplyr and tidyr cheatsheet
+</p>
+</div>
+<p>We saw in Subsection <a href="#contsum"><strong>??</strong></a> a way to calculate the standard deviation and mean of the temperature variable <code>temp</code> in the <code>weather</code> data frame of <code>nycflights13</code>. We can do so in one step using the <code>summarize</code> function in <code>dplyr</code>:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">summary_temp &lt;-<span class="st"> </span>weather %&gt;%<span class="st"> </span>
+<span class="st">  </span><span class="kw">summarize</span>(<span class="dt">mean =</span> <span class="kw">mean</span>(temp), <span class="dt">std_dev =</span> <span class="kw">sd</span>(temp))
+summary_temp</code></pre></div>
+<pre><code>## # A tibble: 1 × 2
+##    mean std_dev
+##   &lt;dbl&gt;   &lt;dbl&gt;
+## 1    NA      NA</code></pre>
+<p>We’ve created a small data frame here called <code>summary_temp</code> that includes both the <code>mean</code> and the <code>std_dev</code> of the <code>temp</code> variable in <code>weather</code>. Notice as shown in Figures <a href="5-manip.html#fig:sum1">5.2</a> and <a href="5-manip.html#fig:sum2">5.3</a>, the data frame <code>weather</code> went from many rows to a single row of just the summary values in the data frame <code>summary_temp</code>. But why are the mean and standard deviation missing, i.e. <code>NA</code>? Remember that by default the <code>mean</code> and <code>sd</code> functions do not ignore missing values. We need to specify the argument <code>na.rm=TRUE</code> (<code>rm</code> is short for “remove”):</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">summary_temp &lt;-<span class="st"> </span>weather %&gt;%<span class="st"> </span>
+<span class="st">  </span><span class="kw">summarize</span>(<span class="dt">mean =</span> <span class="kw">mean</span>(temp, <span class="dt">na.rm =</span> <span class="ot">TRUE</span>), <span class="dt">std_dev =</span> <span class="kw">sd</span>(temp, <span class="dt">na.rm =</span> <span class="ot">TRUE</span>))
+summary_temp</code></pre></div>
+<pre><code>## # A tibble: 1 × 2
+##    mean std_dev
+##   &lt;dbl&gt;   &lt;dbl&gt;
+## 1  55.2   17.78</code></pre>
+<p>If we’d like to access either of these values directly we can use the <code>$</code> to specify a column in a data frame. For example:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">summary_temp$mean</code></pre></div>
+<pre><code>## [1] 55.2</code></pre>
+<p>You’ll often encounter issues with missing values <code>NA</code>. In fact, an entire branch of the field of statistics deals with missing data. However, it is not good practice to include a <code>na.rm = TRUE</code> in your summary commands by default; you should attempt to run them without this argument. The idea is that you should at the very least be alerted to the presence of missing values and consider what the impact on the analysis might be if you ignore these values. In other words, <code>na.rm = TRUE</code> should only be used when necessary.</p>
+<!--
+
+-->
+<p>What other summary functions can we use inside the <code>summarize()</code> verb? Any function in R that takes a vector of values and returns just one. Here are just a few:</p>
+<ul>
+<li><code>min()</code> and <code>max()</code>: the minimum and maximum values respectively</li>
+<li><code>IQR()</code>: Interquartile range</li>
+<li><code>sum()</code>: the sum</li>
+<li><code>n()</code>: a count of the number of rows/observations in each group. This particular summary function will make more sense in the <code>group_by</code> chapter.</li>
+</ul>
+<hr />
+<div class="learncheck">
+<p>
+<strong><em>Learning check</em></strong>
+</p>
+</div>
+<p><strong>(LC5.2)</strong> Say a doctor is studying the effect of smoking on lung cancer of a large number of patients who have records measured at five year intervals. He notices that a large number of patients have missing data points because the patient has died, so he chooses to ignore these patients in his analysis. What is wrong with this doctor’s approach?</p>
+<p><strong>(LC5.3)</strong> Modify the above <code>summarize</code> function to use the <code>n()</code> summary function: <code>summarize(count=n())</code>. What does the returned value correspond to?</p>
+<p><strong>(LC5.4)</strong> Why doesn’t the following code work? You may want to run the code line by line:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">summary_temp &lt;-<span class="st"> </span>weather %&gt;%<span class="st">   </span>
+<span class="st">  </span><span class="kw">summarize</span>(<span class="dt">mean =</span> <span class="kw">mean</span>(temp, <span class="dt">na.rm =</span> <span class="ot">TRUE</span>)) %&gt;%<span class="st"> </span>
+<span class="st">  </span><span class="kw">summarize</span>(<span class="dt">std_dev =</span> <span class="kw">sd</span>(temp, <span class="dt">na.rm =</span> <span class="ot">TRUE</span>))</code></pre></div>
+<hr />
+</div>
+<div id="mv3-group-rows-using-group_by" class="section level3">
+<h3><span class="header-section-number">5.2.3</span> 5MV#3: Group rows using group_by</h3>
+<div class="figure" style="text-align: center"><span id="fig:groupsummarize"></span>
+<img src="images/group_summary.png" alt="Group by and summarize diagram from Data Wrangling with dplyr and tidyr cheatsheet" width="\textwidth" />
+<p class="caption">
+Figure 5.4: Group by and summarize diagram from Data Wrangling with dplyr and tidyr cheatsheet
+</p>
+</div>
+<p>However, it’s often more useful to summarize a variable based on the groupings of another variable. Let’s say similarly to the previous section, we are interested in the mean and standard deviation of temperatures but <em>grouped by month</em>. This concept can equivalently be articulated as: we want the mean and standard deviation of temperatures</p>
+<ol style="list-style-type: decimal">
+<li>split by month.</li>
+<li>sliced by month.</li>
+<li>aggregated by month.</li>
+<li>collapsed over month.</li>
+</ol>
+<p>We believe that you will be amazed at just how simple this is. Run the following code:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">summary_monthly_temp &lt;-<span class="st"> </span>weather %&gt;%<span class="st"> </span>
+<span class="st">  </span><span class="kw">group_by</span>(month) %&gt;%<span class="st"> </span>
+<span class="st">  </span><span class="kw">summarize</span>(<span class="dt">mean =</span> <span class="kw">mean</span>(temp, <span class="dt">na.rm =</span> <span class="ot">TRUE</span>), <span class="dt">std_dev =</span> <span class="kw">sd</span>(temp, <span class="dt">na.rm =</span> <span class="ot">TRUE</span>))
+summary_monthly_temp</code></pre></div>
+<pre><code>## # A tibble: 12 × 3
+##    month  mean std_dev
+##    &lt;dbl&gt; &lt;dbl&gt;   &lt;dbl&gt;
+## 1      1 35.64  10.185
+## 2      2 34.15   6.940
+## 3      3 39.81   6.225
+## 4      4 51.67   8.785
+## 5      5 61.59   9.609
+## 6      6 72.14   7.603
+## 7      7 80.01   7.148
+## 8      8 74.40   5.171
+## 9      9 67.43   8.476
+## 10    10 60.03   8.830
+## 11    11 45.11  10.502
+## 12    12 38.37   9.941</code></pre>
+<p>This code is identical to the previous code that created <code>summary_temp</code>, but there is an extra <code>group_by(month)</code> spliced in between. By simply grouping the <code>weather</code> data set by <code>month</code> first and then passing this new data frame into <code>summarize</code> we get a resulting data frame that shows the mean and standard deviation temperature for each month in New York City. Since each row in <code>summary_monthly_temp</code> represents a summary of different rows in <code>weather</code>, the observational units have changed.</p>
+<p>It is important to note that <code>group_by</code> doesn’t actually change the data frame. It simply sets <em>meta-data</em> (data about the data), specifically the group structure of the data. It is only after we apply the <code>summarize</code> function that the data frame actually changes. If we would like to remove this group structure meta-data, we can pipe a resulting data frame into the <code>ungroup()</code> function.</p>
+<p>We now revisit the <code>n()</code> counting summary function we introduced in the previous section. For example, suppose we’d like to get a sense for how many flights departed each of the three airports in New York City:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">by_origin &lt;-<span class="st"> </span>flights %&gt;%<span class="st"> </span>
+<span class="st">  </span><span class="kw">group_by</span>(origin) %&gt;%<span class="st"> </span>
+<span class="st">  </span><span class="kw">summarize</span>(<span class="dt">count =</span> <span class="kw">n</span>())
+by_origin</code></pre></div>
+<pre><code>## # A tibble: 3 × 2
+##   origin  count
+##    &lt;chr&gt;  &lt;int&gt;
+## 1    EWR 120835
+## 2    JFK 111279
+## 3    LGA 104662</code></pre>
+<p>We see that Newark (<code>&quot;EWR&quot;</code>) had the most flights departing in 2013 followed by <code>&quot;JFK&quot;</code> and lastly by LaGuardia (<code>&quot;LGA&quot;</code>). Note there is a subtle but important difference between <code>sum()</code> and <code>n()</code>. While <code>sum()</code> simply adds up a large set of numbers, the latter counts the number of times each of many different values occur.</p>
+<p>You are not limited to grouping by one variable! Say you wanted to know the number of flights leaving each of the three New York City airports <em>for each month</em>, we can also group by a second variable <code>month</code>: <code>group_by(origin, month)</code>. Run the following:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">by_monthly_origin &lt;-<span class="st"> </span>flights %&gt;%<span class="st"> </span>
+<span class="st">  </span><span class="kw">group_by</span>(origin, month) %&gt;%<span class="st"> </span>
+<span class="st">  </span><span class="kw">summarize</span>(<span class="dt">count =</span> <span class="kw">n</span>())
+<span class="kw">View</span>(by_monthly_origin)</code></pre></div>
+<hr />
+<div class="learncheck">
+<p>
+<strong><em>Learning check</em></strong>
+</p>
+</div>
+<p><strong>(LC5.5)</strong> Recall from Chapter <a href="4-viz.html#viz">4</a> when we looked at plots of temperatures by months in NYC. What does the standard deviation column in the <code>summary_monthly_temp</code> data frame tell us about temperatures in New York City throughout the year?</p>
+<p><strong>(LC5.6)</strong> What code would be required to get the mean and standard deviation temperature for each day in 2013 for NYC?</p>
+<p><strong>(LC5.7)</strong> Recreate <code>by_monthly_origin</code>, but instead of grouping via <code>group_by(origin, month)</code>, group variables in a different order <code>group_by(month, origin)</code>. What differs in the resulting data set?</p>
+<p><strong>(LC5.8)</strong> How could we identify how many flights left each of the three airports for each <em>carrier</em>?</p>
+<p><strong>(LC5.9)</strong> How does the <code>filter</code> operation differ from a <code>group_by</code> followed by a <code>summarize</code>?</p>
+<hr />
+</div>
+<div id="mv4-create-new-variableschange-old-variables-using-mutate" class="section level3">
+<h3><span class="header-section-number">5.2.4</span> 5MV#4: Create new variables/change old variables using mutate</h3>
+<div class="figure" style="text-align: center"><span id="fig:select"></span>
+<img src="images/mutate.png" alt="Mutate diagram from Data Wrangling with dplyr and tidyr cheatsheet" width="\textwidth" />
+<p class="caption">
+Figure 5.5: Mutate diagram from Data Wrangling with dplyr and tidyr cheatsheet
+</p>
+</div>
+<p>When looking at the <code>flights</code> data set, there are some clear additional variables that could be calculated based on the values of variables already in the data set. Passengers are often frustrated when their flight departs late, but change their mood a bit if pilots can make up some time during the flight to get them to their destination close to when they expected to land. This is commonly referred to as “gain” and we will create this variable using the <code>mutate</code> function. Note that we have also overwritten the <code>flights</code> data frame with what it was before as well as an additional variable <code>gain</code> here.</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">flights &lt;-<span class="st"> </span>flights %&gt;%<span class="st"> </span>
+<span class="st">  </span><span class="kw">mutate</span>(<span class="dt">gain =</span> arr_delay -<span class="st"> </span>dep_delay)</code></pre></div>
+<p>Why did we overwrite <code>flights</code> instead of assigning the resulting data frame to a new object, like <code>flights_with_gain</code>? As a rough rule of thumb, as long as you are not losing information that you might need later, it’s acceptable practice to overwrite data frames. However, if you overwrite existing variables and/or change the observational units, recovering the original information might prove difficult. In this case, it might make sense to create a new data object.</p>
+<p>Let’s look at summary measures of this <code>gain</code> variable and even plot it in the form of a histogram:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">gain_summary &lt;-<span class="st"> </span>flights %&gt;%<span class="st"> </span>
+<span class="st">  </span><span class="kw">summarize</span>(
+    <span class="dt">min =</span> <span class="kw">min</span>(gain, <span class="dt">na.rm =</span> <span class="ot">TRUE</span>),
+    <span class="dt">q1 =</span> <span class="kw">quantile</span>(gain, <span class="fl">0.25</span>, <span class="dt">na.rm =</span> <span class="ot">TRUE</span>),
+    <span class="dt">median =</span> <span class="kw">quantile</span>(gain, <span class="fl">0.5</span>, <span class="dt">na.rm =</span> <span class="ot">TRUE</span>),
+    <span class="dt">q3 =</span> <span class="kw">quantile</span>(gain, <span class="fl">0.75</span>, <span class="dt">na.rm =</span> <span class="ot">TRUE</span>),
+    <span class="dt">max =</span> <span class="kw">max</span>(gain, <span class="dt">na.rm =</span> <span class="ot">TRUE</span>),
+    <span class="dt">mean =</span> <span class="kw">mean</span>(gain, <span class="dt">na.rm =</span> <span class="ot">TRUE</span>),
+    <span class="dt">sd =</span> <span class="kw">sd</span>(gain, <span class="dt">na.rm =</span> <span class="ot">TRUE</span>),
+    <span class="dt">missing =</span> <span class="kw">sum</span>(<span class="kw">is.na</span>(gain))
+  )
+gain_summary</code></pre></div>
+<pre><code>## # A tibble: 1 × 8
+##     min    q1 median    q3   max  mean    sd missing
+##   &lt;dbl&gt; &lt;dbl&gt;  &lt;dbl&gt; &lt;dbl&gt; &lt;dbl&gt; &lt;dbl&gt; &lt;dbl&gt;   &lt;int&gt;
+## 1  -109   -17     -7     3   196 -5.66 18.04    9430</code></pre>
+<p>We’ve recreated the <code>summary</code> function we saw in Chapter <a href="4-viz.html#viz">4</a> here using the <code>summarize</code> function in <code>dplyr</code>.</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">ggplot</span>(<span class="dt">data =</span> flights, <span class="dt">mapping =</span> <span class="kw">aes</span>(<span class="dt">x =</span> gain)) +
+<span class="st">  </span><span class="kw">geom_histogram</span>(<span class="dt">color =</span> <span class="st">&quot;white&quot;</span>, <span class="dt">bins =</span> <span class="dv">20</span>)</code></pre></div>
+<div class="figure" style="text-align: center"><span id="fig:unnamed-chunk-45"></span>
+<img src="ismaykim_files/figure-html/unnamed-chunk-45-1.png" alt="Histogram of gain variable" width="\textwidth" />
+<p class="caption">
+Figure 5.6: Histogram of gain variable
+</p>
+</div>
+<p>We can also create multiple columns at once and even refer to columns that were just created in a new column. Hadley produces one such example in Chapter 5 of “R for Data Science” <span class="citation">(Grolemund and Wickham <a href="#ref-rds2016">2016</a>)</span>:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">flights &lt;-<span class="st"> </span>flights %&gt;%<span class="st"> </span>
+<span class="st">  </span><span class="kw">mutate</span>(
+    <span class="dt">gain =</span> arr_delay -<span class="st"> </span>dep_delay,
+    <span class="dt">hours =</span> air_time /<span class="st"> </span><span class="dv">60</span>,
+    <span class="dt">gain_per_hour =</span> gain /<span class="st"> </span>hours
+  )</code></pre></div>
+<hr />
+<div class="learncheck">
+<p>
+<strong><em>Learning check</em></strong>
+</p>
+</div>
+<p><strong>(LC5.10)</strong> What do positive values of the <code>gain</code> variable in <code>flights</code> correspond to? What about negative values? And what about a zero value?</p>
+<p><strong>(LC5.11)</strong> Could we create the <code>dep_delay</code> and <code>arr_delay</code> columns by simply subtracting <code>dep_time</code> from <code>sched_dep_time</code> and similarly for arrivals? Try the code out and explain any differences between the result and what actually appears in <code>flights</code>.</p>
+<p><strong>(LC5.12)</strong> What can we say about the distribution of <code>gain</code>? Describe it in a few sentences using the plot and the <code>gain_summary</code> data frame values.</p>
+<hr />
+</div>
+<div id="arrange" class="section level3">
+<h3><span class="header-section-number">5.2.5</span> 5MV#5: Reorder the data frame using arrange</h3>
+<p>As you may have thought about with the data frames we’ve worked with so far in the book, one of the most common things you’d like to do is sort the data frames by a specific variable in a column. Have you ever been asked to calculate a median by hand? This requires you to put the data in order from smallest to highest in value. The <code>dplyr</code> package has a function called <code>arrange</code> that we will use to sort/reorder our data according to the values of the specified variable. This is often used after we have used the <code>group_by</code> and <code>summarize</code> functions as we will see.</p>
+<p>Let’s suppose we were interested in determining the most frequent destination airports from New York City in 2013:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">freq_dest &lt;-<span class="st"> </span>flights %&gt;%<span class="st"> </span>
+<span class="st">  </span><span class="kw">group_by</span>(dest) %&gt;%<span class="st"> </span>
+<span class="st">  </span><span class="kw">summarize</span>(<span class="dt">num_flights =</span> <span class="kw">n</span>())
+freq_dest</code></pre></div>
+<pre><code>## # A tibble: 105 × 2
+##     dest num_flights
+##    &lt;chr&gt;       &lt;int&gt;
+## 1    ABQ         254
+## 2    ACK         265
+## 3    ALB         439
+## 4    ANC           8
+## 5    ATL       17215
+## 6    AUS        2439
+## 7    AVL         275
+## 8    BDL         443
+## 9    BGR         375
+## 10   BHM         297
+## # ... with 95 more rows</code></pre>
+<p>You’ll see that by default the values of <code>dest</code> are displayed in alphabetical order here. We are interested in finding those airports that appear most:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">freq_dest %&gt;%<span class="st"> </span>
+<span class="st">  </span><span class="kw">arrange</span>(num_flights)</code></pre></div>
+<pre><code>## # A tibble: 105 × 2
+##     dest num_flights
+##    &lt;chr&gt;       &lt;int&gt;
+## 1    LEX           1
+## 2    LGA           1
+## 3    ANC           8
+## 4    SBN          10
+## 5    HDN          15
+## 6    MTJ          15
+## 7    EYW          17
+## 8    PSP          19
+## 9    JAC          25
+## 10   BZN          36
+## # ... with 95 more rows</code></pre>
+<p>This is actually giving us the opposite of what we are looking for. It tells us the least frequent destination airports first. To switch the ordering to be descending instead of ascending we use the <code>desc</code> function:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">freq_dest %&gt;%<span class="st"> </span>
+<span class="st">  </span><span class="kw">arrange</span>(<span class="kw">desc</span>(num_flights))</code></pre></div>
+<pre><code>## # A tibble: 105 × 2
+##     dest num_flights
+##    &lt;chr&gt;       &lt;int&gt;
+## 1    ORD       17283
+## 2    ATL       17215
+## 3    LAX       16174
+## 4    BOS       15508
+## 5    MCO       14082
+## 6    CLT       14064
+## 7    SFO       13331
+## 8    FLL       12055
+## 9    MIA       11728
+## 10   DCA        9705
+## # ... with 95 more rows</code></pre>
+<!--Chapter on joins-->
+</div>
+</div>
+<div id="joining-data-frames" class="section level2">
+<h2><span class="header-section-number">5.3</span> Joining data frames</h2>
+<p>Another common task is joining/merging two different data sets. For example, in the <code>flights</code> data, the variable <code>carrier</code> lists the carrier code for the different flights. While <code>&quot;UA&quot;</code> and <code>&quot;AA&quot;</code> might be somewhat easy to guess for some (United and American Airlines), what are “VX”, “HA”, and “B6”? This information is provided in a separate data frame <code>airlines</code>.</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">View</span>(airlines)</code></pre></div>
+<p>We see that in <code>airlines</code>, <code>carrier</code> is the carrier code while <code>name</code> is the full name of the airline. Using this table, we can see that “VX”, “HA”, and “B6” correspond to Virgin America, Hawaiian Airlines, and JetBlue respectively. However, will we have to continually look up the carrier’s name for each flight in the <code>airlines</code> data set? No! Instead of having to manually do this, we can have R automatically do this “looking up” for us.</p>
+<p>Note that the values in the variable <code>carrier</code> in <code>flights</code> match the values in the variable <code>carrier</code> in <code>airlines</code>. In this case, we can use the variable <code>carrier</code> as a <em>key variable</em> to join/merge/match the two data frames by. Hadley and Garrett <span class="citation">(Grolemund and Wickham <a href="#ref-rds2016">2016</a>)</span> created the following diagram to help us understand how the different data sets are linked:</p>
+<div class="figure" style="text-align: center"><span id="fig:reldiagram"></span>
+<img src="images/relational-nycflights.png" alt="Data relationships in nycflights13 from R for Data Science" width="\textwidth" />
+<p class="caption">
+Figure 5.7: Data relationships in nycflights13 from R for Data Science
+</p>
+</div>
+<div id="joining-by-key-variables" class="section level3">
+<h3><span class="header-section-number">5.3.1</span> Joining by Key Variables</h3>
+<p>In both <code>flights</code> and <code>airlines</code>, the key variable we want to join/merge/match the two data frames with has the same name in both data sets: <code>carrier</code>. We make use of the <code>inner_join()</code> function to join by the variable <code>carrier</code>.</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">flights_joined &lt;-<span class="st"> </span>flights %&gt;%<span class="st"> </span>
+<span class="st">  </span><span class="kw">inner_join</span>(airlines, <span class="dt">by=</span><span class="st">&quot;carrier&quot;</span>)
+<span class="kw">View</span>(flights)
+<span class="kw">View</span>(flights_joined)</code></pre></div>
+<p>We observe that <code>flights</code> and <code>flights_joined</code> are identical except that <code>flights_joined</code> has an additional variable <code>name</code> whose values were drawn from <code>airlines</code>.</p>
+<p>A visual representation of the <code>inner_join</code> is given below <span class="citation">(Grolemund and Wickham <a href="#ref-rds2016">2016</a>)</span>:</p>
+<div class="figure" style="text-align: center"><span id="fig:ijdiagram"></span>
+<img src="images/join-inner.png" alt="Diagram of inner join from R for Data Science" width="\textwidth" />
+<p class="caption">
+Figure 5.8: Diagram of inner join from R for Data Science
+</p>
+</div>
+<p>There are more complex joins available, but the <code>inner_join</code> will solve nearly all of the problems you’ll face in our experience.</p>
+</div>
+<div id="joining-by-key-variables-with-different-names" class="section level3">
+<h3><span class="header-section-number">5.3.2</span> Joining by Key Variables with Different Names</h3>
+<p>Say instead, you are interested in all the destinations of flights from NYC in 2013 and ask yourself:</p>
+<ul>
+<li>“What cities are these airports in?”</li>
+<li>“Is <code>&quot;ORD&quot;</code> Orlando?”</li>
+<li>“Where is <code>&quot;FLL&quot;</code>?”</li>
+</ul>
+<p>The <code>airports</code> data frame contains airport codes:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">View</span>(airports)</code></pre></div>
+<p>However, looking at both the <code>airports</code> and <code>flights</code> data frames and the visual representation of the relations between the data frames in Figure <a href="5-manip.html#fig:reldiagram">5.7</a>, we see that in:</p>
+<ul>
+<li><code>airports</code> the airport code is in the variable <code>faa</code></li>
+<li><code>flights</code> the destination airport code is in the variable <code>dest</code></li>
+</ul>
+<p>So to join these two data sets, our <code>inner_join</code> operation involves a <code>by</code> argument that accounts for the different names:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">flights %&gt;%<span class="st"> </span>
+<span class="st">  </span><span class="kw">inner_join</span>(airports, <span class="dt">by =</span> <span class="kw">c</span>(<span class="st">&quot;dest&quot;</span> =<span class="st"> &quot;faa&quot;</span>))</code></pre></div>
+<p>Let’s construct the sequence of commands that computes the number of flights from NYC to each destination but also includes information about each destination airport:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">named_dests &lt;-<span class="st"> </span>flights %&gt;%
+<span class="st">  </span><span class="kw">group_by</span>(dest) %&gt;%
+<span class="st">  </span><span class="kw">summarize</span>(<span class="dt">num_flights =</span> <span class="kw">n</span>()) %&gt;%
+<span class="st">  </span><span class="kw">arrange</span>(<span class="kw">desc</span>(num_flights)) %&gt;%
+<span class="st">  </span><span class="kw">inner_join</span>(airports, <span class="dt">by =</span> <span class="kw">c</span>(<span class="st">&quot;dest&quot;</span> =<span class="st"> &quot;faa&quot;</span>)) %&gt;%
+<span class="st">  </span><span class="kw">rename</span>(<span class="dt">airport_name =</span> name)
+<span class="kw">View</span>(named_dests)</code></pre></div>
+<p>In case you didn’t know, <code>&quot;ORD&quot;</code> is the airport code of Chicago O’Hare airport and <code>&quot;FLL&quot;</code> is the main airport in Fort Lauderdale, Florida, which we can now see in our <code>named_dests</code> data frame.</p>
+<hr />
+<div class="learncheck">
+<p>
+<strong><em>Learning check</em></strong>
+</p>
+</div>
+<p><strong>(LC5.13)</strong> Looking at Figure <a href="5-manip.html#fig:reldiagram">5.7</a>, when joining <code>flights</code> and <code>weather</code>, or in other words matching the hourly weather values with each flight, why do we need to join by all of <code>year</code>, <code>month</code>, <code>day</code>, <code>hour</code>, and <code>origin</code>, and not just <code>hour</code>?</p>
+<p><strong>(LC5.14)</strong> What surprises you about the top 10 destinations from NYC in 2013?</p>
+<hr />
+<!--Subsection on Other Verbs -->
+</div>
+</div>
+<div id="optional-other-verbs" class="section level2">
+<h2><span class="header-section-number">5.4</span> Optional: Other verbs</h2>
+<div id="select" class="section level3">
+<h3><span class="header-section-number">5.4.1</span> Select variables using select</h3>
+<div class="figure" style="text-align: center"><span id="fig:selectfig"></span>
+<img src="images/select.png" alt="Select diagram from Data Wrangling with dplyr and tidyr cheatsheet" width="\textwidth" />
+<p class="caption">
+Figure 5.9: Select diagram from Data Wrangling with dplyr and tidyr cheatsheet
+</p>
+</div>
+<p>We’ve seen that the <code>flights</code> data frame in the <code>nycflights13</code> package contains many different variables. The <code>names</code> function gives a listing of all the columns in a data frame; in our case you would run <code>names(flights)</code>. You can also identify these variables by running the <code>glimpse</code> function in the <code>dplyr</code> package:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">glimpse</span>(flights)</code></pre></div>
+<p>However, say you only want to consider two of these variables, say <code>carrier</code> and <code>flight</code>. You can <code>select</code> these:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">flights %&gt;%<span class="st"> </span>
+<span class="st">  </span><span class="kw">select</span>(carrier, flight)</code></pre></div>
+<p>Another one of these variables is <code>year</code>. If you remember the original description of the <code>flights</code> data frame (or by running <code>?flights</code>), you’ll remember that this data correspond to flights in 2013 departing New York City. The <code>year</code> variable isn’t really a variable here in that it doesn’t vary… <code>flights</code> actually comes from a larger data set that covers many years. We may want to remove the <code>year</code> variable from our data set since it won’t be helpful for analysis in this case. We can deselect <code>year</code> by using the <code>-</code> sign:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">flights_no_year &lt;-<span class="st"> </span>flights %&gt;%<span class="st"> </span>
+<span class="st">  </span><span class="kw">select</span>(-year)
+<span class="kw">names</span>(flights_no_year)</code></pre></div>
+<p>Or we could specify a range of columns:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">flight_arr_times &lt;-<span class="st"> </span>flights %&gt;%<span class="st"> </span>
+<span class="st">  </span><span class="kw">select</span>(month:day, arr_time:sched_arr_time)
+flight_arr_times</code></pre></div>
+<p>The <code>select</code> function can also be used to reorder columns in combination with the <code>everything</code> helper function. Let’s suppose we’d like the <code>hour</code>, <code>minute</code>, and <code>time_hour</code> variables, which appear at the end of the <code>flights</code> data set, to actually appear immediately after the <code>day</code> variable:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">flights_reorder &lt;-<span class="st"> </span>flights %&gt;%<span class="st"> </span>
+<span class="st">  </span><span class="kw">select</span>(month:day, hour:time_hour, <span class="kw">everything</span>())
+<span class="kw">names</span>(flights_reorder)</code></pre></div>
+<p>In this case <code>everything()</code> picks up all remaining variables. Lastly, the helper functions <code>starts_with</code>, <code>ends_with</code>, and <code>contains</code> can be used to choose column names that match those conditions:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">flights_begin_a &lt;-<span class="st"> </span>flights %&gt;%<span class="st"> </span>
+<span class="st">  </span><span class="kw">select</span>(<span class="kw">starts_with</span>(<span class="st">&quot;a&quot;</span>))
+flights_begin_a</code></pre></div>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">flights_delays &lt;-<span class="st"> </span>flights %&gt;%<span class="st"> </span>
+<span class="st">  </span><span class="kw">select</span>(<span class="kw">ends_with</span>(<span class="st">&quot;delay&quot;</span>))
+flights_delays</code></pre></div>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">flights_time &lt;-<span class="st"> </span>flights %&gt;%<span class="st"> </span>
+<span class="st">  </span><span class="kw">select</span>(<span class="kw">contains</span>(<span class="st">&quot;time&quot;</span>))
+flights_time</code></pre></div>
+</div>
+<div id="rename" class="section level3">
+<h3><span class="header-section-number">5.4.2</span> Rename variables using rename</h3>
+<p>Another useful function is <code>rename</code>, which as you may suspect renames one column to another name. Suppose we wanted <code>dep_time</code> and <code>arr_time</code> to be <code>departure_time</code> and <code>arrival_time</code> instead in the <code>flights_time</code> data frame:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">flights_time_new &lt;-<span class="st"> </span>flights %&gt;%<span class="st"> </span>
+<span class="st">  </span><span class="kw">select</span>(<span class="kw">contains</span>(<span class="st">&quot;time&quot;</span>)) %&gt;%<span class="st"> </span>
+<span class="st">  </span><span class="kw">rename</span>(<span class="dt">departure_time =</span> dep_time,
+         <span class="dt">arrival_time =</span> arr_time)
+<span class="kw">names</span>(flights_time)</code></pre></div>
+<p>It’s easy to forget if the new name comes before or after the equals sign. I usually remember this as “New Before, Old After” or NBOA. You’ll receive an error if you try to do it the other way:</p>
+<pre><code>Error: Unknown variables: departure_time, arrival_time.</code></pre>
+</div>
+<div id="find-the-top-number-of-values-using-top_n" class="section level3">
+<h3><span class="header-section-number">5.4.3</span> Find the top number of values using top_n</h3>
+<p>We can also use the <code>top_n</code> function, which automatically returns the rows with the largest values of <code>num_flights</code>. We specify the top 10 airports here:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">named_dests %&gt;%<span class="st"> </span>
+<span class="st">  </span><span class="kw">top_n</span>(<span class="dt">n =</span> <span class="dv">10</span>, <span class="dt">wt =</span> num_flights)</code></pre></div>
+<p>We’ll still need to arrange this by <code>num_flights</code> though:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">named_dests  %&gt;%<span class="st"> </span>
+<span class="st">  </span><span class="kw">top_n</span>(<span class="dt">n =</span> <span class="dv">10</span>, <span class="dt">wt =</span> num_flights) %&gt;%<span class="st"> </span>
+<span class="st">  </span><span class="kw">arrange</span>(<span class="kw">desc</span>(num_flights))</code></pre></div>
+<p><strong>Note:</strong> Remember that I didn’t pull the <code>n</code> and <code>wt</code> arguments out of thin air. They can be found by using the <code>?</code> function on <code>top_n</code>.</p>
+<p>We can go one step further and tie together the <code>group_by</code> and <code>summarize</code> functions we used to find the most frequent flights:</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">ten_freq_dests &lt;-<span class="st"> </span>flights %&gt;%
+<span class="st">  </span><span class="kw">group_by</span>(dest) %&gt;%
+<span class="st">  </span><span class="kw">summarize</span>(<span class="dt">num_flights =</span> <span class="kw">n</span>()) %&gt;%
+<span class="st">  </span><span class="kw">top_n</span>(<span class="dt">n =</span> <span class="dv">10</span>) %&gt;%
+<span class="st">  </span><span class="kw">arrange</span>(<span class="kw">desc</span>(num_flights))
+<span class="kw">View</span>(ten_freq_dests)</code></pre></div>
+<hr />
+<div class="learncheck">
+<p>
+<strong><em>Learning check</em></strong>
+</p>
+</div>
+<p><strong>(LC5.15)</strong> What are some ways to select all three of the <code>dest</code>, <code>air_time</code>, and <code>distance</code> variables from <code>flights</code>? Give the code showing how to do this in at least three different ways.</p>
+<p><strong>(LC5.16)</strong> How could one use <code>starts_with</code>, <code>ends_with</code>, and <code>contains</code> to select columns from the <code>flights</code> data frame? Provide three different examples in total: one for <code>starts_with</code>, one for <code>ends_with</code>, and one for <code>contains</code>.</p>
+<p><strong>(LC5.17)</strong> Why might we want to use the <code>select</code> function on a data frame?</p>
+<p><strong>(LC5.18)</strong> Create a new data frame that shows the top 5 airports with the largest arrival delays from NYC in 2013.</p>
+<hr />
+<!--Subsection on Conclusion -->
+</div>
+</div>
+<div id="conclusion-1" class="section level2">
+<h2><span class="header-section-number">5.5</span> Conclusion</h2>
+<div id="resources-1" class="section level3">
+<h3><span class="header-section-number">5.5.1</span> Resources</h3>
+<p>As we saw with the RStudio cheatsheet on <a href="https://www.rstudio.com/wp-content/uploads/2015/12/ggplot2-cheatsheet-2.0.pdf">data visualization</a>, RStudio has also created a cheatsheet for data manipulation entitled “Data Wrangling with dplyr and tidyr” available:</p>
+<ul>
+<li>By clicking <a href="https://www.rstudio.com/wp-content/uploads/2015/02/data-wrangling-cheatsheet.pdf">here</a></li>
+<li>Or by clicking the RStudio Menu Bar -&gt; Help -&gt; Cheatsheets -&gt; “Data Manipulation with <code>dplyr</code>, <code>tidyr</code>”</li>
+</ul>
+<p>We will focus only on the <code>dplyr</code> functions in this book, but you are encouraged to also explore <code>tidyr</code> if you are presented with data that is not in the tidy format that we have specified as the preferred option for our purposes.</p>
+</div>
+<div id="script-of-r-code-1" class="section level3">
+<h3><span class="header-section-number">5.5.2</span> Script of R code</h3>
+<p>An R script file of all R code used in this chapter is available <a href="http://ismayc.github.io/moderndiver-book/05-manip.R">here</a>.</p>
+</div>
+<div id="whats-to-come-2" class="section level3">
+<h3><span class="header-section-number">5.5.3</span> What’s to come?</h3>
+<p>This concludes the <strong>Data Exploration</strong> unit of this book. You should be pretty proficient in both plotting variables (or multiple variables together) in various data sets and manipulating data as we’ve done in this chapter. You are encouraged to step back through the code in earlier chapters and make changes as you see fit based on your updated knowledge.</p>
+<p>In Chapter <a href="6-sim.html#sim">6</a>, we’ll begin to build the pieces needed to understand how this unit of <strong>Data Exploration</strong> can tie into statistical inference in the <strong>Inference</strong> part of the book. Remember that the focus throughout is on data visualization and we’ll see that next when we discuss sampling, resampling, and bootstrapping. These ideas will lead us into hypothesis testing and confidence intervals.</p>
+
+</div>
+</div>
+</div>
+
+
+
+<h3>References</h3>
+<div id="refs" class="references">
+<div id="ref-R-dplyr">
+<p>Wickham, Hadley, and Romain Francois. 2016. <em>Dplyr: A Grammar of Data Manipulation</em>. <a href="https://CRAN.R-project.org/package=dplyr" class="uri">https://CRAN.R-project.org/package=dplyr</a>.</p>
+</div>
+<div id="ref-rds2016">
+<p>Grolemund, Garrett, and Hadley Wickham. 2016. <em>R for Data Science</em>. <a href="http://r4ds.had.co.nz/" class="uri">http://r4ds.had.co.nz/</a>.</p>
+</div>
+</div>
+            </section>
+
+          </div>
+        </div>
+      </div>
+<a href="4-viz.html" class="navigation navigation-prev " aria-label="Previous page"><i class="fa fa-angle-left"></i></a>
+<a href="6-sim.html" class="navigation navigation-next " aria-label="Next page"><i class="fa fa-angle-right"></i></a>
+
+<script src="libs/gitbook-2.6.7/js/app.min.js"></script>
+<script src="libs/gitbook-2.6.7/js/lunr.js"></script>
+<script src="libs/gitbook-2.6.7/js/plugin-search.js"></script>
+<script src="libs/gitbook-2.6.7/js/plugin-sharing.js"></script>
+<script src="libs/gitbook-2.6.7/js/plugin-fontsettings.js"></script>
+<script src="libs/gitbook-2.6.7/js/plugin-bookdown.js"></script>
+<script src="libs/gitbook-2.6.7/js/jquery.highlight.js"></script>
+<script>
+require(["gitbook"], function(gitbook) {
+gitbook.start({
+"sharing": {
+"github": false,
+"facebook": true,
+"twitter": true,
+"google": false,
+"weibo": false,
+"instapper": false,
+"vk": false,
+"all": ["facebook", "google", "twitter", "weibo", "instapaper"]
+},
+"fontsettings": {
+"theme": "white",
+"family": "sans",
+"size": 2
+},
+"edit": {
+"link": "https://github.com/ismayc/moderndiver-source/edit/master/05-manip.Rmd",
+"text": "Edit"
+},
+"download": ["ismaykim.pdf"],
+"toc": {
+"collapse": "section",
+"scroll_highlight": true
+}
+});
+});
+</script>
+
+<!-- dynamically load mathjax for compatibility with self-contained -->
+<script>
+  (function () {
+    var script = document.createElement("script");
+    script.type = "text/javascript";
+    script.src  = "https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML";
+    if (location.protocol !== "file:" && /^https?:/.test(script.src))
+      script.src  = script.src.replace(/^https?:/, '');
+    document.getElementsByTagName("head")[0].appendChild(script);
+  })();
+</script>
+</body>
+
+</html>
diff --git a/docs/6-simulating-randomness-via-mosaic.html b/docs/6-sim.html
similarity index 72%
rename from docs/6-simulating-randomness-via-mosaic.html
rename to docs/6-sim.html
index f43cba854..92bbaec80 100644
--- a/docs/6-simulating-randomness-via-mosaic.html
+++ b/docs/6-sim.html
@@ -26,14 +26,14 @@
 <meta name="author" content="Chester Ismay and Albert Y. Kim">
 
 
-<meta name="date" content="2017-01-07">
+<meta name="date" content="2017-01-10">
 
   <meta name="viewport" content="width=device-width, initial-scale=1">
   <meta name="apple-mobile-web-app-capable" content="yes">
   <meta name="apple-mobile-web-app-status-bar-style" content="black">
   
   
-<link rel="prev" href="5-data-manipulation-via-dplyr.html">
+<link rel="prev" href="5-manip.html">
 <link rel="next" href="7-hypo.html">
 
 <script src="libs/jquery-2.2.3/jquery.min.js"></script>
@@ -119,9 +119,9 @@
 
 <ul class="summary">
 <li class="chapter" data-level="1" data-path="index.html"><a href="index.html"><i class="fa fa-check"></i><b>1</b> Preamble</a><ul>
-<li class="chapter" data-level="1.1" data-path="index.html"><a href="index.html#principles-of-this-book"><i class="fa fa-check"></i><b>1.1</b> Principles of this Book</a></li>
+<li class="chapter" data-level="1.1" data-path="index.html"><a href="index.html#principles-of-this-book---for-instructors"><i class="fa fa-check"></i><b>1.1</b> Principles of this Book - For Instructors</a></li>
 <li class="chapter" data-level="1.2" data-path="index.html"><a href="index.html#contribute"><i class="fa fa-check"></i><b>1.2</b> Contribute</a></li>
-<li class="chapter" data-level="1.3" data-path="index.html"><a href="index.html#getting-started"><i class="fa fa-check"></i><b>1.3</b> Getting Started</a></li>
+<li class="chapter" data-level="1.3" data-path="index.html"><a href="index.html#getting-started---for-students"><i class="fa fa-check"></i><b>1.3</b> Getting Started - For Students</a></li>
 <li class="chapter" data-level="" data-path="index.html"><a href="index.html#colophon"><i class="fa fa-check"></i>Colophon</a></li>
 </ul></li>
 <li class="chapter" data-level="2" data-path="2-intro.html"><a href="2-intro.html"><i class="fa fa-check"></i><b>2</b> Introduction</a><ul>
@@ -133,87 +133,100 @@
 </ul></li>
 <li class="part"><span><b>I Data Exploration</b></span></li>
 <li class="chapter" data-level="3" data-path="3-tidy.html"><a href="3-tidy.html"><i class="fa fa-check"></i><b>3</b> Tidy Data</a><ul>
+<li class="chapter" data-level="" data-path="3-tidy.html"><a href="3-tidy.html#needed-packages"><i class="fa fa-check"></i>Needed packages</a></li>
 <li class="chapter" data-level="3.1" data-path="3-tidy.html"><a href="3-tidy.html#what-is-tidy-data"><i class="fa fa-check"></i><b>3.1</b> What is tidy data?</a></li>
-<li class="chapter" data-level="3.2" data-path="3-tidy.html"><a href="3-tidy.html#the-nycflights13-datasets"><i class="fa fa-check"></i><b>3.2</b> The <code>nycflights13</code> datasets</a></li>
-<li class="chapter" data-level="3.3" data-path="3-tidy.html"><a href="3-tidy.html#how-is-flights-tidy"><i class="fa fa-check"></i><b>3.3</b> How is <code>flights</code> tidy?</a></li>
+<li class="chapter" data-level="3.2" data-path="3-tidy.html"><a href="3-tidy.html#datasets-in-the-nycflights13-package"><i class="fa fa-check"></i><b>3.2</b> Datasets in the <code>nycflights13</code> package</a></li>
+<li class="chapter" data-level="3.3" data-path="3-tidy.html"><a href="3-tidy.html#how-is-flights-tidy"><i class="fa fa-check"></i><b>3.3</b> How is <code>flights</code> tidy?</a><ul>
+<li class="chapter" data-level="3.3.1" data-path="3-tidy.html"><a href="3-tidy.html#identification-variables"><i class="fa fa-check"></i><b>3.3.1</b> Identification variables</a></li>
+</ul></li>
 <li class="chapter" data-level="3.4" data-path="3-tidy.html"><a href="3-tidy.html#normal-forms-of-data"><i class="fa fa-check"></i><b>3.4</b> Normal forms of data</a></li>
 <li class="chapter" data-level="3.5" data-path="3-tidy.html"><a href="3-tidy.html#whats-to-come"><i class="fa fa-check"></i><b>3.5</b> What’s to come?</a></li>
 </ul></li>
-<li class="chapter" data-level="4" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html"><i class="fa fa-check"></i><b>4</b> Data Visualization via <code id="viz">ggplot2</code></a><ul>
-<li class="chapter" data-level="" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#needed-packages"><i class="fa fa-check"></i>Needed packages</a></li>
-<li class="chapter" data-level="4.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#grammarofgraphics"><i class="fa fa-check"></i><b>4.1</b> The Grammar of Graphics</a><ul>
-<li class="chapter" data-level="4.1.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#components-of-grammar"><i class="fa fa-check"></i><b>4.1.1</b> Components of Grammar</a></li>
-<li class="chapter" data-level="4.1.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#napoleans-march-on-moscow"><i class="fa fa-check"></i><b>4.1.2</b> Napolean’s March on Moscow</a></li>
-<li class="chapter" data-level="4.1.3" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#other-components-of-the-grammar"><i class="fa fa-check"></i><b>4.1.3</b> Other Components of the Grammar</a></li>
-<li class="chapter" data-level="4.1.4" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#the-ggplot2-package"><i class="fa fa-check"></i><b>4.1.4</b> The <code>ggplot2</code> Package</a></li>
-</ul></li>
-<li class="chapter" data-level="4.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#five-named-graphs---the-5ng"><i class="fa fa-check"></i><b>4.2</b> Five Named Graphs - The 5NG</a></li>
-<li class="chapter" data-level="4.3" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#scatterplots"><i class="fa fa-check"></i><b>4.3</b> 5NG#1: Scatter-plots</a><ul>
-<li class="chapter" data-level="4.3.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#scatter-plots-via-geom_point"><i class="fa fa-check"></i><b>4.3.1</b> Scatter-plots via <code id="geompoint">geom_point</code></a></li>
-<li class="chapter" data-level="4.3.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#over-plotting"><i class="fa fa-check"></i><b>4.3.2</b> Over-Plotting</a></li>
-<li class="chapter" data-level="4.3.3" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#summary"><i class="fa fa-check"></i><b>4.3.3</b> Summary</a></li>
-</ul></li>
-<li class="chapter" data-level="4.4" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#linegraphs"><i class="fa fa-check"></i><b>4.4</b> 5NG#2: Line-graphs</a><ul>
-<li class="chapter" data-level="4.4.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#line-graphs-via-geom_line"><i class="fa fa-check"></i><b>4.4.1</b> Line-graphs via <code id="geomline">geom_line</code></a></li>
-<li class="chapter" data-level="4.4.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#summary-1"><i class="fa fa-check"></i><b>4.4.2</b> Summary</a></li>
-</ul></li>
-<li class="chapter" data-level="4.5" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#histograms"><i class="fa fa-check"></i><b>4.5</b> 5NG#3: Histograms</a><ul>
-<li class="chapter" data-level="4.5.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#histograms-via-geom_histogram"><i class="fa fa-check"></i><b>4.5.1</b> Histograms via <code id="geomhistogram">geom_histogram</code></a></li>
-<li class="chapter" data-level="4.5.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#adjustbins"><i class="fa fa-check"></i><b>4.5.2</b> Adjusting the Bins</a></li>
-<li class="chapter" data-level="4.5.3" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#summary-2"><i class="fa fa-check"></i><b>4.5.3</b> Summary</a></li>
-</ul></li>
-<li class="chapter" data-level="4.6" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#facets"><i class="fa fa-check"></i><b>4.6</b> Facets</a></li>
-<li class="chapter" data-level="4.7" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#ng4-boxplots"><i class="fa fa-check"></i><b>4.7</b> 5NG#4: Boxplots</a><ul>
-<li class="chapter" data-level="4.7.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#boxplots-via-geom_boxplot"><i class="fa fa-check"></i><b>4.7.1</b> Boxplots via <code id="geomboxplot">geom_boxplot</code></a></li>
-<li class="chapter" data-level="4.7.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#summary-3"><i class="fa fa-check"></i><b>4.7.2</b> Summary</a></li>
-</ul></li>
-<li class="chapter" data-level="4.8" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#geombar"><i class="fa fa-check"></i><b>4.8</b> 5NG#5: Barplots</a><ul>
-<li class="chapter" data-level="4.8.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#barplots-via-geom_bar"><i class="fa fa-check"></i><b>4.8.1</b> Barplots via <code>geom_bar</code></a></li>
-<li class="chapter" data-level="4.8.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#must-avoid-pie-charts"><i class="fa fa-check"></i><b>4.8.2</b> Must avoid pie charts!</a></li>
-<li class="chapter" data-level="4.8.3" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#using-barplots-to-compare-two-variables"><i class="fa fa-check"></i><b>4.8.3</b> Using barplots to compare two variables</a></li>
-<li class="chapter" data-level="4.8.4" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#summary-4"><i class="fa fa-check"></i><b>4.8.4</b> Summary</a></li>
-</ul></li>
-<li class="chapter" data-level="4.9" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#conclusion"><i class="fa fa-check"></i><b>4.9</b> Conclusion</a><ul>
-<li class="chapter" data-level="4.9.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#whats-to-come-1"><i class="fa fa-check"></i><b>4.9.1</b> What’s to come?</a></li>
-<li class="chapter" data-level="4.9.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#script-of-r-code"><i class="fa fa-check"></i><b>4.9.2</b> Script of R code</a></li>
-</ul></li>
-</ul></li>
-<li class="chapter" data-level="5" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html"><i class="fa fa-check"></i><b>5</b> Data Manipulation via <code id="manip">dplyr</code></a><ul>
-<li class="chapter" data-level="" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#needed-packages-1"><i class="fa fa-check"></i>Needed packages</a></li>
-<li class="chapter" data-level="5.1" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#the-pipe"><i class="fa fa-check"></i><b>5.1</b> The pipe <code>%&gt;%</code></a></li>
-<li class="chapter" data-level="5.2" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#four-main-verbs---the-4mv"><i class="fa fa-check"></i><b>5.2</b> Four Main Verbs - The 4MV</a><ul>
-<li class="chapter" data-level="5.2.1" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#filter-observations-using-filter"><i class="fa fa-check"></i><b>5.2.1</b> Filter observations using <code id="filter">filter</code></a></li>
-<li class="chapter" data-level="5.2.2" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#summarize-variables-using-summarize"><i class="fa fa-check"></i><b>5.2.2</b> Summarize variables using <code>summarize</code></a></li>
-<li class="chapter" data-level="5.2.3" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#create-new-variableschange-old-variables-using-mutate"><i class="fa fa-check"></i><b>5.2.3</b> Create new variables/change old variables using <code>mutate</code></a></li>
-<li class="chapter" data-level="5.2.4" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#reorder-the-data-frame-using-arrange"><i class="fa fa-check"></i><b>5.2.4</b> Reorder the data frame using <code id="arrange">arrange</code></a></li>
-</ul></li>
-<li class="chapter" data-level="5.3" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#other-verbs"><i class="fa fa-check"></i><b>5.3</b> Other verbs</a><ul>
-<li class="chapter" data-level="5.3.1" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#select-variables-using-select"><i class="fa fa-check"></i><b>5.3.1</b> Select variables using <code id="select">select</code></a></li>
-<li class="chapter" data-level="5.3.2" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#rename-variables-using-rename"><i class="fa fa-check"></i><b>5.3.2</b> Rename variables using <code id="rename">rename</code></a></li>
-<li class="chapter" data-level="5.3.3" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#find-the-top-number-of-values-using-top_n"><i class="fa fa-check"></i><b>5.3.3</b> Find the top number of values using <code>top_n</code></a></li>
-</ul></li>
-<li class="chapter" data-level="5.4" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#joiningmerging-data-frames"><i class="fa fa-check"></i><b>5.4</b> Joining/merging data frames</a></li>
-<li class="chapter" data-level="5.5" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#script-of-r-code-1"><i class="fa fa-check"></i><b>5.5</b> Script of R code</a></li>
-<li class="chapter" data-level="5.6" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#whats-to-come-2"><i class="fa fa-check"></i><b>5.6</b> What’s to come?</a></li>
+<li class="chapter" data-level="4" data-path="4-viz.html"><a href="4-viz.html"><i class="fa fa-check"></i><b>4</b> Data Visualization via ggplot2</a><ul>
+<li class="chapter" data-level="" data-path="4-viz.html"><a href="4-viz.html#needed-packages-1"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="4.1" data-path="4-viz.html"><a href="4-viz.html#grammarofgraphics"><i class="fa fa-check"></i><b>4.1</b> The Grammar of Graphics</a><ul>
+<li class="chapter" data-level="4.1.1" data-path="4-viz.html"><a href="4-viz.html#components-of-grammar"><i class="fa fa-check"></i><b>4.1.1</b> Components of Grammar</a></li>
+<li class="chapter" data-level="4.1.2" data-path="4-viz.html"><a href="4-viz.html#napoleans-march-on-moscow"><i class="fa fa-check"></i><b>4.1.2</b> Napoleon’s March on Moscow</a></li>
+<li class="chapter" data-level="4.1.3" data-path="4-viz.html"><a href="4-viz.html#other-components-of-the-grammar"><i class="fa fa-check"></i><b>4.1.3</b> Other Components of the Grammar</a></li>
+<li class="chapter" data-level="4.1.4" data-path="4-viz.html"><a href="4-viz.html#the-ggplot2-package"><i class="fa fa-check"></i><b>4.1.4</b> The ggplot2 Package</a></li>
+</ul></li>
+<li class="chapter" data-level="4.2" data-path="4-viz.html"><a href="4-viz.html#five-named-graphs---the-5ng"><i class="fa fa-check"></i><b>4.2</b> Five Named Graphs - The 5NG</a></li>
+<li class="chapter" data-level="4.3" data-path="4-viz.html"><a href="4-viz.html#scatterplots"><i class="fa fa-check"></i><b>4.3</b> 5NG#1: Scatter-plots</a><ul>
+<li class="chapter" data-level="4.3.1" data-path="4-viz.html"><a href="4-viz.html#geompoint"><i class="fa fa-check"></i><b>4.3.1</b> Scatter-plots via geom_point</a></li>
+<li class="chapter" data-level="4.3.2" data-path="4-viz.html"><a href="4-viz.html#over-plotting"><i class="fa fa-check"></i><b>4.3.2</b> Over-Plotting</a></li>
+<li class="chapter" data-level="4.3.3" data-path="4-viz.html"><a href="4-viz.html#summary"><i class="fa fa-check"></i><b>4.3.3</b> Summary</a></li>
+</ul></li>
+<li class="chapter" data-level="4.4" data-path="4-viz.html"><a href="4-viz.html#linegraphs"><i class="fa fa-check"></i><b>4.4</b> 5NG#2: Line-graphs</a><ul>
+<li class="chapter" data-level="4.4.1" data-path="4-viz.html"><a href="4-viz.html#geomline"><i class="fa fa-check"></i><b>4.4.1</b> Line-graphs via geom_line</a></li>
+<li class="chapter" data-level="4.4.2" data-path="4-viz.html"><a href="4-viz.html#summary-1"><i class="fa fa-check"></i><b>4.4.2</b> Summary</a></li>
+</ul></li>
+<li class="chapter" data-level="4.5" data-path="4-viz.html"><a href="4-viz.html#histograms"><i class="fa fa-check"></i><b>4.5</b> 5NG#3: Histograms</a><ul>
+<li class="chapter" data-level="4.5.1" data-path="4-viz.html"><a href="4-viz.html#geomhistogram"><i class="fa fa-check"></i><b>4.5.1</b> Histograms via geom_histogram</a></li>
+<li class="chapter" data-level="4.5.2" data-path="4-viz.html"><a href="4-viz.html#adjustbins"><i class="fa fa-check"></i><b>4.5.2</b> Adjusting the Bins</a></li>
+<li class="chapter" data-level="4.5.3" data-path="4-viz.html"><a href="4-viz.html#summary-2"><i class="fa fa-check"></i><b>4.5.3</b> Summary</a></li>
+</ul></li>
+<li class="chapter" data-level="4.6" data-path="4-viz.html"><a href="4-viz.html#facets"><i class="fa fa-check"></i><b>4.6</b> Facets</a></li>
+<li class="chapter" data-level="4.7" data-path="4-viz.html"><a href="4-viz.html#ng4-boxplots"><i class="fa fa-check"></i><b>4.7</b> 5NG#4: Boxplots</a><ul>
+<li class="chapter" data-level="4.7.1" data-path="4-viz.html"><a href="4-viz.html#geomboxplot"><i class="fa fa-check"></i><b>4.7.1</b> Boxplots via geom_boxplot</a></li>
+<li class="chapter" data-level="4.7.2" data-path="4-viz.html"><a href="4-viz.html#summary-3"><i class="fa fa-check"></i><b>4.7.2</b> Summary</a></li>
+</ul></li>
+<li class="chapter" data-level="4.8" data-path="4-viz.html"><a href="4-viz.html#geombar"><i class="fa fa-check"></i><b>4.8</b> 5NG#5: Barplots</a><ul>
+<li class="chapter" data-level="4.8.1" data-path="4-viz.html"><a href="4-viz.html#barplots-via-geom_bar"><i class="fa fa-check"></i><b>4.8.1</b> Barplots via geom_bar</a></li>
+<li class="chapter" data-level="4.8.2" data-path="4-viz.html"><a href="4-viz.html#must-avoid-pie-charts"><i class="fa fa-check"></i><b>4.8.2</b> Must avoid pie charts!</a></li>
+<li class="chapter" data-level="4.8.3" data-path="4-viz.html"><a href="4-viz.html#using-barplots-to-compare-two-variables"><i class="fa fa-check"></i><b>4.8.3</b> Using barplots to compare two variables</a></li>
+<li class="chapter" data-level="4.8.4" data-path="4-viz.html"><a href="4-viz.html#summary-4"><i class="fa fa-check"></i><b>4.8.4</b> Summary</a></li>
+</ul></li>
+<li class="chapter" data-level="4.9" data-path="4-viz.html"><a href="4-viz.html#conclusion"><i class="fa fa-check"></i><b>4.9</b> Conclusion</a><ul>
+<li class="chapter" data-level="4.9.1" data-path="4-viz.html"><a href="4-viz.html#resources"><i class="fa fa-check"></i><b>4.9.1</b> Resources</a></li>
+<li class="chapter" data-level="4.9.2" data-path="4-viz.html"><a href="4-viz.html#script-of-r-code"><i class="fa fa-check"></i><b>4.9.2</b> Script of R code</a></li>
+<li class="chapter" data-level="4.9.3" data-path="4-viz.html"><a href="4-viz.html#whats-to-come-1"><i class="fa fa-check"></i><b>4.9.3</b> What’s to come?</a></li>
+</ul></li>
+</ul></li>
+<li class="chapter" data-level="5" data-path="5-manip.html"><a href="5-manip.html"><i class="fa fa-check"></i><b>5</b> Data Manipulation via dplyr</a><ul>
+<li class="chapter" data-level="" data-path="5-manip.html"><a href="5-manip.html#needed-packages-2"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="5.1" data-path="5-manip.html"><a href="5-manip.html#the-pipe"><i class="fa fa-check"></i><b>5.1</b> The pipe <code>%&gt;%</code></a></li>
+<li class="chapter" data-level="5.2" data-path="5-manip.html"><a href="5-manip.html#five-main-verbs---the-5mv"><i class="fa fa-check"></i><b>5.2</b> Five Main Verbs - The 5MV</a><ul>
+<li class="chapter" data-level="5.2.1" data-path="5-manip.html"><a href="5-manip.html#filter"><i class="fa fa-check"></i><b>5.2.1</b> 5MV#1: Filter observations using filter</a></li>
+<li class="chapter" data-level="5.2.2" data-path="5-manip.html"><a href="5-manip.html#mv2-summarize-variables-using-summarize"><i class="fa fa-check"></i><b>5.2.2</b> 5MV#2: Summarize variables using summarize</a></li>
+<li class="chapter" data-level="5.2.3" data-path="5-manip.html"><a href="5-manip.html#mv3-group-rows-using-group_by"><i class="fa fa-check"></i><b>5.2.3</b> 5MV#3: Group rows using group_by</a></li>
+<li class="chapter" data-level="5.2.4" data-path="5-manip.html"><a href="5-manip.html#mv4-create-new-variableschange-old-variables-using-mutate"><i class="fa fa-check"></i><b>5.2.4</b> 5MV#4: Create new variables/change old variables using mutate</a></li>
+<li class="chapter" data-level="5.2.5" data-path="5-manip.html"><a href="5-manip.html#arrange"><i class="fa fa-check"></i><b>5.2.5</b> 5MV#5: Reorder the data frame using arrange</a></li>
+</ul></li>
+<li class="chapter" data-level="5.3" data-path="5-manip.html"><a href="5-manip.html#joining-data-frames"><i class="fa fa-check"></i><b>5.3</b> Joining data frames</a><ul>
+<li class="chapter" data-level="5.3.1" data-path="5-manip.html"><a href="5-manip.html#joining-by-key-variables"><i class="fa fa-check"></i><b>5.3.1</b> Joining by Key Variables</a></li>
+<li class="chapter" data-level="5.3.2" data-path="5-manip.html"><a href="5-manip.html#joining-by-key-variables-with-different-names"><i class="fa fa-check"></i><b>5.3.2</b> Joining by Key Variables with Different Names</a></li>
+</ul></li>
+<li class="chapter" data-level="5.4" data-path="5-manip.html"><a href="5-manip.html#optional-other-verbs"><i class="fa fa-check"></i><b>5.4</b> Optional: Other verbs</a><ul>
+<li class="chapter" data-level="5.4.1" data-path="5-manip.html"><a href="5-manip.html#select"><i class="fa fa-check"></i><b>5.4.1</b> Select variables using select</a></li>
+<li class="chapter" data-level="5.4.2" data-path="5-manip.html"><a href="5-manip.html#rename"><i class="fa fa-check"></i><b>5.4.2</b> Rename variables using rename</a></li>
+<li class="chapter" data-level="5.4.3" data-path="5-manip.html"><a href="5-manip.html#find-the-top-number-of-values-using-top_n"><i class="fa fa-check"></i><b>5.4.3</b> Find the top number of values using top_n</a></li>
+</ul></li>
+<li class="chapter" data-level="5.5" data-path="5-manip.html"><a href="5-manip.html#conclusion-1"><i class="fa fa-check"></i><b>5.5</b> Conclusion</a><ul>
+<li class="chapter" data-level="5.5.1" data-path="5-manip.html"><a href="5-manip.html#resources-1"><i class="fa fa-check"></i><b>5.5.1</b> Resources</a></li>
+<li class="chapter" data-level="5.5.2" data-path="5-manip.html"><a href="5-manip.html#script-of-r-code-1"><i class="fa fa-check"></i><b>5.5.2</b> Script of R code</a></li>
+<li class="chapter" data-level="5.5.3" data-path="5-manip.html"><a href="5-manip.html#whats-to-come-2"><i class="fa fa-check"></i><b>5.5.3</b> What’s to come?</a></li>
+</ul></li>
 </ul></li>
 <li class="part"><span><b>II Inference</b></span></li>
-<li class="chapter" data-level="6" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html"><i class="fa fa-check"></i><b>6</b> Simulating Randomness via <code id="sim">mosaic</code></a><ul>
-<li class="chapter" data-level="" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#needed-packages-2"><i class="fa fa-check"></i>Needed packages</a></li>
-<li class="chapter" data-level="6.1" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#random-sampling"><i class="fa fa-check"></i><b>6.1</b> Random sampling</a><ul>
-<li class="chapter" data-level="6.1.1" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#tasting-soup"><i class="fa fa-check"></i><b>6.1.1</b> Tasting soup</a></li>
-<li class="chapter" data-level="6.1.2" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#common-terms"><i class="fa fa-check"></i><b>6.1.2</b> Common terms</a></li>
+<li class="chapter" data-level="6" data-path="6-sim.html"><a href="6-sim.html"><i class="fa fa-check"></i><b>6</b> Simulating Randomness via mosaic</a><ul>
+<li class="chapter" data-level="" data-path="6-sim.html"><a href="6-sim.html#needed-packages-3"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="6.1" data-path="6-sim.html"><a href="6-sim.html#random-sampling"><i class="fa fa-check"></i><b>6.1</b> Random sampling</a><ul>
+<li class="chapter" data-level="6.1.1" data-path="6-sim.html"><a href="6-sim.html#tasting-soup"><i class="fa fa-check"></i><b>6.1.1</b> Tasting soup</a></li>
+<li class="chapter" data-level="6.1.2" data-path="6-sim.html"><a href="6-sim.html#common-terms"><i class="fa fa-check"></i><b>6.1.2</b> Common terms</a></li>
 </ul></li>
-<li class="chapter" data-level="6.2" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#visualizing-sampling"><i class="fa fa-check"></i><b>6.2</b> Visualizing sampling</a><ul>
-<li class="chapter" data-level="6.2.1" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#sampling-distribution"><i class="fa fa-check"></i><b>6.2.1</b> Sampling distribution</a></li>
-<li class="chapter" data-level="6.2.2" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#repeated-sampling-via-do"><i class="fa fa-check"></i><b>6.2.2</b> Repeated sampling via <code>do</code></a></li>
+<li class="chapter" data-level="6.2" data-path="6-sim.html"><a href="6-sim.html#visualizing-sampling"><i class="fa fa-check"></i><b>6.2</b> Visualizing sampling</a><ul>
+<li class="chapter" data-level="6.2.1" data-path="6-sim.html"><a href="6-sim.html#sampling-distribution"><i class="fa fa-check"></i><b>6.2.1</b> Sampling distribution</a></li>
+<li class="chapter" data-level="6.2.2" data-path="6-sim.html"><a href="6-sim.html#repeated-sampling-via-do"><i class="fa fa-check"></i><b>6.2.2</b> Repeated sampling via <code>do</code></a></li>
+</ul></li>
+<li class="chapter" data-level="6.3" data-path="6-sim.html"><a href="6-sim.html#simulation"><i class="fa fa-check"></i><b>6.3</b> Simulation</a></li>
+<li class="chapter" data-level="6.4" data-path="6-sim.html"><a href="6-sim.html#review-of-mosaic-simulation-functions"><i class="fa fa-check"></i><b>6.4</b> Review of <code>mosaic</code> simulation functions</a></li>
+<li class="chapter" data-level="6.5" data-path="6-sim.html"><a href="6-sim.html#conclusion-2"><i class="fa fa-check"></i><b>6.5</b> Conclusion</a><ul>
+<li class="chapter" data-level="6.5.1" data-path="6-sim.html"><a href="6-sim.html#script-of-r-code-2"><i class="fa fa-check"></i><b>6.5.1</b> Script of R code</a></li>
+<li class="chapter" data-level="6.5.2" data-path="6-sim.html"><a href="6-sim.html#whats-to-come-3"><i class="fa fa-check"></i><b>6.5.2</b> What’s to come?</a></li>
 </ul></li>
-<li class="chapter" data-level="6.3" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#simulation"><i class="fa fa-check"></i><b>6.3</b> Simulation</a></li>
-<li class="chapter" data-level="6.4" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#review-of-mosaic-simulation-functions"><i class="fa fa-check"></i><b>6.4</b> Review of <code>mosaic</code> simulation functions</a></li>
-<li class="chapter" data-level="6.5" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#script-of-r-code-2"><i class="fa fa-check"></i><b>6.5</b> Script of R code</a></li>
-<li class="chapter" data-level="6.6" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#whats-to-come-3"><i class="fa fa-check"></i><b>6.6</b> What’s to come?</a></li>
 </ul></li>
 <li class="chapter" data-level="7" data-path="7-hypo.html"><a href="7-hypo.html"><i class="fa fa-check"></i><b>7</b> Hypothesis Testing</a><ul>
-<li class="chapter" data-level="" data-path="7-hypo.html"><a href="7-hypo.html#needed-packages-3"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="" data-path="7-hypo.html"><a href="7-hypo.html#needed-packages-4"><i class="fa fa-check"></i>Needed packages</a></li>
 <li class="chapter" data-level="7.1" data-path="7-hypo.html"><a href="7-hypo.html#when-inference-is-not-needed"><i class="fa fa-check"></i><b>7.1</b> When Inference Is Not Needed</a></li>
 <li class="chapter" data-level="7.2" data-path="7-hypo.html"><a href="7-hypo.html#basics-of-hypothesis-testing"><i class="fa fa-check"></i><b>7.2</b> Basics of Hypothesis Testing</a></li>
 <li class="chapter" data-level="7.3" data-path="7-hypo.html"><a href="7-hypo.html#trial"><i class="fa fa-check"></i><b>7.3</b> Criminal trial analogy</a><ul>
@@ -249,44 +262,50 @@
 <li class="chapter" data-level="7.8.1" data-path="7-hypo.html"><a href="7-hypo.html#example-t-test-for-two-independent-samples"><i class="fa fa-check"></i><b>7.8.1</b> EXAMPLE: <span class="math inline">\(t\)</span>-test for two independent samples</a></li>
 <li class="chapter" data-level="7.8.2" data-path="7-hypo.html"><a href="7-hypo.html#conditions-for-t-test"><i class="fa fa-check"></i><b>7.8.2</b> Conditions for t-test</a></li>
 </ul></li>
-<li class="chapter" data-level="7.9" data-path="7-hypo.html"><a href="7-hypo.html#whats-to-come-4"><i class="fa fa-check"></i><b>7.9</b> What’s to come?</a></li>
+<li class="chapter" data-level="7.9" data-path="7-hypo.html"><a href="7-hypo.html#conclusion-3"><i class="fa fa-check"></i><b>7.9</b> Conclusion</a><ul>
+<li class="chapter" data-level="7.9.1" data-path="7-hypo.html"><a href="7-hypo.html#script-of-r-code-3"><i class="fa fa-check"></i><b>7.9.1</b> Script of R code</a></li>
+<li class="chapter" data-level="7.9.2" data-path="7-hypo.html"><a href="7-hypo.html#whats-to-come-4"><i class="fa fa-check"></i><b>7.9.2</b> What’s to come?</a></li>
+</ul></li>
 </ul></li>
 <li class="chapter" data-level="8" data-path="8-ci.html"><a href="8-ci.html"><i class="fa fa-check"></i><b>8</b> Confidence Intervals</a><ul>
-<li class="chapter" data-level="" data-path="8-ci.html"><a href="8-ci.html#needed-packages-4"><i class="fa fa-check"></i>Needed packages</a></li>
-<li class="chapter" data-level="8.1" data-path="8-ci.html"><a href="8-ci.html#bootstrapping"><i class="fa fa-check"></i><b>8.1</b> Bootstrapping</a></li>
-<li class="chapter" data-level="8.2" data-path="8-ci.html"><a href="8-ci.html#mean_rating"><i class="fa fa-check"></i><b>8.2</b> mean_rating</a><ul>
-<li class="chapter" data-level="8.2.1" data-path="8-ci.html"><a href="8-ci.html#review-of-bootstrapping"><i class="fa fa-check"></i><b>8.2.1</b> Review of Bootstrapping</a></li>
-</ul></li>
-<li class="chapter" data-level="8.3" data-path="8-ci.html"><a href="8-ci.html#relation-to-hypothesis-testing"><i class="fa fa-check"></i><b>8.3</b> Relation to hypothesis testing</a></li>
-<li class="chapter" data-level="8.4" data-path="8-ci.html"><a href="8-ci.html#effect-size"><i class="fa fa-check"></i><b>8.4</b> Effect size</a></li>
-<li class="chapter" data-level="8.5" data-path="8-ci.html"><a href="8-ci.html#script-of-r-code-3"><i class="fa fa-check"></i><b>8.5</b> Script of R code</a></li>
-<li class="chapter" data-level="8.6" data-path="8-ci.html"><a href="8-ci.html#whats-to-come-5"><i class="fa fa-check"></i><b>8.6</b> What’s to come?</a></li>
-</ul></li>
-<li class="chapter" data-level="9" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html"><i class="fa fa-check"></i><b>9</b> Regression via <code id="regress">broom</code></a><ul>
-<li class="chapter" data-level="" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#needed-packages-5"><i class="fa fa-check"></i>Needed packages</a></li>
-<li class="chapter" data-level="9.1" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#example-alaskan-airlines-delays"><i class="fa fa-check"></i><b>9.1</b> EXAMPLE: Alaskan Airlines delays</a></li>
-<li class="chapter" data-level="9.2" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#correlation"><i class="fa fa-check"></i><b>9.2</b> Correlation</a><ul>
-<li class="chapter" data-level="9.2.1" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#correlation-does-not-imply-causation"><i class="fa fa-check"></i><b>9.2.1</b> Correlation does not imply causation</a></li>
-</ul></li>
-<li class="chapter" data-level="9.3" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#linear-regression"><i class="fa fa-check"></i><b>9.3</b> Linear regression</a><ul>
-<li class="chapter" data-level="9.3.1" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#understanding-linear-regression-basics"><i class="fa fa-check"></i><b>9.3.1</b> Understanding linear regression basics</a></li>
-<li class="chapter" data-level="9.3.2" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#the-equation-of-the-line"><i class="fa fa-check"></i><b>9.3.2</b> The equation of the line</a></li>
-<li class="chapter" data-level="9.3.3" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#interpretting-the-slope"><i class="fa fa-check"></i><b>9.3.3</b> Interpretting the slope</a></li>
-<li class="chapter" data-level="9.3.4" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#predicting-values"><i class="fa fa-check"></i><b>9.3.4</b> Predicting values</a></li>
-</ul></li>
-<li class="chapter" data-level="9.4" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#inference-for-regression"><i class="fa fa-check"></i><b>9.4</b> Inference for regression</a><ul>
-<li class="chapter" data-level="9.4.1" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#data-2"><i class="fa fa-check"></i><b>9.4.1</b> Data</a></li>
-<li class="chapter" data-level="9.4.2" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#test-statistic-delta-2"><i class="fa fa-check"></i><b>9.4.2</b> Test Statistic <span class="math inline">\(\delta\)</span></a></li>
-<li class="chapter" data-level="9.4.3" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#observed-effect-delta-2"><i class="fa fa-check"></i><b>9.4.3</b> Observed effect <span class="math inline">\(\delta^*\)</span></a></li>
-<li class="chapter" data-level="9.4.4" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#model-of-h_0-2"><i class="fa fa-check"></i><b>9.4.4</b> Model of <span class="math inline">\(H_0\)</span></a></li>
-<li class="chapter" data-level="9.4.5" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#simulated-data-2"><i class="fa fa-check"></i><b>9.4.5</b> Simulated Data</a></li>
-<li class="chapter" data-level="9.4.6" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#distribution-of-delta-under-h_0-2"><i class="fa fa-check"></i><b>9.4.6</b> Distribution of <span class="math inline">\(\delta\)</span> under <span class="math inline">\(H_0\)</span></a></li>
-<li class="chapter" data-level="9.4.7" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#the-p-value-2"><i class="fa fa-check"></i><b>9.4.7</b> The p-value</a></li>
-</ul></li>
-<li class="chapter" data-level="9.5" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#resid"><i class="fa fa-check"></i><b>9.5</b> Residual analysis</a></li>
-<li class="chapter" data-level="9.6" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#conditions-for-regression"><i class="fa fa-check"></i><b>9.6</b> Conditions for regression</a></li>
-<li class="chapter" data-level="9.7" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#script-of-r-code-4"><i class="fa fa-check"></i><b>9.7</b> Script of R code</a></li>
-<li class="chapter" data-level="9.8" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#whats-to-come-6"><i class="fa fa-check"></i><b>9.8</b> What’s to come?</a></li>
+<li class="chapter" data-level="" data-path="8-ci.html"><a href="8-ci.html#needed-packages-5"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="8.1" data-path="8-ci.html"><a href="8-ci.html#bootstrapping"><i class="fa fa-check"></i><b>8.1</b> Bootstrapping</a><ul>
+<li class="chapter" data-level="8.1.1" data-path="8-ci.html"><a href="8-ci.html#review-of-bootstrapping"><i class="fa fa-check"></i><b>8.1.1</b> Review of Bootstrapping</a></li>
+</ul></li>
+<li class="chapter" data-level="8.2" data-path="8-ci.html"><a href="8-ci.html#relation-to-hypothesis-testing"><i class="fa fa-check"></i><b>8.2</b> Relation to hypothesis testing</a></li>
+<li class="chapter" data-level="8.3" data-path="8-ci.html"><a href="8-ci.html#effect-size"><i class="fa fa-check"></i><b>8.3</b> Effect size</a></li>
+<li class="chapter" data-level="8.4" data-path="8-ci.html"><a href="8-ci.html#conclusion-4"><i class="fa fa-check"></i><b>8.4</b> Conclusion</a><ul>
+<li class="chapter" data-level="8.4.1" data-path="8-ci.html"><a href="8-ci.html#script-of-r-code-4"><i class="fa fa-check"></i><b>8.4.1</b> Script of R code</a></li>
+<li class="chapter" data-level="8.4.2" data-path="8-ci.html"><a href="8-ci.html#whats-to-come-5"><i class="fa fa-check"></i><b>8.4.2</b> What’s to come?</a></li>
+</ul></li>
+</ul></li>
+<li class="chapter" data-level="9" data-path="9-regress.html"><a href="9-regress.html"><i class="fa fa-check"></i><b>9</b> Regression via broom</a><ul>
+<li class="chapter" data-level="" data-path="9-regress.html"><a href="9-regress.html#needed-packages-6"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="9.1" data-path="9-regress.html"><a href="9-regress.html#example-alaskan-airlines-delays"><i class="fa fa-check"></i><b>9.1</b> EXAMPLE: Alaskan Airlines delays</a></li>
+<li class="chapter" data-level="9.2" data-path="9-regress.html"><a href="9-regress.html#correlation"><i class="fa fa-check"></i><b>9.2</b> Correlation</a><ul>
+<li class="chapter" data-level="9.2.1" data-path="9-regress.html"><a href="9-regress.html#correlation-does-not-imply-causation"><i class="fa fa-check"></i><b>9.2.1</b> Correlation does not imply causation</a></li>
+</ul></li>
+<li class="chapter" data-level="9.3" data-path="9-regress.html"><a href="9-regress.html#linear-regression"><i class="fa fa-check"></i><b>9.3</b> Linear regression</a><ul>
+<li class="chapter" data-level="9.3.1" data-path="9-regress.html"><a href="9-regress.html#understanding-linear-regression-basics"><i class="fa fa-check"></i><b>9.3.1</b> Understanding linear regression basics</a></li>
+<li class="chapter" data-level="9.3.2" data-path="9-regress.html"><a href="9-regress.html#the-equation-of-the-line"><i class="fa fa-check"></i><b>9.3.2</b> The equation of the line</a></li>
+<li class="chapter" data-level="9.3.3" data-path="9-regress.html"><a href="9-regress.html#interpreting-the-slope"><i class="fa fa-check"></i><b>9.3.3</b> Interpreting the slope</a></li>
+<li class="chapter" data-level="9.3.4" data-path="9-regress.html"><a href="9-regress.html#predicting-values"><i class="fa fa-check"></i><b>9.3.4</b> Predicting values</a></li>
+</ul></li>
+<li class="chapter" data-level="9.4" data-path="9-regress.html"><a href="9-regress.html#inference-for-regression"><i class="fa fa-check"></i><b>9.4</b> Inference for regression</a><ul>
+<li class="chapter" data-level="9.4.1" data-path="9-regress.html"><a href="9-regress.html#data-2"><i class="fa fa-check"></i><b>9.4.1</b> Data</a></li>
+<li class="chapter" data-level="9.4.2" data-path="9-regress.html"><a href="9-regress.html#test-statistic-delta-2"><i class="fa fa-check"></i><b>9.4.2</b> Test Statistic <span class="math inline">\(\delta\)</span></a></li>
+<li class="chapter" data-level="9.4.3" data-path="9-regress.html"><a href="9-regress.html#observed-effect-delta-2"><i class="fa fa-check"></i><b>9.4.3</b> Observed effect <span class="math inline">\(\delta^*\)</span></a></li>
+<li class="chapter" data-level="9.4.4" data-path="9-regress.html"><a href="9-regress.html#model-of-h_0-2"><i class="fa fa-check"></i><b>9.4.4</b> Model of <span class="math inline">\(H_0\)</span></a></li>
+<li class="chapter" data-level="9.4.5" data-path="9-regress.html"><a href="9-regress.html#simulated-data-2"><i class="fa fa-check"></i><b>9.4.5</b> Simulated Data</a></li>
+<li class="chapter" data-level="9.4.6" data-path="9-regress.html"><a href="9-regress.html#distribution-of-delta-under-h_0-2"><i class="fa fa-check"></i><b>9.4.6</b> Distribution of <span class="math inline">\(\delta\)</span> under <span class="math inline">\(H_0\)</span></a></li>
+<li class="chapter" data-level="9.4.7" data-path="9-regress.html"><a href="9-regress.html#the-p-value-2"><i class="fa fa-check"></i><b>9.4.7</b> The p-value</a></li>
+</ul></li>
+<li class="chapter" data-level="9.5" data-path="9-regress.html"><a href="9-regress.html#resid"><i class="fa fa-check"></i><b>9.5</b> Residual analysis</a></li>
+<li class="chapter" data-level="9.6" data-path="9-regress.html"><a href="9-regress.html#conditions-for-regression"><i class="fa fa-check"></i><b>9.6</b> Conditions for regression</a></li>
+<li class="chapter" data-level="9.7" data-path="9-regress.html"><a href="9-regress.html#conclusion-5"><i class="fa fa-check"></i><b>9.7</b> Conclusion</a><ul>
+<li class="chapter" data-level="9.7.1" data-path="9-regress.html"><a href="9-regress.html#script-of-r-code-5"><i class="fa fa-check"></i><b>9.7.1</b> Script of R code</a></li>
+<li class="chapter" data-level="9.7.2" data-path="9-regress.html"><a href="9-regress.html#whats-to-come-6"><i class="fa fa-check"></i><b>9.7.2</b> What’s to come?</a></li>
+</ul></li>
 </ul></li>
 <li class="part"><span><b>III Conclusion</b></span></li>
 <li class="chapter" data-level="10" data-path="10-effective-data-storytelling.html"><a href="10-effective-data-storytelling.html"><i class="fa fa-check"></i><b>10</b> Effective Data Storytelling</a><ul>
@@ -304,7 +323,8 @@
 </ul></li>
 </ul></li>
 <li class="chapter" data-level="B" data-path="B-appendixB.html"><a href="B-appendixB.html"><i class="fa fa-check"></i><b>B</b> Inference Examples</a><ul>
-<li class="chapter" data-level="B.1" data-path="B-appendixB.html"><a href="B-appendixB.html#needed-packages-6"><i class="fa fa-check"></i><b>B.1</b> Needed packages</a></li>
+<li class="chapter" data-level="" data-path="B-appendixB.html"><a href="B-appendixB.html#needed-packages-7"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="B.1" data-path="B-appendixB.html"><a href="B-appendixB.html#inference-mind-map"><i class="fa fa-check"></i><b>B.1</b> Inference Mind Map</a></li>
 <li class="chapter" data-level="B.2" data-path="B-appendixB.html"><a href="B-appendixB.html#one-mean"><i class="fa fa-check"></i><b>B.2</b> One Mean</a><ul>
 <li class="chapter" data-level="B.2.1" data-path="B-appendixB.html"><a href="B-appendixB.html#problem-statement"><i class="fa fa-check"></i><b>B.2.1</b> Problem Statement</a></li>
 <li class="chapter" data-level="B.2.2" data-path="B-appendixB.html"><a href="B-appendixB.html#competing-hypotheses"><i class="fa fa-check"></i><b>B.2.2</b> Competing Hypotheses</a></li>
@@ -352,6 +372,7 @@
 </ul></li>
 </ul></li>
 <li class="chapter" data-level="C" data-path="C-appendixC.html"><a href="C-appendixC.html"><i class="fa fa-check"></i><b>C</b> Reach for the Starts</a><ul>
+<li class="chapter" data-level="" data-path="C-appendixC.html"><a href="C-appendixC.html#needed-packages-8"><i class="fa fa-check"></i>Needed packages</a></li>
 <li class="chapter" data-level="C.1" data-path="C-appendixC.html"><a href="C-appendixC.html#sorted-barplots"><i class="fa fa-check"></i><b>C.1</b> Sorted barplots</a></li>
 <li class="chapter" data-level="C.2" data-path="C-appendixC.html"><a href="C-appendixC.html#interactive-graphics"><i class="fa fa-check"></i><b>C.2</b> Interactive graphics</a><ul>
 <li class="chapter" data-level="C.2.1" data-path="C-appendixC.html"><a href="C-appendixC.html#interactive-line-graphs"><i class="fa fa-check"></i><b>C.2.1</b> Interactive line-graphs</a></li>
@@ -375,13 +396,13 @@ <h1>
           <div class="page-inner">
 
             <section class="normal" id="section-">
-<div id="simulating-randomness-via-mosaic" class="section level1">
-<h1><span class="header-section-number">6</span> Simulating Randomness via <code id="sim">mosaic</code></h1>
-<p>In this chapter we will introduce new concepts that will serve as the basis for the remainder of the text: <strong>sampling</strong> and <strong>resampling</strong>. We will see that the tools that you learned in the Data Exploration part of this book (tidy data, data manipulation, and data visualization) will also play an important role here. As mentioned before, the concepts all build into a culmination allowing you to create better stories with data.</p>
+<div id="sim" class="section level1">
+<h1><span class="header-section-number">6</span> Simulating Randomness via mosaic</h1>
+<p>In this chapter we will introduce new concepts that will serve as the basis for the remainder of the text: <strong>sampling</strong> and <strong>resampling</strong>. We will see that the tools that you learned in the Data Exploration part of this book (tidy data, data visualization, and data manipulation) will also play an important role here. As mentioned before, the concepts throughout this text build on one another, culminating in your ability to create better stories with data.</p>
 <p>We begin with some helpful definitions that will help us better understand why statistical inference exists and why it is needed. We will then progress with introducing the second of our main data sets (in addition to the <code>nycflights13</code> data you’ve been working with) about OKCupid dating profiles to see how one can think of the distribution of a sample being an approximation of the distribution of the population. We will also focus on representative, random samples versus convenience samples in this context.</p>
 <p>We then shift to a famous example from statistics lore on a lady tasting tea. This section will focus on introducing concepts without a lot of statistical jargon. The chapter will conclude with a summary of the different functions introduced in the <code>mosaic</code> package in this chapter.</p>
-<div id="needed-packages-2" class="section level2 unnumbered">
-<h2>Needed packages</h2>
+<div id="needed-packages-3" class="section level3 unnumbered">
+<h3>Needed packages</h3>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">library</span>(dplyr)
 <span class="kw">library</span>(ggplot2)
 <span class="kw">library</span>(okcupiddata)
@@ -408,15 +429,6 @@ <h3><span class="header-section-number">6.1.1</span> Tasting soup</h3>
 <li>“Does this actually taste good?”</li>
 </ul>
 <p>How can we answer these questions? Does it matter where we take a bite of soup from? Is there anything we should do to the soup before we taste? Is one taste enough?</p>
-<hr />
-<div class="learncheck">
-<p>
-<strong><em>Learning check</em></strong>
-</p>
-</div>
-<p><strong>(LC6.1)</strong> Explain in your own words how tasting soup relates to the concepts of sampling covered here.</p>
-<p><strong>(LC6.2)</strong> Describe a different scenario (not food or drink related) that is analogous to sampling concepts covered here.</p>
-<hr />
 </div>
 <div id="common-terms" class="section level3">
 <h3><span class="header-section-number">6.1.2</span> Common terms</h3>
@@ -439,6 +451,15 @@ <h3><span class="header-section-number">6.1.2</span> Common terms</h3>
 <p><strong>Definition: statistic</strong></p>
 <p>A <em>statistic</em> is a calculated based on one or more variables measured in the sample. Parameters are usually denoted by lower case Arabic letters with other symbols added sometimes. These include <span class="math inline">\(\bar{x}\)</span>, <span class="math inline">\(\hat{p}\)</span>, <span class="math inline">\(s\)</span>, <span class="math inline">\(p\)</span>, and <span class="math inline">\(b\)</span>.</p>
 <hr />
+<hr />
+<div class="learncheck">
+<p>
+<strong><em>Learning check</em></strong>
+</p>
+</div>
+<p><strong>(LC6.1)</strong> Explain in your own words how tasting soup relates to the concepts of sampling covered here.</p>
+<p><strong>(LC6.2)</strong> Describe a different scenario (not food or drink related) that is analogous to sampling concepts covered here.</p>
+<hr />
 <p>Let’s explore these terms for our tasting soup example:</p>
 <p><em>Population</em> - the entire container of soup that we have cooked.</p>
 <p><em>Sample</em> - any smaller portion of soup collected that isn’t the whole container of soup. We could say that each spoonful of soup represents one sample.</p>
@@ -469,7 +490,7 @@ <h3><span class="header-section-number">6.1.2</span> Common terms</h3>
 </div>
 <div id="visualizing-sampling" class="section level2">
 <h2><span class="header-section-number">6.2</span> Visualizing sampling</h2>
-<p>Let’s explore how sampling and these other terms relate to working with data and data visualization. Here we introduce the <code>okcupiddata</code> R package. Note that permission to use this data to create the R package was explicitly granted by OkCupid. More information about this package is available <a href="https://github.com/rudeboybert/okcupiddata">here</a>. The <code>profiles</code> data frame in this R data package contains data about 59,946 OkCupid users who were living within 25 miles of San Francisco, had active profiles on June 26, 2012, were online in the previous year, and had at least one picture in their profile. We will be focusing on the <code>height</code> variable, which corresponds to self-reported heights of the individual on their profile. Note that this is measured in inches.</p>
+<p>Let’s explore how sampling and these other terms relate to working with data and data visualization. Here we introduce the <code>okcupiddata</code> R package <span class="citation">(Kim and Escobedo-Land <a href="#ref-R-okcupiddata">2016</a>)</span>. Note that permission to use this data to create the R package was explicitly granted by OkCupid. More information about this package is available <a href="https://github.com/rudeboybert/okcupiddata">here</a>. The <code>profiles</code> data frame in this R data package contains data about 59,946 OkCupid users who were living within 25 miles of San Francisco, had active profiles on June 26, 2012, were online in the previous year, and had at least one picture in their profile. We will be focusing on the <code>height</code> variable, which corresponds to a self-reported height for each individual on their profile. Note that this is measured in inches.</p>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">library</span>(okcupiddata)
 <span class="kw">data</span>(profiles)</code></pre></div>
 <p>Let’s take a look at the distribution of <code>height</code> using a histogram and <code>ggplot2</code>:</p>
@@ -488,19 +509,17 @@ <h2><span class="header-section-number">6.2</span> Visualizing sampling</h2>
 <hr />
 <p>To clean up the data a bit, let’s focus on just looking at heights between 55 inches and 85 inches. Remember that the <code>filter</code> function in <code>dplyr</code> allows us to focus on a subset of rows. The specific subset of rows we are interested in corresponds to the argument to the <code>filter</code> function. We will create a new data frame called <code>profiles_subset</code> that contains all rows with heights between 55 and 85 inclusive.</p>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">library</span>(dplyr)
-profiles_subset &lt;-<span class="st"> </span>profiles %&gt;%<span class="st"> </span>
-<span class="st">  </span><span class="kw">filter</span>(<span class="kw">between</span>(height, <span class="dv">55</span>, <span class="dv">85</span>))</code></pre></div>
+profiles_subset &lt;-<span class="st"> </span>profiles %&gt;%<span class="st"> </span><span class="kw">filter</span>(<span class="kw">between</span>(height, <span class="dv">55</span>, <span class="dv">85</span>))</code></pre></div>
 <p>Next, let’s produce the same histogram as above but using the <code>profiles_subset</code> data frame instead.</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">library</span>(ggplot2)
-<span class="kw">ggplot</span>(<span class="dt">data =</span> profiles_subset, <span class="dt">mapping =</span> <span class="kw">aes</span>(<span class="dt">x =</span> height)) +
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">ggplot</span>(<span class="dt">data =</span> profiles_subset, <span class="dt">mapping =</span> <span class="kw">aes</span>(<span class="dt">x =</span> height)) +
 <span class="st">  </span><span class="kw">geom_histogram</span>(<span class="dt">bins =</span> <span class="dv">20</span>, <span class="dt">color =</span> <span class="st">&quot;white&quot;</span>)</code></pre></div>
 <p><img src="ismaykim_files/figure-html/height-hist2-1.png" width="\textwidth" style="display: block; margin: auto;" /></p>
 <p>We can think of this data as representing the <em>population</em> of interest. Let’s now take a random sample of size 100 from this population and look to see if this sample represents the overall shape of the population. In other words, we are going to use data visualization as our guide to understand the <em>representativeness</em> of the sample selected.</p>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">library</span>(mosaic)
 <span class="kw">set.seed</span>(<span class="dv">2017</span>)
 profiles_sample1 &lt;-<span class="st"> </span>profiles_subset %&gt;%<span class="st"> </span><span class="kw">resample</span>(<span class="dt">size =</span> <span class="dv">100</span>, <span class="dt">replace =</span> <span class="ot">FALSE</span>)</code></pre></div>
-<p>The <code>set.seed</code> function is used to ensure that all users get the same random sample when they run the code above. It is a way of interfacing with the pseudo-random number generation scheme that R uses to generate “random” numbers. If that command was not run, you’d obtain a different random sample if you ran the code above for the first time.</p>
-<p>We have introduced the <code>resample</code> function from the <code>mosaic</code> package here. This function can be used for both sampling with and without replacement. Here we have chosen to sample without replacement. In other words, after the first row is chosen from the <code>profiles_subset</code> data frame at random it is kept out of the further 99 samples. Let’s now visualize the 100 values of the <code>height</code> variable in the <code>profiles_sample1</code> data frame. To keep this visualization on the same horizontal scale as our original population presented in <code>profiles_subset</code> we can use the <code>coord_cartesian</code> function along with the <code>c</code> function to specify the limits on the horizontal axis.</p>
+<p>The <code>set.seed</code> function is used to ensure that all users get the same random sample when they run the code above. It is a way of interfacing with the pseudo-random number generation scheme that R uses to generate “random” numbers. If that command were not run, you would likely obtain a different random sample than someone else who ran the code above.</p>
+<p>We have introduced the <code>resample</code> function from the <code>mosaic</code> package here <span class="citation">(Pruim, Kaplan, and Horton <a href="#ref-R-mosaic">2016</a>)</span>. This function can be used for both sampling with and without replacement. Here we have chosen to sample without replacement. In other words, after the first row is chosen at random from the <code>profiles_subset</code> data frame, it is excluded from the remaining 99 draws. Let’s now visualize the 100 values of the <code>height</code> variable in the <code>profiles_sample1</code> data frame. To keep this visualization on the same horizontal scale as our original population presented in <code>profiles_subset</code>, we can use the <code>coord_cartesian</code> function along with the <code>c</code> function to specify the limits on the horizontal axis.</p>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">ggplot</span>(<span class="dt">data =</span> profiles_sample1, <span class="dt">mapping =</span> <span class="kw">aes</span>(<span class="dt">x =</span> height)) +
 <span class="st">  </span><span class="kw">geom_histogram</span>(<span class="dt">bins =</span> <span class="dv">20</span>, <span class="dt">color =</span> <span class="st">&quot;white&quot;</span>, <span class="dt">fill =</span> <span class="st">&quot;red&quot;</span>) +
 <span class="st">  </span><span class="kw">coord_cartesian</span>(<span class="dt">xlim =</span> <span class="kw">c</span>(<span class="dv">55</span>, <span class="dv">85</span>))</code></pre></div>
@@ -519,15 +538,14 @@ <h2><span class="header-section-number">6.2</span> Visualizing sampling</h2>
 <span class="st">  </span><span class="kw">geom_histogram</span>(<span class="dt">bins =</span> <span class="dv">20</span>, <span class="dt">color =</span> <span class="st">&quot;black&quot;</span>, <span class="dt">fill =</span> <span class="st">&quot;yellow&quot;</span>) +
 <span class="st">  </span><span class="kw">coord_cartesian</span>(<span class="dt">xlim =</span> <span class="kw">c</span>(<span class="dv">55</span>, <span class="dv">85</span>))</code></pre></div>
 <p><img src="ismaykim_files/figure-html/sample-profiles2-1.png" width="\textwidth" style="display: block; margin: auto;" /></p>
-<p>Remember that a sample can never truly quantify all of the properties of a population since it contains less data and, thus, less information. We can use the overall shape as a good guess as to the representativeness of the sample in regards to the population. We see that the above two random samples of size 100 have roughly the same shape as the original population <code>height</code> data. Let’s next explore what is known as a convenience sample and how its distribution compares to the population distribution.</p>
+<p>Remember that a sample can never truly quantify all of the properties of a population since it contains less data and, thus, less information. We can use the overall shape as a good guess as to the representativeness of the sample with regard to the population, though. We see that the above two random samples of size 100 have roughly the same shape as the original population <code>height</code> data. Let’s next explore what is known as a convenience sample and how its distribution compares to the population distribution.</p>
 <p>A <strong>convenience sample</strong> is a sample that is chosen conveniently by the person selecting the sample. While certainly less work, convenience samples are generally not representative of the population since they will exclude some (usually large) portion of the population. Let’s look at values of <code>height</code> in our <code>profiles_subset</code> population that are larger than 6 feet tall (72 inches) and have that be the sample we choose.</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">profiles_sample3 &lt;-<span class="st"> </span>profiles_subset %&gt;%<span class="st"> </span>
-<span class="st">  </span><span class="kw">filter</span>(height &gt;=<span class="st"> </span><span class="dv">72</span>)
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">profiles_sample3 &lt;-<span class="st"> </span>profiles_subset %&gt;%<span class="st"> </span><span class="kw">filter</span>(height &gt;=<span class="st"> </span><span class="dv">72</span>)
 <span class="kw">ggplot</span>(<span class="dt">data =</span> profiles_sample3, <span class="dt">mapping =</span> <span class="kw">aes</span>(<span class="dt">x =</span> height)) +
 <span class="st">  </span><span class="kw">geom_histogram</span>(<span class="dt">bins =</span> <span class="dv">20</span>, <span class="dt">color =</span> <span class="st">&quot;white&quot;</span>, <span class="dt">fill =</span> <span class="st">&quot;blue&quot;</span>) +
 <span class="st">  </span><span class="kw">coord_cartesian</span>(<span class="dt">xlim =</span> <span class="kw">c</span>(<span class="dv">55</span>, <span class="dv">85</span>))</code></pre></div>
 <p><img src="ismaykim_files/figure-html/sample-profiles3-1.png" width="\textwidth" style="display: block; margin: auto;" /></p>
-<p>This is a clear example of a sample that is not representative of the population. The population <code>height</code> is roughly symmetric, whereas this distribution is right-skewed. Further, since it only selects large heights it has completely excluded the small and middle heights. We have seen here that data visualization provides an excellent tool in judging the representativeness of a sample.</p>
+<p>This is a clear example of a sample that is not representative of the population. The population <code>height</code> variable is roughly symmetric, whereas this distribution is right-skewed. Further, since it only selects large heights it has completely excluded the small and middle heights. We have seen here that data visualization provides an excellent tool in judging the representativeness of a sample.</p>
 <div id="sampling-distribution" class="section level3">
 <h3><span class="header-section-number">6.2.1</span> Sampling distribution</h3>
 <p>The representativeness of a sample plays an even larger role than just looking at the shapes of distributions. Let’s suppose we were interested in estimating the mean <code>height</code> of all profiles in the <code>profiles_subset</code> data frame. To do so, we could look at the mean of the <code>height</code> variable in the <code>profiles_sample1</code> data frame:</p>
@@ -541,7 +559,7 @@ <h3><span class="header-section-number">6.2.1</span> Sampling distribution</h3>
 <p>Or maybe even <code>profiles_sample3</code>:</p>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">profiles_sample3 %&gt;%<span class="st"> </span><span class="kw">summarize</span>(<span class="kw">mean</span>(height))</code></pre></div>
 <pre><code>##   mean(height)
-## 1     73.37917</code></pre>
+## 1        73.38</code></pre>
 <p>We see a clear difference here in looking at the mean of <code>height</code> in <code>profiles_sample3</code> versus <code>profiles_sample1</code> and <code>profiles_sample2</code>. This comes from the bias that is used in choosing only the top heights for <code>profiles_sample3</code>. If we had chosen to use this sample as our only sample, we would be quite a ways off from what the actual mean <code>height</code> in our population of <code>profiles_subset</code> is.</p>
 <p>We also see that even random samples produce means that aren’t exactly the same. This sampling variability can be shown via what is called a <em>sampling distribution</em>. This is defined as the behavior of a statistic under repeated sampling. To build this sampling distribution for this example, we’ve created an interactive app using the <code>shiny</code> R package below that is available at <a href="http://ismay.shinyapps.io/okcupidheights/" class="uri">http://ismay.shinyapps.io/okcupidheights/</a>. You can specify the sample size you’d like to work with (100 is chosen by default) and then generate a random sample. You then can see the mean of this generated sample plotted in the bottom visualization. Repeating this process many times, you can start to see the shape of the sampling distribution take form.</p>
 <div class="figure" style="text-align: center"><span id="fig:shiny"></span>
@@ -561,7 +579,7 @@ <h3><span class="header-section-number">6.2.2</span> Repeated sampling via <code
 <span class="kw">ggplot</span>(<span class="dt">data =</span> sample_means, <span class="dt">mapping =</span> <span class="kw">aes</span>(<span class="dt">x =</span> mean_height)) +
 <span class="st">  </span><span class="kw">geom_histogram</span>(<span class="dt">color =</span> <span class="st">&quot;white&quot;</span>, <span class="dt">bins =</span> <span class="dv">20</span>)</code></pre></div>
 <p><img src="ismaykim_files/figure-html/do-first-1.png" width="\textwidth" style="display: block; margin: auto;" /></p>
-<p>Note how the range of sample mean height values is much more narrow than the original range of <code>height</code> in the <code>profiles_subset</code> data frame. We also see a characteristic shape to this distribution of <code>sample_mean</code>: the normal curve. This idea is commonly associated with statistics and you hopefully have a good sense of how this distribution comes about. As before, if you aren’t quite sure of this yet, go back and explore the shiny app above a bit more. We see that many values for the sample mean appear near the center of the distribution and a few values out in the tails providing the bell-shaped distribution linked with the normal distribution. You’ll see more examples of this in the chapters to come and in the appendices.</p>
+<p>Note how the range of sample mean height values is much narrower than the original range of <code>height</code> in the <code>profiles_subset</code> data frame. We also see a characteristic shape to this distribution of <code>mean_height</code>: the normal curve. This idea is commonly associated with statistics and you hopefully have a good sense of how this distribution comes about. As before, if you aren’t quite sure of this yet, go back and explore the shiny app above a bit more. We see that many values for the sample mean appear near the center of the distribution and a few values appear out in the tails, providing the bell-shaped distribution linked with the normal distribution. You’ll see more examples of this in the chapters to come and in Appendix <a href="B-appendixB.html#appendixB">B</a>.</p>
 <hr />
 <div class="learncheck">
 <p>
@@ -594,7 +612,7 @@ <h2><span class="header-section-number">6.3</span> Simulation</h2>
 <blockquote>
 <p>In his book, Fisher discusses the various possible outcomes of such an experiment. He describes how to decide how many cups should be presented and in what order and how much to tell the lady about the order of presentations. He works out the probabilities of different outcomes, depending upon whether the lady is or is not correct. Nowhere in this discussion does he indicate that such an experiment was ever run. Nor does he describe the outcome of an actual experiment.</p>
 </blockquote>
-<p>It’s amazing that there is no actual evidence that such an event actually took place. This problem is a great introduction into inference though and we can proceed by testing to see how likely it is for a person to guess correctly, say, 9 out of 10 times assuming that that person is just guessing. In other words, is the person just lucky or do we have reason to suspect that they can actually detect whether milk was put in first or not?</p>
+<p>It’s amazing that there is no actual evidence that such an event actually took place. This problem is a great introduction into inference though and we can proceed by testing to see how likely it is for a person to guess correctly, say, 9 out of 10 times, assuming that person is just guessing. In other words, is the person just lucky or do we have reason to suspect that they can actually detect whether milk was put in first or not?</p>
 <p>We need to think about this problem from the standpoint of hypothesis testing. First, we’ll need to identify some important parts of a hypothesis test before we proceed with the analysis.</p>
 <hr />
 <div class="learncheck">
@@ -632,7 +650,7 @@ <h2><span class="header-section-number">6.3</span> Simulation</h2>
 ## 12 10     5     5  0.5
 ## 13 10     4     6  0.4</code></pre>
 <p>We’ve now done a simulation of what actually happened when you flipped a coin ten times. We have 13 different simulations of flipping a coin 10 times. Note here that <code>heads</code> now corresponds to the number of correct guesses and <code>tails</code> corresponds to the number of incorrect guesses. (This can be tricky to understand at first since we’ve done a switch on what the meaning of “heads” and ``tails&quot; are.)</p>
-<p>If you look at the output above for our simulation of 13 student guesses, we can begin to get a sense for what an “expected” sample proportion of successes may be. Around five out of 10 seems to be the most likely value. What does this say about our assumed <span class="math inline">\(\hat{p}\)</span> of 9/10? To better answer this question, we can simulate 10,000 student guesses and then look at the distribution of the simulated sample proportion of successes, also known as the <strong>null distribution</strong>.</p>
+<p>If you look at the output above for our simulation of 13 student guesses, we can begin to get a sense for what an “expected” sample proportion of successes may be. Around five out of 10 seems to be the most likely value. What does this say about what we actually observed with a success rate of 9/10? To better answer this question, we can simulate 10,000 student guesses and then look at the distribution of the simulated sample proportion of successes, also known as the <strong>null distribution</strong>.</p>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">library</span>(dplyr)
 simGuesses &lt;-<span class="st"> </span><span class="kw">do</span>(<span class="dv">10000</span>) *<span class="st"> </span><span class="kw">rflip</span>(<span class="dv">10</span>)
 simGuesses %&gt;%<span class="st"> </span>
@@ -652,17 +670,14 @@ <h2><span class="header-section-number">6.3</span> Simulation</h2>
 ## 9      8   408
 ## 10     9    91
 ## 11    10    14</code></pre>
-<!--
-**Note:** Here `as_tibble` converts data frames to tibbles. This is also why the `library(tibble)` command is needed. The conversion to `tibble` format is mostly done for allowing for nice printing of large data sets when we mention the name of a data frame object in a chunk by itself.  (The data sets in `nycflights13` come as tibbles by default.)  You can read more about tibbles in Chapter 10 of Hadley and Garrett's book [@rds2016].
--->
-<p>We can see here that we have created a count of how many of each of the 10,000 sets of 10 flips resulted in 0, 1, 2, …, up to 10 heads. Note the use of the <code>group_by</code> and <code>summarize</code> functions from Chapter <a href="5-data-manipulation-via-dplyr.html#manip"><strong>??</strong></a> here.</p>
-<p>In addition, we can plot the distribution of these simulated <code>heads</code> using the ideas from Chapter <a href="4-data-visualization-via-ggplot2.html#viz"><strong>??</strong></a>. <code>heads</code> is a quantitative variable. Think about which type of plot is most appropriate here before reading further.</p>
+<p>We can see here that we have created a count of how many of each of the 10,000 sets of 10 flips resulted in 0, 1, 2, <span class="math inline">\(\ldots\)</span>, up to 10 heads. Note the use of the <code>group_by</code> and <code>summarize</code> functions from Chapter <a href="5-manip.html#manip">5</a> here.</p>
+<p>In addition, we can plot the distribution of these simulated <code>heads</code> using the ideas from Chapter <a href="4-viz.html#viz">4</a>. <code>heads</code> is a quantitative variable. Think about which type of plot is most appropriate here before reading further.</p>
 <p>We already have an idea as to an appropriate plot by the data summarization that we did in the chunk above. We’d like to see how many heads occurred in the 10,000 sets of 10 flips. In other words, we’d like to see how frequently 9 or more heads occurred in the 10 flips:</p>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">library</span>(ggplot2)
 simGuesses %&gt;%<span class="st"> </span><span class="kw">ggplot</span>(<span class="kw">aes</span>(<span class="dt">x =</span> heads)) +
 <span class="st">  </span><span class="kw">geom_histogram</span>(<span class="dt">binwidth =</span> <span class="dv">1</span>, <span class="dt">color =</span> <span class="st">&quot;white&quot;</span>)</code></pre></div>
-<div class="figure" style="text-align: center"><span id="fig:unnamed-chunk-66"></span>
-<img src="ismaykim_files/figure-html/unnamed-chunk-66-1.png" alt="Histogram of number of heads in simulation - needs tweaking" width="\textwidth" />
+<div class="figure" style="text-align: center"><span id="fig:unnamed-chunk-71"></span>
+<img src="ismaykim_files/figure-html/unnamed-chunk-71-1.png" alt="Histogram of number of heads in simulation - needs tweaking" width="\textwidth" />
 <p class="caption">
 Figure 6.3: Histogram of number of heads in simulation - needs tweaking
 </p>
@@ -671,14 +686,14 @@ <h2><span class="header-section-number">6.3</span> Simulation</h2>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">library</span>(ggplot2)
 simGuesses %&gt;%<span class="st"> </span><span class="kw">ggplot</span>(<span class="kw">aes</span>(<span class="dt">x =</span> <span class="kw">factor</span>(heads))) +
 <span class="st">  </span><span class="kw">geom_bar</span>()</code></pre></div>
-<div class="figure" style="text-align: center"><span id="fig:unnamed-chunk-67"></span>
-<img src="ismaykim_files/figure-html/unnamed-chunk-67-1.png" alt="Barplot of number of heads in simulation" width="\textwidth" />
+<div class="figure" style="text-align: center"><span id="fig:unnamed-chunk-72"></span>
+<img src="ismaykim_files/figure-html/unnamed-chunk-72-1.png" alt="Barplot of number of heads in simulation" width="\textwidth" />
 <p class="caption">
 Figure 6.4: Barplot of number of heads in simulation
 </p>
 </div>
 <p>You’ll frequently need to make this conversion to <code>factor</code> when making a barplot with quantitative variables. Remember from “Getting Used to R, RStudio, and R Markdown” <span class="citation">(Ismay <a href="#ref-usedtor2016">2016</a>)</span>, that a <code>factor</code> variable is useful when there is a natural ordering to the variable and it only takes on discrete values and not fractional values like 2.5. Our <code>heads</code> variable has a natural ordering: 0, 1, 2, <span class="math inline">\(\ldots\)</span>, 10.</p>
-<p>Again, note that the shape of these number of heads follows what appears to be a normal distribution. We’ll see that if appropriate conditions/assumptions are met with the data that we can expect to see a normal distribution result. When these conditions aren’t met, the simulation methodology we’ve presented here still works well whereas the traditional normal-based methods start to fall apart.</p>
+<p>Again, note that the distribution of the number of heads follows what appears to be a normal distribution. We’ll see in a related example that, if appropriate conditions/assumptions are met with the data, we can expect to see a normal distribution result. When these conditions aren’t met, the simulation methodology we’ve presented here still works well whereas the traditional normal-based methods start to fall apart.</p>
 <p>We will delve further into hypothesis testing in the next few chapters. This null distribution in combination with the <strong>sampling distribution</strong> concept covered earlier will be of utmost importance going forward.</p>
 </div>
 <div id="review-of-mosaic-simulation-functions" class="section level2">
@@ -700,18 +715,27 @@ <h2><span class="header-section-number">6.4</span> Review of <code>mosaic</code>
 <p><strong>(LC6.17)</strong> Recreate <code>shuffle</code> using only the <code>resample</code> function and specifying the appropriate arguments.</p>
 <hr />
 </div>
-<div id="script-of-r-code-2" class="section level2">
-<h2><span class="header-section-number">6.5</span> Script of R code</h2>
+<div id="conclusion-2" class="section level2">
+<h2><span class="header-section-number">6.5</span> Conclusion</h2>
+<div id="script-of-r-code-2" class="section level3">
+<h3><span class="header-section-number">6.5.1</span> Script of R code</h3>
 <p>An R script file of all R code used in this chapter is available <a href="http://ismayc.github.io/moderndiver-book/06-sim.R">here</a>.</p>
 </div>
-<div id="whats-to-come-3" class="section level2">
-<h2><span class="header-section-number">6.6</span> What’s to come?</h2>
-<p>This chapter has served as an introduction into inferential techniques that will be discussed in greater detail in Chapter <a href="7-hypo.html#hypo">7</a> for hypothesis testing and in Chapter <a href="8-ci.html#ci">8</a> for confidence intervals. In these chapters, we will see how we can use a related concept of <strong>resampling</strong> when working with the distributions of two groups. All of these concepts will be further reinforced in Chapter <a href="9-regression-via-broom.html#regress"><strong>??</strong></a> as well.</p>
+<div id="whats-to-come-3" class="section level3">
+<h3><span class="header-section-number">6.5.2</span> What’s to come?</h3>
+<p>This chapter has served as an introduction into inferential techniques that will be discussed in greater detail in Chapter <a href="7-hypo.html#hypo">7</a> for hypothesis testing and in Chapter <a href="8-ci.html#ci">8</a> for confidence intervals. In these chapters, we will see how we can use a related concept of <strong>resampling</strong> when working with the distributions of two groups. All of these concepts will be further reinforced in Chapter <a href="9-regress.html#regress">9</a> as well.</p>
 
+</div>
 </div>
 </div>
 <h3>References</h3>
 <div id="refs" class="references">
+<div id="ref-R-okcupiddata">
+<p>Kim, Albert Y., and Adriana Escobedo-Land. 2016. <em>Okcupiddata: OkCupid Profile Data for Introductory Statistics and Data Science Courses</em>. <a href="https://CRAN.R-project.org/package=okcupiddata" class="uri">https://CRAN.R-project.org/package=okcupiddata</a>.</p>
+</div>
+<div id="ref-R-mosaic">
+<p>Pruim, Randall, Daniel T. Kaplan, and Nicholas J. Horton. 2016. <em>Mosaic: Project Mosaic Statistics and Mathematics Teaching Utilities</em>. <a href="https://CRAN.R-project.org/package=mosaic" class="uri">https://CRAN.R-project.org/package=mosaic</a>.</p>
+</div>
 <div id="ref-salsburg2001">
 <p>Salsburg, David. 2001. <em>The Lady Tasting Tea: How Statistics Revolutionized Science in the Twentieth Century</em>. First Edition. New York, NY: W.H. Freeman.</p>
 </div>
@@ -724,7 +748,7 @@ <h3>References</h3>
           </div>
         </div>
       </div>
-<a href="5-data-manipulation-via-dplyr.html" class="navigation navigation-prev " aria-label="Previous page"><i class="fa fa-angle-left"></i></a>
+<a href="5-manip.html" class="navigation navigation-prev " aria-label="Previous page"><i class="fa fa-angle-left"></i></a>
 <a href="7-hypo.html" class="navigation navigation-next " aria-label="Next page""><i class="fa fa-angle-right"></i></a>
 
 <script src="libs/gitbook-2.6.7/js/app.min.js"></script>
diff --git a/docs/7-hypo.html b/docs/7-hypo.html
index b928b8d2d..3d1fa502c 100644
--- a/docs/7-hypo.html
+++ b/docs/7-hypo.html
@@ -26,14 +26,14 @@
 <meta name="author" content="Chester Ismay and Albert Y. Kim">
 
 
-<meta name="date" content="2017-01-07">
+<meta name="date" content="2017-01-10">
 
   <meta name="viewport" content="width=device-width, initial-scale=1">
   <meta name="apple-mobile-web-app-capable" content="yes">
   <meta name="apple-mobile-web-app-status-bar-style" content="black">
   
   
-<link rel="prev" href="6-simulating-randomness-via-mosaic.html">
+<link rel="prev" href="6-sim.html">
 <link rel="next" href="8-ci.html">
 
 <script src="libs/jquery-2.2.3/jquery.min.js"></script>
@@ -119,9 +119,9 @@
 
 <ul class="summary">
 <li class="chapter" data-level="1" data-path="index.html"><a href="index.html"><i class="fa fa-check"></i><b>1</b> Preamble</a><ul>
-<li class="chapter" data-level="1.1" data-path="index.html"><a href="index.html#principles-of-this-book"><i class="fa fa-check"></i><b>1.1</b> Principles of this Book</a></li>
+<li class="chapter" data-level="1.1" data-path="index.html"><a href="index.html#principles-of-this-book---for-instructors"><i class="fa fa-check"></i><b>1.1</b> Principles of this Book - For Instructors</a></li>
 <li class="chapter" data-level="1.2" data-path="index.html"><a href="index.html#contribute"><i class="fa fa-check"></i><b>1.2</b> Contribute</a></li>
-<li class="chapter" data-level="1.3" data-path="index.html"><a href="index.html#getting-started"><i class="fa fa-check"></i><b>1.3</b> Getting Started</a></li>
+<li class="chapter" data-level="1.3" data-path="index.html"><a href="index.html#getting-started---for-students"><i class="fa fa-check"></i><b>1.3</b> Getting Started - For Students</a></li>
 <li class="chapter" data-level="" data-path="index.html"><a href="index.html#colophon"><i class="fa fa-check"></i>Colophon</a></li>
 </ul></li>
 <li class="chapter" data-level="2" data-path="2-intro.html"><a href="2-intro.html"><i class="fa fa-check"></i><b>2</b> Introduction</a><ul>
@@ -133,87 +133,100 @@
 </ul></li>
 <li class="part"><span><b>I Data Exploration</b></span></li>
 <li class="chapter" data-level="3" data-path="3-tidy.html"><a href="3-tidy.html"><i class="fa fa-check"></i><b>3</b> Tidy Data</a><ul>
+<li class="chapter" data-level="" data-path="3-tidy.html"><a href="3-tidy.html#needed-packages"><i class="fa fa-check"></i>Needed packages</a></li>
 <li class="chapter" data-level="3.1" data-path="3-tidy.html"><a href="3-tidy.html#what-is-tidy-data"><i class="fa fa-check"></i><b>3.1</b> What is tidy data?</a></li>
-<li class="chapter" data-level="3.2" data-path="3-tidy.html"><a href="3-tidy.html#the-nycflights13-datasets"><i class="fa fa-check"></i><b>3.2</b> The <code>nycflights13</code> datasets</a></li>
-<li class="chapter" data-level="3.3" data-path="3-tidy.html"><a href="3-tidy.html#how-is-flights-tidy"><i class="fa fa-check"></i><b>3.3</b> How is <code>flights</code> tidy?</a></li>
+<li class="chapter" data-level="3.2" data-path="3-tidy.html"><a href="3-tidy.html#datasets-in-the-nycflights13-package"><i class="fa fa-check"></i><b>3.2</b> Datasets in the <code>nycflights13</code> package</a></li>
+<li class="chapter" data-level="3.3" data-path="3-tidy.html"><a href="3-tidy.html#how-is-flights-tidy"><i class="fa fa-check"></i><b>3.3</b> How is <code>flights</code> tidy?</a><ul>
+<li class="chapter" data-level="3.3.1" data-path="3-tidy.html"><a href="3-tidy.html#identification-variables"><i class="fa fa-check"></i><b>3.3.1</b> Identification variables</a></li>
+</ul></li>
 <li class="chapter" data-level="3.4" data-path="3-tidy.html"><a href="3-tidy.html#normal-forms-of-data"><i class="fa fa-check"></i><b>3.4</b> Normal forms of data</a></li>
 <li class="chapter" data-level="3.5" data-path="3-tidy.html"><a href="3-tidy.html#whats-to-come"><i class="fa fa-check"></i><b>3.5</b> What’s to come?</a></li>
 </ul></li>
-<li class="chapter" data-level="4" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html"><i class="fa fa-check"></i><b>4</b> Data Visualization via <code id="viz">ggplot2</code></a><ul>
-<li class="chapter" data-level="" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#needed-packages"><i class="fa fa-check"></i>Needed packages</a></li>
-<li class="chapter" data-level="4.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#grammarofgraphics"><i class="fa fa-check"></i><b>4.1</b> The Grammar of Graphics</a><ul>
-<li class="chapter" data-level="4.1.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#components-of-grammar"><i class="fa fa-check"></i><b>4.1.1</b> Components of Grammar</a></li>
-<li class="chapter" data-level="4.1.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#napoleans-march-on-moscow"><i class="fa fa-check"></i><b>4.1.2</b> Napolean’s March on Moscow</a></li>
-<li class="chapter" data-level="4.1.3" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#other-components-of-the-grammar"><i class="fa fa-check"></i><b>4.1.3</b> Other Components of the Grammar</a></li>
-<li class="chapter" data-level="4.1.4" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#the-ggplot2-package"><i class="fa fa-check"></i><b>4.1.4</b> The <code>ggplot2</code> Package</a></li>
+<li class="chapter" data-level="4" data-path="4-viz.html"><a href="4-viz.html"><i class="fa fa-check"></i><b>4</b> Data Visualization via ggplot2</a><ul>
+<li class="chapter" data-level="" data-path="4-viz.html"><a href="4-viz.html#needed-packages-1"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="4.1" data-path="4-viz.html"><a href="4-viz.html#grammarofgraphics"><i class="fa fa-check"></i><b>4.1</b> The Grammar of Graphics</a><ul>
+<li class="chapter" data-level="4.1.1" data-path="4-viz.html"><a href="4-viz.html#components-of-grammar"><i class="fa fa-check"></i><b>4.1.1</b> Components of Grammar</a></li>
+<li class="chapter" data-level="4.1.2" data-path="4-viz.html"><a href="4-viz.html#napoleans-march-on-moscow"><i class="fa fa-check"></i><b>4.1.2</b> Napoleon’s March on Moscow</a></li>
+<li class="chapter" data-level="4.1.3" data-path="4-viz.html"><a href="4-viz.html#other-components-of-the-grammar"><i class="fa fa-check"></i><b>4.1.3</b> Other Components of the Grammar</a></li>
+<li class="chapter" data-level="4.1.4" data-path="4-viz.html"><a href="4-viz.html#the-ggplot2-package"><i class="fa fa-check"></i><b>4.1.4</b> The ggplot2 Package</a></li>
+</ul></li>
+<li class="chapter" data-level="4.2" data-path="4-viz.html"><a href="4-viz.html#five-named-graphs---the-5ng"><i class="fa fa-check"></i><b>4.2</b> Five Named Graphs - The 5NG</a></li>
+<li class="chapter" data-level="4.3" data-path="4-viz.html"><a href="4-viz.html#scatterplots"><i class="fa fa-check"></i><b>4.3</b> 5NG#1: Scatter-plots</a><ul>
+<li class="chapter" data-level="4.3.1" data-path="4-viz.html"><a href="4-viz.html#geompoint"><i class="fa fa-check"></i><b>4.3.1</b> Scatter-plots via geom_point</a></li>
+<li class="chapter" data-level="4.3.2" data-path="4-viz.html"><a href="4-viz.html#over-plotting"><i class="fa fa-check"></i><b>4.3.2</b> Over-Plotting</a></li>
+<li class="chapter" data-level="4.3.3" data-path="4-viz.html"><a href="4-viz.html#summary"><i class="fa fa-check"></i><b>4.3.3</b> Summary</a></li>
+</ul></li>
+<li class="chapter" data-level="4.4" data-path="4-viz.html"><a href="4-viz.html#linegraphs"><i class="fa fa-check"></i><b>4.4</b> 5NG#2: Line-graphs</a><ul>
+<li class="chapter" data-level="4.4.1" data-path="4-viz.html"><a href="4-viz.html#geomline"><i class="fa fa-check"></i><b>4.4.1</b> Line-graphs via geom_line</a></li>
+<li class="chapter" data-level="4.4.2" data-path="4-viz.html"><a href="4-viz.html#summary-1"><i class="fa fa-check"></i><b>4.4.2</b> Summary</a></li>
 </ul></li>
-<li class="chapter" data-level="4.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#five-named-graphs---the-5ng"><i class="fa fa-check"></i><b>4.2</b> Five Named Graphs - The 5NG</a></li>
-<li class="chapter" data-level="4.3" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#scatterplots"><i class="fa fa-check"></i><b>4.3</b> 5NG#1: Scatter-plots</a><ul>
-<li class="chapter" data-level="4.3.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#scatter-plots-via-geom_point"><i class="fa fa-check"></i><b>4.3.1</b> Scatter-plots via <code id="geompoint">geom_point</code></a></li>
-<li class="chapter" data-level="4.3.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#over-plotting"><i class="fa fa-check"></i><b>4.3.2</b> Over-Plotting</a></li>
-<li class="chapter" data-level="4.3.3" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#summary"><i class="fa fa-check"></i><b>4.3.3</b> Summary</a></li>
+<li class="chapter" data-level="4.5" data-path="4-viz.html"><a href="4-viz.html#histograms"><i class="fa fa-check"></i><b>4.5</b> 5NG#3: Histograms</a><ul>
+<li class="chapter" data-level="4.5.1" data-path="4-viz.html"><a href="4-viz.html#geomhistogram"><i class="fa fa-check"></i><b>4.5.1</b> Histograms via geom_histogram</a></li>
+<li class="chapter" data-level="4.5.2" data-path="4-viz.html"><a href="4-viz.html#adjustbins"><i class="fa fa-check"></i><b>4.5.2</b> Adjusting the Bins</a></li>
+<li class="chapter" data-level="4.5.3" data-path="4-viz.html"><a href="4-viz.html#summary-2"><i class="fa fa-check"></i><b>4.5.3</b> Summary</a></li>
 </ul></li>
-<li class="chapter" data-level="4.4" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#linegraphs"><i class="fa fa-check"></i><b>4.4</b> 5NG#2: Line-graphs</a><ul>
-<li class="chapter" data-level="4.4.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#line-graphs-via-geom_line"><i class="fa fa-check"></i><b>4.4.1</b> Line-graphs via <code id="geomline">geom_line</code></a></li>
-<li class="chapter" data-level="4.4.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#summary-1"><i class="fa fa-check"></i><b>4.4.2</b> Summary</a></li>
+<li class="chapter" data-level="4.6" data-path="4-viz.html"><a href="4-viz.html#facets"><i class="fa fa-check"></i><b>4.6</b> Facets</a></li>
+<li class="chapter" data-level="4.7" data-path="4-viz.html"><a href="4-viz.html#ng4-boxplots"><i class="fa fa-check"></i><b>4.7</b> 5NG#4: Boxplots</a><ul>
+<li class="chapter" data-level="4.7.1" data-path="4-viz.html"><a href="4-viz.html#geomboxplot"><i class="fa fa-check"></i><b>4.7.1</b> Boxplots via geom_boxplot</a></li>
+<li class="chapter" data-level="4.7.2" data-path="4-viz.html"><a href="4-viz.html#summary-3"><i class="fa fa-check"></i><b>4.7.2</b> Summary</a></li>
 </ul></li>
-<li class="chapter" data-level="4.5" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#histograms"><i class="fa fa-check"></i><b>4.5</b> 5NG#3: Histograms</a><ul>
-<li class="chapter" data-level="4.5.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#histograms-via-geom_histogram"><i class="fa fa-check"></i><b>4.5.1</b> Histograms via <code id="geomhistogram">geom_histogram</code></a></li>
-<li class="chapter" data-level="4.5.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#adjustbins"><i class="fa fa-check"></i><b>4.5.2</b> Adjusting the Bins</a></li>
-<li class="chapter" data-level="4.5.3" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#summary-2"><i class="fa fa-check"></i><b>4.5.3</b> Summary</a></li>
+<li class="chapter" data-level="4.8" data-path="4-viz.html"><a href="4-viz.html#geombar"><i class="fa fa-check"></i><b>4.8</b> 5NG#5: Barplots</a><ul>
+<li class="chapter" data-level="4.8.1" data-path="4-viz.html"><a href="4-viz.html#barplots-via-geom_bar"><i class="fa fa-check"></i><b>4.8.1</b> Barplots via geom_bar</a></li>
+<li class="chapter" data-level="4.8.2" data-path="4-viz.html"><a href="4-viz.html#must-avoid-pie-charts"><i class="fa fa-check"></i><b>4.8.2</b> Must avoid pie charts!</a></li>
+<li class="chapter" data-level="4.8.3" data-path="4-viz.html"><a href="4-viz.html#using-barplots-to-compare-two-variables"><i class="fa fa-check"></i><b>4.8.3</b> Using barplots to compare two variables</a></li>
+<li class="chapter" data-level="4.8.4" data-path="4-viz.html"><a href="4-viz.html#summary-4"><i class="fa fa-check"></i><b>4.8.4</b> Summary</a></li>
 </ul></li>
-<li class="chapter" data-level="4.6" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#facets"><i class="fa fa-check"></i><b>4.6</b> Facets</a></li>
-<li class="chapter" data-level="4.7" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#ng4-boxplots"><i class="fa fa-check"></i><b>4.7</b> 5NG#4: Boxplots</a><ul>
-<li class="chapter" data-level="4.7.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#boxplots-via-geom_boxplot"><i class="fa fa-check"></i><b>4.7.1</b> Boxplots via <code id="geomboxplot">geom_boxplot</code></a></li>
-<li class="chapter" data-level="4.7.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#summary-3"><i class="fa fa-check"></i><b>4.7.2</b> Summary</a></li>
+<li class="chapter" data-level="4.9" data-path="4-viz.html"><a href="4-viz.html#conclusion"><i class="fa fa-check"></i><b>4.9</b> Conclusion</a><ul>
+<li class="chapter" data-level="4.9.1" data-path="4-viz.html"><a href="4-viz.html#resources"><i class="fa fa-check"></i><b>4.9.1</b> Resources</a></li>
+<li class="chapter" data-level="4.9.2" data-path="4-viz.html"><a href="4-viz.html#script-of-r-code"><i class="fa fa-check"></i><b>4.9.2</b> Script of R code</a></li>
+<li class="chapter" data-level="4.9.3" data-path="4-viz.html"><a href="4-viz.html#whats-to-come-1"><i class="fa fa-check"></i><b>4.9.3</b> What’s to come?</a></li>
 </ul></li>
-<li class="chapter" data-level="4.8" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#geombar"><i class="fa fa-check"></i><b>4.8</b> 5NG#5: Barplots</a><ul>
-<li class="chapter" data-level="4.8.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#barplots-via-geom_bar"><i class="fa fa-check"></i><b>4.8.1</b> Barplots via <code>geom_bar</code></a></li>
-<li class="chapter" data-level="4.8.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#must-avoid-pie-charts"><i class="fa fa-check"></i><b>4.8.2</b> Must avoid pie charts!</a></li>
-<li class="chapter" data-level="4.8.3" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#using-barplots-to-compare-two-variables"><i class="fa fa-check"></i><b>4.8.3</b> Using barplots to compare two variables</a></li>
-<li class="chapter" data-level="4.8.4" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#summary-4"><i class="fa fa-check"></i><b>4.8.4</b> Summary</a></li>
 </ul></li>
-<li class="chapter" data-level="4.9" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#conclusion"><i class="fa fa-check"></i><b>4.9</b> Conclusion</a><ul>
-<li class="chapter" data-level="4.9.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#whats-to-come-1"><i class="fa fa-check"></i><b>4.9.1</b> What’s to come?</a></li>
-<li class="chapter" data-level="4.9.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#script-of-r-code"><i class="fa fa-check"></i><b>4.9.2</b> Script of R code</a></li>
+<li class="chapter" data-level="5" data-path="5-manip.html"><a href="5-manip.html"><i class="fa fa-check"></i><b>5</b> Data Manipulation via dplyr</a><ul>
+<li class="chapter" data-level="" data-path="5-manip.html"><a href="5-manip.html#needed-packages-2"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="5.1" data-path="5-manip.html"><a href="5-manip.html#the-pipe"><i class="fa fa-check"></i><b>5.1</b> The pipe <code>%&gt;%</code></a></li>
+<li class="chapter" data-level="5.2" data-path="5-manip.html"><a href="5-manip.html#five-main-verbs---the-5mv"><i class="fa fa-check"></i><b>5.2</b> Five Main Verbs - The 5MV</a><ul>
+<li class="chapter" data-level="5.2.1" data-path="5-manip.html"><a href="5-manip.html#filter"><i class="fa fa-check"></i><b>5.2.1</b> 5MV#1: Filter observations using filter</a></li>
+<li class="chapter" data-level="5.2.2" data-path="5-manip.html"><a href="5-manip.html#mv2-summarize-variables-using-summarize"><i class="fa fa-check"></i><b>5.2.2</b> 5MV#2: Summarize variables using summarize</a></li>
+<li class="chapter" data-level="5.2.3" data-path="5-manip.html"><a href="5-manip.html#mv3-group-rows-using-group_by"><i class="fa fa-check"></i><b>5.2.3</b> 5MV#3: Group rows using group_by</a></li>
+<li class="chapter" data-level="5.2.4" data-path="5-manip.html"><a href="5-manip.html#mv4-create-new-variableschange-old-variables-using-mutate"><i class="fa fa-check"></i><b>5.2.4</b> 5MV#4: Create new variables/change old variables using mutate</a></li>
+<li class="chapter" data-level="5.2.5" data-path="5-manip.html"><a href="5-manip.html#arrange"><i class="fa fa-check"></i><b>5.2.5</b> 5MV#5: Reorder the data frame using arrange</a></li>
 </ul></li>
+<li class="chapter" data-level="5.3" data-path="5-manip.html"><a href="5-manip.html#joining-data-frames"><i class="fa fa-check"></i><b>5.3</b> Joining data frames</a><ul>
+<li class="chapter" data-level="5.3.1" data-path="5-manip.html"><a href="5-manip.html#joining-by-key-variables"><i class="fa fa-check"></i><b>5.3.1</b> Joining by Key Variables</a></li>
+<li class="chapter" data-level="5.3.2" data-path="5-manip.html"><a href="5-manip.html#joining-by-key-variables-with-different-names"><i class="fa fa-check"></i><b>5.3.2</b> Joining by Key Variables with Different Names</a></li>
 </ul></li>
-<li class="chapter" data-level="5" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html"><i class="fa fa-check"></i><b>5</b> Data Manipulation via <code id="manip">dplyr</code></a><ul>
-<li class="chapter" data-level="" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#needed-packages-1"><i class="fa fa-check"></i>Needed packages</a></li>
-<li class="chapter" data-level="5.1" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#the-pipe"><i class="fa fa-check"></i><b>5.1</b> The pipe <code>%&gt;%</code></a></li>
-<li class="chapter" data-level="5.2" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#four-main-verbs---the-4mv"><i class="fa fa-check"></i><b>5.2</b> Four Main Verbs - The 4MV</a><ul>
-<li class="chapter" data-level="5.2.1" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#filter-observations-using-filter"><i class="fa fa-check"></i><b>5.2.1</b> Filter observations using <code id="filter">filter</code></a></li>
-<li class="chapter" data-level="5.2.2" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#summarize-variables-using-summarize"><i class="fa fa-check"></i><b>5.2.2</b> Summarize variables using <code>summarize</code></a></li>
-<li class="chapter" data-level="5.2.3" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#create-new-variableschange-old-variables-using-mutate"><i class="fa fa-check"></i><b>5.2.3</b> Create new variables/change old variables using <code>mutate</code></a></li>
-<li class="chapter" data-level="5.2.4" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#reorder-the-data-frame-using-arrange"><i class="fa fa-check"></i><b>5.2.4</b> Reorder the data frame using <code id="arrange">arrange</code></a></li>
+<li class="chapter" data-level="5.4" data-path="5-manip.html"><a href="5-manip.html#optional-other-verbs"><i class="fa fa-check"></i><b>5.4</b> Optional: Other verbs</a><ul>
+<li class="chapter" data-level="5.4.1" data-path="5-manip.html"><a href="5-manip.html#select"><i class="fa fa-check"></i><b>5.4.1</b> Select variables using select</a></li>
+<li class="chapter" data-level="5.4.2" data-path="5-manip.html"><a href="5-manip.html#rename"><i class="fa fa-check"></i><b>5.4.2</b> Rename variables using rename</a></li>
+<li class="chapter" data-level="5.4.3" data-path="5-manip.html"><a href="5-manip.html#find-the-top-number-of-values-using-top_n"><i class="fa fa-check"></i><b>5.4.3</b> Find the top number of values using top_n</a></li>
 </ul></li>
-<li class="chapter" data-level="5.3" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#other-verbs"><i class="fa fa-check"></i><b>5.3</b> Other verbs</a><ul>
-<li class="chapter" data-level="5.3.1" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#select-variables-using-select"><i class="fa fa-check"></i><b>5.3.1</b> Select variables using <code id="select">select</code></a></li>
-<li class="chapter" data-level="5.3.2" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#rename-variables-using-rename"><i class="fa fa-check"></i><b>5.3.2</b> Rename variables using <code id="rename">rename</code></a></li>
-<li class="chapter" data-level="5.3.3" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#find-the-top-number-of-values-using-top_n"><i class="fa fa-check"></i><b>5.3.3</b> Find the top number of values using <code>top_n</code></a></li>
+<li class="chapter" data-level="5.5" data-path="5-manip.html"><a href="5-manip.html#conclusion-1"><i class="fa fa-check"></i><b>5.5</b> Conclusion</a><ul>
+<li class="chapter" data-level="5.5.1" data-path="5-manip.html"><a href="5-manip.html#resources-1"><i class="fa fa-check"></i><b>5.5.1</b> Resources</a></li>
+<li class="chapter" data-level="5.5.2" data-path="5-manip.html"><a href="5-manip.html#script-of-r-code-1"><i class="fa fa-check"></i><b>5.5.2</b> Script of R code</a></li>
+<li class="chapter" data-level="5.5.3" data-path="5-manip.html"><a href="5-manip.html#whats-to-come-2"><i class="fa fa-check"></i><b>5.5.3</b> What’s to come?</a></li>
 </ul></li>
-<li class="chapter" data-level="5.4" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#joiningmerging-data-frames"><i class="fa fa-check"></i><b>5.4</b> Joining/merging data frames</a></li>
-<li class="chapter" data-level="5.5" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#script-of-r-code-1"><i class="fa fa-check"></i><b>5.5</b> Script of R code</a></li>
-<li class="chapter" data-level="5.6" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#whats-to-come-2"><i class="fa fa-check"></i><b>5.6</b> What’s to come?</a></li>
 </ul></li>
 <li class="part"><span><b>II Inference</b></span></li>
-<li class="chapter" data-level="6" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html"><i class="fa fa-check"></i><b>6</b> Simulating Randomness via <code id="sim">mosaic</code></a><ul>
-<li class="chapter" data-level="" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#needed-packages-2"><i class="fa fa-check"></i>Needed packages</a></li>
-<li class="chapter" data-level="6.1" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#random-sampling"><i class="fa fa-check"></i><b>6.1</b> Random sampling</a><ul>
-<li class="chapter" data-level="6.1.1" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#tasting-soup"><i class="fa fa-check"></i><b>6.1.1</b> Tasting soup</a></li>
-<li class="chapter" data-level="6.1.2" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#common-terms"><i class="fa fa-check"></i><b>6.1.2</b> Common terms</a></li>
+<li class="chapter" data-level="6" data-path="6-sim.html"><a href="6-sim.html"><i class="fa fa-check"></i><b>6</b> Simulating Randomness via mosaic</a><ul>
+<li class="chapter" data-level="" data-path="6-sim.html"><a href="6-sim.html#needed-packages-3"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="6.1" data-path="6-sim.html"><a href="6-sim.html#random-sampling"><i class="fa fa-check"></i><b>6.1</b> Random sampling</a><ul>
+<li class="chapter" data-level="6.1.1" data-path="6-sim.html"><a href="6-sim.html#tasting-soup"><i class="fa fa-check"></i><b>6.1.1</b> Tasting soup</a></li>
+<li class="chapter" data-level="6.1.2" data-path="6-sim.html"><a href="6-sim.html#common-terms"><i class="fa fa-check"></i><b>6.1.2</b> Common terms</a></li>
 </ul></li>
-<li class="chapter" data-level="6.2" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#visualizing-sampling"><i class="fa fa-check"></i><b>6.2</b> Visualizing sampling</a><ul>
-<li class="chapter" data-level="6.2.1" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#sampling-distribution"><i class="fa fa-check"></i><b>6.2.1</b> Sampling distribution</a></li>
-<li class="chapter" data-level="6.2.2" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#repeated-sampling-via-do"><i class="fa fa-check"></i><b>6.2.2</b> Repeated sampling via <code>do</code></a></li>
+<li class="chapter" data-level="6.2" data-path="6-sim.html"><a href="6-sim.html#visualizing-sampling"><i class="fa fa-check"></i><b>6.2</b> Visualizing sampling</a><ul>
+<li class="chapter" data-level="6.2.1" data-path="6-sim.html"><a href="6-sim.html#sampling-distribution"><i class="fa fa-check"></i><b>6.2.1</b> Sampling distribution</a></li>
+<li class="chapter" data-level="6.2.2" data-path="6-sim.html"><a href="6-sim.html#repeated-sampling-via-do"><i class="fa fa-check"></i><b>6.2.2</b> Repeated sampling via <code>do</code></a></li>
+</ul></li>
+<li class="chapter" data-level="6.3" data-path="6-sim.html"><a href="6-sim.html#simulation"><i class="fa fa-check"></i><b>6.3</b> Simulation</a></li>
+<li class="chapter" data-level="6.4" data-path="6-sim.html"><a href="6-sim.html#review-of-mosaic-simulation-functions"><i class="fa fa-check"></i><b>6.4</b> Review of <code>mosaic</code> simulation functions</a></li>
+<li class="chapter" data-level="6.5" data-path="6-sim.html"><a href="6-sim.html#conclusion-2"><i class="fa fa-check"></i><b>6.5</b> Conclusion</a><ul>
+<li class="chapter" data-level="6.5.1" data-path="6-sim.html"><a href="6-sim.html#script-of-r-code-2"><i class="fa fa-check"></i><b>6.5.1</b> Script of R code</a></li>
+<li class="chapter" data-level="6.5.2" data-path="6-sim.html"><a href="6-sim.html#whats-to-come-3"><i class="fa fa-check"></i><b>6.5.2</b> What’s to come?</a></li>
 </ul></li>
-<li class="chapter" data-level="6.3" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#simulation"><i class="fa fa-check"></i><b>6.3</b> Simulation</a></li>
-<li class="chapter" data-level="6.4" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#review-of-mosaic-simulation-functions"><i class="fa fa-check"></i><b>6.4</b> Review of <code>mosaic</code> simulation functions</a></li>
-<li class="chapter" data-level="6.5" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#script-of-r-code-2"><i class="fa fa-check"></i><b>6.5</b> Script of R code</a></li>
-<li class="chapter" data-level="6.6" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#whats-to-come-3"><i class="fa fa-check"></i><b>6.6</b> What’s to come?</a></li>
 </ul></li>
 <li class="chapter" data-level="7" data-path="7-hypo.html"><a href="7-hypo.html"><i class="fa fa-check"></i><b>7</b> Hypothesis Testing</a><ul>
-<li class="chapter" data-level="" data-path="7-hypo.html"><a href="7-hypo.html#needed-packages-3"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="" data-path="7-hypo.html"><a href="7-hypo.html#needed-packages-4"><i class="fa fa-check"></i>Needed packages</a></li>
 <li class="chapter" data-level="7.1" data-path="7-hypo.html"><a href="7-hypo.html#when-inference-is-not-needed"><i class="fa fa-check"></i><b>7.1</b> When Inference Is Not Needed</a></li>
 <li class="chapter" data-level="7.2" data-path="7-hypo.html"><a href="7-hypo.html#basics-of-hypothesis-testing"><i class="fa fa-check"></i><b>7.2</b> Basics of Hypothesis Testing</a></li>
 <li class="chapter" data-level="7.3" data-path="7-hypo.html"><a href="7-hypo.html#trial"><i class="fa fa-check"></i><b>7.3</b> Criminal trial analogy</a><ul>
@@ -249,44 +262,50 @@
 <li class="chapter" data-level="7.8.1" data-path="7-hypo.html"><a href="7-hypo.html#example-t-test-for-two-independent-samples"><i class="fa fa-check"></i><b>7.8.1</b> EXAMPLE: <span class="math inline">\(t\)</span>-test for two independent samples</a></li>
 <li class="chapter" data-level="7.8.2" data-path="7-hypo.html"><a href="7-hypo.html#conditions-for-t-test"><i class="fa fa-check"></i><b>7.8.2</b> Conditions for t-test</a></li>
 </ul></li>
-<li class="chapter" data-level="7.9" data-path="7-hypo.html"><a href="7-hypo.html#whats-to-come-4"><i class="fa fa-check"></i><b>7.9</b> What’s to come?</a></li>
+<li class="chapter" data-level="7.9" data-path="7-hypo.html"><a href="7-hypo.html#conclusion-3"><i class="fa fa-check"></i><b>7.9</b> Conclusion</a><ul>
+<li class="chapter" data-level="7.9.1" data-path="7-hypo.html"><a href="7-hypo.html#script-of-r-code-3"><i class="fa fa-check"></i><b>7.9.1</b> Script of R code</a></li>
+<li class="chapter" data-level="7.9.2" data-path="7-hypo.html"><a href="7-hypo.html#whats-to-come-4"><i class="fa fa-check"></i><b>7.9.2</b> What’s to come?</a></li>
+</ul></li>
 </ul></li>
 <li class="chapter" data-level="8" data-path="8-ci.html"><a href="8-ci.html"><i class="fa fa-check"></i><b>8</b> Confidence Intervals</a><ul>
-<li class="chapter" data-level="" data-path="8-ci.html"><a href="8-ci.html#needed-packages-4"><i class="fa fa-check"></i>Needed packages</a></li>
-<li class="chapter" data-level="8.1" data-path="8-ci.html"><a href="8-ci.html#bootstrapping"><i class="fa fa-check"></i><b>8.1</b> Bootstrapping</a></li>
-<li class="chapter" data-level="8.2" data-path="8-ci.html"><a href="8-ci.html#mean_rating"><i class="fa fa-check"></i><b>8.2</b> mean_rating</a><ul>
-<li class="chapter" data-level="8.2.1" data-path="8-ci.html"><a href="8-ci.html#review-of-bootstrapping"><i class="fa fa-check"></i><b>8.2.1</b> Review of Bootstrapping</a></li>
+<li class="chapter" data-level="" data-path="8-ci.html"><a href="8-ci.html#needed-packages-5"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="8.1" data-path="8-ci.html"><a href="8-ci.html#bootstrapping"><i class="fa fa-check"></i><b>8.1</b> Bootstrapping</a><ul>
+<li class="chapter" data-level="8.1.1" data-path="8-ci.html"><a href="8-ci.html#review-of-bootstrapping"><i class="fa fa-check"></i><b>8.1.1</b> Review of Bootstrapping</a></li>
 </ul></li>
-<li class="chapter" data-level="8.3" data-path="8-ci.html"><a href="8-ci.html#relation-to-hypothesis-testing"><i class="fa fa-check"></i><b>8.3</b> Relation to hypothesis testing</a></li>
-<li class="chapter" data-level="8.4" data-path="8-ci.html"><a href="8-ci.html#effect-size"><i class="fa fa-check"></i><b>8.4</b> Effect size</a></li>
-<li class="chapter" data-level="8.5" data-path="8-ci.html"><a href="8-ci.html#script-of-r-code-3"><i class="fa fa-check"></i><b>8.5</b> Script of R code</a></li>
-<li class="chapter" data-level="8.6" data-path="8-ci.html"><a href="8-ci.html#whats-to-come-5"><i class="fa fa-check"></i><b>8.6</b> What’s to come?</a></li>
+<li class="chapter" data-level="8.2" data-path="8-ci.html"><a href="8-ci.html#relation-to-hypothesis-testing"><i class="fa fa-check"></i><b>8.2</b> Relation to hypothesis testing</a></li>
+<li class="chapter" data-level="8.3" data-path="8-ci.html"><a href="8-ci.html#effect-size"><i class="fa fa-check"></i><b>8.3</b> Effect size</a></li>
+<li class="chapter" data-level="8.4" data-path="8-ci.html"><a href="8-ci.html#conclusion-4"><i class="fa fa-check"></i><b>8.4</b> Conclusion</a><ul>
+<li class="chapter" data-level="8.4.1" data-path="8-ci.html"><a href="8-ci.html#script-of-r-code-4"><i class="fa fa-check"></i><b>8.4.1</b> Script of R code</a></li>
+<li class="chapter" data-level="8.4.2" data-path="8-ci.html"><a href="8-ci.html#whats-to-come-5"><i class="fa fa-check"></i><b>8.4.2</b> What’s to come?</a></li>
 </ul></li>
-<li class="chapter" data-level="9" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html"><i class="fa fa-check"></i><b>9</b> Regression via <code id="regress">broom</code></a><ul>
-<li class="chapter" data-level="" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#needed-packages-5"><i class="fa fa-check"></i>Needed packages</a></li>
-<li class="chapter" data-level="9.1" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#example-alaskan-airlines-delays"><i class="fa fa-check"></i><b>9.1</b> EXAMPLE: Alaskan Airlines delays</a></li>
-<li class="chapter" data-level="9.2" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#correlation"><i class="fa fa-check"></i><b>9.2</b> Correlation</a><ul>
-<li class="chapter" data-level="9.2.1" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#correlation-does-not-imply-causation"><i class="fa fa-check"></i><b>9.2.1</b> Correlation does not imply causation</a></li>
 </ul></li>
-<li class="chapter" data-level="9.3" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#linear-regression"><i class="fa fa-check"></i><b>9.3</b> Linear regression</a><ul>
-<li class="chapter" data-level="9.3.1" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#understanding-linear-regression-basics"><i class="fa fa-check"></i><b>9.3.1</b> Understanding linear regression basics</a></li>
-<li class="chapter" data-level="9.3.2" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#the-equation-of-the-line"><i class="fa fa-check"></i><b>9.3.2</b> The equation of the line</a></li>
-<li class="chapter" data-level="9.3.3" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#interpretting-the-slope"><i class="fa fa-check"></i><b>9.3.3</b> Interpretting the slope</a></li>
-<li class="chapter" data-level="9.3.4" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#predicting-values"><i class="fa fa-check"></i><b>9.3.4</b> Predicting values</a></li>
+<li class="chapter" data-level="9" data-path="9-regress.html"><a href="9-regress.html"><i class="fa fa-check"></i><b>9</b> Regression via broom</a><ul>
+<li class="chapter" data-level="" data-path="9-regress.html"><a href="9-regress.html#needed-packages-6"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="9.1" data-path="9-regress.html"><a href="9-regress.html#example-alaskan-airlines-delays"><i class="fa fa-check"></i><b>9.1</b> EXAMPLE: Alaskan Airlines delays</a></li>
+<li class="chapter" data-level="9.2" data-path="9-regress.html"><a href="9-regress.html#correlation"><i class="fa fa-check"></i><b>9.2</b> Correlation</a><ul>
+<li class="chapter" data-level="9.2.1" data-path="9-regress.html"><a href="9-regress.html#correlation-does-not-imply-causation"><i class="fa fa-check"></i><b>9.2.1</b> Correlation does not imply causation</a></li>
 </ul></li>
-<li class="chapter" data-level="9.4" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#inference-for-regression"><i class="fa fa-check"></i><b>9.4</b> Inference for regression</a><ul>
-<li class="chapter" data-level="9.4.1" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#data-2"><i class="fa fa-check"></i><b>9.4.1</b> Data</a></li>
-<li class="chapter" data-level="9.4.2" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#test-statistic-delta-2"><i class="fa fa-check"></i><b>9.4.2</b> Test Statistic <span class="math inline">\(\delta\)</span></a></li>
-<li class="chapter" data-level="9.4.3" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#observed-effect-delta-2"><i class="fa fa-check"></i><b>9.4.3</b> Observed effect <span class="math inline">\(\delta^*\)</span></a></li>
-<li class="chapter" data-level="9.4.4" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#model-of-h_0-2"><i class="fa fa-check"></i><b>9.4.4</b> Model of <span class="math inline">\(H_0\)</span></a></li>
-<li class="chapter" data-level="9.4.5" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#simulated-data-2"><i class="fa fa-check"></i><b>9.4.5</b> Simulated Data</a></li>
-<li class="chapter" data-level="9.4.6" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#distribution-of-delta-under-h_0-2"><i class="fa fa-check"></i><b>9.4.6</b> Distribution of <span class="math inline">\(\delta\)</span> under <span class="math inline">\(H_0\)</span></a></li>
-<li class="chapter" data-level="9.4.7" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#the-p-value-2"><i class="fa fa-check"></i><b>9.4.7</b> The p-value</a></li>
+<li class="chapter" data-level="9.3" data-path="9-regress.html"><a href="9-regress.html#linear-regression"><i class="fa fa-check"></i><b>9.3</b> Linear regression</a><ul>
+<li class="chapter" data-level="9.3.1" data-path="9-regress.html"><a href="9-regress.html#understanding-linear-regression-basics"><i class="fa fa-check"></i><b>9.3.1</b> Understanding linear regression basics</a></li>
+<li class="chapter" data-level="9.3.2" data-path="9-regress.html"><a href="9-regress.html#the-equation-of-the-line"><i class="fa fa-check"></i><b>9.3.2</b> The equation of the line</a></li>
+<li class="chapter" data-level="9.3.3" data-path="9-regress.html"><a href="9-regress.html#interpreting-the-slope"><i class="fa fa-check"></i><b>9.3.3</b> Interpreting the slope</a></li>
+<li class="chapter" data-level="9.3.4" data-path="9-regress.html"><a href="9-regress.html#predicting-values"><i class="fa fa-check"></i><b>9.3.4</b> Predicting values</a></li>
+</ul></li>
+<li class="chapter" data-level="9.4" data-path="9-regress.html"><a href="9-regress.html#inference-for-regression"><i class="fa fa-check"></i><b>9.4</b> Inference for regression</a><ul>
+<li class="chapter" data-level="9.4.1" data-path="9-regress.html"><a href="9-regress.html#data-2"><i class="fa fa-check"></i><b>9.4.1</b> Data</a></li>
+<li class="chapter" data-level="9.4.2" data-path="9-regress.html"><a href="9-regress.html#test-statistic-delta-2"><i class="fa fa-check"></i><b>9.4.2</b> Test Statistic <span class="math inline">\(\delta\)</span></a></li>
+<li class="chapter" data-level="9.4.3" data-path="9-regress.html"><a href="9-regress.html#observed-effect-delta-2"><i class="fa fa-check"></i><b>9.4.3</b> Observed effect <span class="math inline">\(\delta^*\)</span></a></li>
+<li class="chapter" data-level="9.4.4" data-path="9-regress.html"><a href="9-regress.html#model-of-h_0-2"><i class="fa fa-check"></i><b>9.4.4</b> Model of <span class="math inline">\(H_0\)</span></a></li>
+<li class="chapter" data-level="9.4.5" data-path="9-regress.html"><a href="9-regress.html#simulated-data-2"><i class="fa fa-check"></i><b>9.4.5</b> Simulated Data</a></li>
+<li class="chapter" data-level="9.4.6" data-path="9-regress.html"><a href="9-regress.html#distribution-of-delta-under-h_0-2"><i class="fa fa-check"></i><b>9.4.6</b> Distribution of <span class="math inline">\(\delta\)</span> under <span class="math inline">\(H_0\)</span></a></li>
+<li class="chapter" data-level="9.4.7" data-path="9-regress.html"><a href="9-regress.html#the-p-value-2"><i class="fa fa-check"></i><b>9.4.7</b> The p-value</a></li>
+</ul></li>
+<li class="chapter" data-level="9.5" data-path="9-regress.html"><a href="9-regress.html#resid"><i class="fa fa-check"></i><b>9.5</b> Residual analysis</a></li>
+<li class="chapter" data-level="9.6" data-path="9-regress.html"><a href="9-regress.html#conditions-for-regression"><i class="fa fa-check"></i><b>9.6</b> Conditions for regression</a></li>
+<li class="chapter" data-level="9.7" data-path="9-regress.html"><a href="9-regress.html#conclusion-5"><i class="fa fa-check"></i><b>9.7</b> Conclusion</a><ul>
+<li class="chapter" data-level="9.7.1" data-path="9-regress.html"><a href="9-regress.html#script-of-r-code-5"><i class="fa fa-check"></i><b>9.7.1</b> Script of R code</a></li>
+<li class="chapter" data-level="9.7.2" data-path="9-regress.html"><a href="9-regress.html#whats-to-come-6"><i class="fa fa-check"></i><b>9.7.2</b> What’s to come?</a></li>
 </ul></li>
-<li class="chapter" data-level="9.5" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#resid"><i class="fa fa-check"></i><b>9.5</b> Residual analysis</a></li>
-<li class="chapter" data-level="9.6" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#conditions-for-regression"><i class="fa fa-check"></i><b>9.6</b> Conditions for regression</a></li>
-<li class="chapter" data-level="9.7" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#script-of-r-code-4"><i class="fa fa-check"></i><b>9.7</b> Script of R code</a></li>
-<li class="chapter" data-level="9.8" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#whats-to-come-6"><i class="fa fa-check"></i><b>9.8</b> What’s to come?</a></li>
 </ul></li>
 <li class="part"><span><b>III Conclusion</b></span></li>
 <li class="chapter" data-level="10" data-path="10-effective-data-storytelling.html"><a href="10-effective-data-storytelling.html"><i class="fa fa-check"></i><b>10</b> Effective Data Storytelling</a><ul>
@@ -304,7 +323,8 @@
 </ul></li>
 </ul></li>
 <li class="chapter" data-level="B" data-path="B-appendixB.html"><a href="B-appendixB.html"><i class="fa fa-check"></i><b>B</b> Inference Examples</a><ul>
-<li class="chapter" data-level="B.1" data-path="B-appendixB.html"><a href="B-appendixB.html#needed-packages-6"><i class="fa fa-check"></i><b>B.1</b> Needed packages</a></li>
+<li class="chapter" data-level="" data-path="B-appendixB.html"><a href="B-appendixB.html#needed-packages-7"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="B.1" data-path="B-appendixB.html"><a href="B-appendixB.html#inference-mind-map"><i class="fa fa-check"></i><b>B.1</b> Inference Mind Map</a></li>
 <li class="chapter" data-level="B.2" data-path="B-appendixB.html"><a href="B-appendixB.html#one-mean"><i class="fa fa-check"></i><b>B.2</b> One Mean</a><ul>
 <li class="chapter" data-level="B.2.1" data-path="B-appendixB.html"><a href="B-appendixB.html#problem-statement"><i class="fa fa-check"></i><b>B.2.1</b> Problem Statement</a></li>
 <li class="chapter" data-level="B.2.2" data-path="B-appendixB.html"><a href="B-appendixB.html#competing-hypotheses"><i class="fa fa-check"></i><b>B.2.2</b> Competing Hypotheses</a></li>
@@ -352,6 +372,7 @@
 </ul></li>
 </ul></li>
 <li class="chapter" data-level="C" data-path="C-appendixC.html"><a href="C-appendixC.html"><i class="fa fa-check"></i><b>C</b> Reach for the Starts</a><ul>
+<li class="chapter" data-level="" data-path="C-appendixC.html"><a href="C-appendixC.html#needed-packages-8"><i class="fa fa-check"></i>Needed packages</a></li>
 <li class="chapter" data-level="C.1" data-path="C-appendixC.html"><a href="C-appendixC.html#sorted-barplots"><i class="fa fa-check"></i><b>C.1</b> Sorted barplots</a></li>
 <li class="chapter" data-level="C.2" data-path="C-appendixC.html"><a href="C-appendixC.html#interactive-graphics"><i class="fa fa-check"></i><b>C.2</b> Interactive graphics</a><ul>
 <li class="chapter" data-level="C.2.1" data-path="C-appendixC.html"><a href="C-appendixC.html#interactive-line-graphs"><i class="fa fa-check"></i><b>C.2.1</b> Interactive line-graphs</a></li>
@@ -377,13 +398,12 @@ <h1>
             <section class="normal" id="section-">
 <div id="hypo" class="section level1">
 <h1><span class="header-section-number">7</span> Hypothesis Testing</h1>
-<p>We saw some of the main concepts of hypothesis testing introduced in Chapter <a href="6-simulating-randomness-via-mosaic.html#sim"><strong>??</strong></a>. We will expand further on these ideas here and also provide a framework for understanding hypothesis tests in general. Instead of presenting you with lots of different formulas and scenarios, we hope to build a way to think about all hypothesis tests. You can then adapt to different scenarios as needed down the road when you encounter different statistical situations.</p>
-<p>The same can be said for confidence intervals. There is one general framework that applies to all confidence intervals and we will elaborate on this further in Chapter <a href="8-ci.html#ci">8</a>. The specifics may change slightly for each variation, but the important idea is to understand the general framework so that you can apply it to more specific problems. We believe that this approach is much better in the long-term than teaching you specific tests and confidence intervals rigorously. You can find full worked out examples for five common hypothesis tests and their corresponding confidence intervals in Appendix <a href="B-appendixB.html#appendixB">B</a>. We recommend that you carefully review these examples as they also cover how the general frameworks apply to traditional normal-based methodologies like the <span class="math inline">\(t\)</span>-test and normal-theory confidence intervals. You’ll see there that these methods are just approximations for the general computational frameworks, but require conditions to be met for their results to be valid. The general frameworks using randomization, simulation, and bootstrapping do not hold the same sorts of restrictions and further advance computational thinking, which is one big reason for their emphasis throughout this textbook.</p>
-<div id="needed-packages-3" class="section level2 unnumbered">
-<h2>Needed packages</h2>
+<p>We saw some of the main concepts of hypothesis testing introduced in Chapter <a href="6-sim.html#sim">6</a>. We will expand further on these ideas here and also provide a framework for understanding hypothesis tests in general. Instead of presenting you with lots of different formulas and scenarios, we hope to build a way to think about all hypothesis tests. You can then adapt to different scenarios as needed down the road when you encounter different statistical situations.</p>
+<p>The same can be said for confidence intervals. There is one general framework that applies to all confidence intervals and we will elaborate on this further in Chapter <a href="8-ci.html#ci">8</a>. The specifics may change slightly for each variation, but the important idea is to understand the general framework so that you can apply it to more specific problems. We believe that this approach is much better in the long term than teaching you specific tests and confidence intervals rigorously. You can find fully worked-out examples for five common hypothesis tests and their corresponding confidence intervals in Appendix <a href="B-appendixB.html#appendixB">B</a>. We recommend that you carefully review these examples as they also cover how the general frameworks apply to traditional normal-based methodologies like the <span class="math inline">\(t\)</span>-test and normal-theory confidence intervals. You’ll see there that these methods are just approximations for the general computational frameworks, but require conditions to be met for their results to be valid. The general frameworks using randomization, simulation, and bootstrapping do not hold the same sorts of restrictions and further advance computational thinking, which is one big reason for their emphasis throughout this textbook.</p>
+<div id="needed-packages-4" class="section level3 unnumbered">
+<h3>Needed packages</h3>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">library</span>(dplyr)
 <span class="kw">library</span>(ggplot2)
-<span class="kw">library</span>(okcupiddata)
 <span class="kw">library</span>(mosaic)
 <span class="kw">library</span>(knitr)
 <span class="kw">library</span>(nycflights13)</code></pre></div>
@@ -415,12 +435,12 @@ <h2><span class="header-section-number">7.1</span> When Inference Is Not Needed<
 <tr class="odd">
 <td align="left">BOS</td>
 <td align="right">38.35</td>
-<td align="right">5.726732</td>
+<td align="right">5.727</td>
 </tr>
 <tr class="even">
 <td align="left">SFO</td>
 <td align="right">345.61</td>
-<td align="right">15.354988</td>
+<td align="right">15.355</td>
 </tr>
 </tbody>
 </table>
@@ -437,7 +457,7 @@ <h2><span class="header-section-number">7.1</span> When Inference Is Not Needed<
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">library</span>(ggplot2)
 <span class="kw">ggplot</span>(<span class="dt">data =</span> bos_sfo, <span class="dt">mapping =</span> <span class="kw">aes</span>(<span class="dt">x =</span> dest, <span class="dt">y =</span> air_time)) +
 <span class="st">  </span><span class="kw">geom_boxplot</span>()</code></pre></div>
-<p><img src="ismaykim_files/figure-html/unnamed-chunk-72-1.png" width="\textwidth" style="display: block; margin: auto;" /></p>
+<p><img src="ismaykim_files/figure-html/unnamed-chunk-77-1.png" width="\textwidth" style="display: block; margin: auto;" /></p>
 <p>Since there is no overlap at all, we can conclude that the <code>air_time</code> for San Francisco flights is statistically greater (at any level of significance) than the <code>air_time</code> for Boston flights. This is a clear example of not needing to do anything more than some simple descriptive statistics to get an appropriate inferential conclusion. This is one reason why you should <strong>ALWAYS</strong> investigate the sample data first using <code>dplyr</code> and <code>ggplot2</code> via exploratory data analysis.</p>
 <p>As you get more and more practice with hypothesis testing, you’ll be better able to determine in many cases whether or not the results will be statistically significant. There are circumstances where it is difficult to tell, but you should always try to make a guess FIRST about significance after you have completed your data exploration and before you actually begin the inferential techniques.</p>
 </div>
@@ -462,8 +482,9 @@ <h2><span class="header-section-number">7.2</span> Basics of Hypothesis Testing<
 </div>
 <div id="trial" class="section level2">
 <h2><span class="header-section-number">7.3</span> Criminal trial analogy</h2>
-<p>We can think of hypothesis testing in the same context as a criminal trial in the United States. A criminal trial in the United States is a familiar situation in which a choice between two contradictory claims must be made. 1. The accuser of the crime must be judged either guilty or not guilty.</p>
-<ol start="2" style="list-style-type: decimal">
+<p>We can think of hypothesis testing in the same context as a criminal trial in the United States. A criminal trial in the United States is a familiar situation in which a choice between two contradictory claims must be made.</p>
+<ol style="list-style-type: decimal">
+<li><p>The person accused of the crime must be judged either guilty or not guilty.</p></li>
 <li><p>Under the U.S. system of justice, the individual on trial is initially presumed not guilty.</p></li>
 <li><p>Only STRONG EVIDENCE to the contrary causes the not guilty claim to be rejected in favor of a guilty verdict.</p></li>
 <li><p>The phrase “beyond a reasonable doubt” is often used to set the cutoff value for when enough evidence has been given to convict.</p></li>
@@ -497,10 +518,14 @@ <h2><span class="header-section-number">7.4</span> Types of Errors in Hypothesis
 <li>an innocent person is convicted (found guilty) or</li>
 <li>a guilty person is set free (found not guilty).</li>
 </ul>
-<p>The possible errors in a hypothesis test are - rejecting <span class="math inline">\(H_0\)</span> when in fact <span class="math inline">\(H_0\)</span> is true (Type I Error) - failing to reject <span class="math inline">\(H_0\)</span> when in fact <span class="math inline">\(H_0\)</span> is false (Type II Error)</p>
+<p>The possible errors in a hypothesis test are</p>
+<ul>
+<li>rejecting <span class="math inline">\(H_0\)</span> when in fact <span class="math inline">\(H_0\)</span> is true (Type I Error) or</li>
+<li>failing to reject <span class="math inline">\(H_0\)</span> when in fact <span class="math inline">\(H_0\)</span> is false (Type II Error).</li>
+</ul>
 <p>The risk of error is the price researchers pay for basing an inference about a population on a sample. With any reasonable sample-based procedure, there is some chance that a Type I error will be made and some chance that a Type II error will occur.</p>
 <p>To help understand the concepts of Type I error and Type II error, observe the following table:</p>
-<div class="figure" style="text-align: center"><span id="fig:unnamed-chunk-73"></span>
+<div class="figure" style="text-align: center"><span id="fig:unnamed-chunk-78"></span>
 <img src="images/errors.png" alt="Type I and Type II errors" width="\textwidth" />
 <p class="caption">
 Figure 7.2: Type I and Type II errors
@@ -518,19 +543,19 @@ <h2><span class="header-section-number">7.4</span> Types of Errors in Hypothesis
 </ul>
 <p>Ideally, we want <span class="math inline">\(\alpha = 0\)</span> and <span class="math inline">\(\beta = 0\)</span>, meaning that the chance of making an error does not exist. When we have to use incomplete information (sample data), it is not possible to have both <span class="math inline">\(\alpha = 0\)</span> and <span class="math inline">\(\beta = 0\)</span>. We will always have the possibility of at least one error existing when we use sample data.</p>
 <p>Usually, what is done is that <span class="math inline">\(\alpha\)</span> is set before the hypothesis test is conducted and then the evidence is judged against that significance level. Common values for <span class="math inline">\(\alpha\)</span> are 0.05, 0.01, and 0.10. If <span class="math inline">\(\alpha = 0.05\)</span>, we are using a testing procedure that, used over and over with different samples, rejects a TRUE null hypothesis five percent of the time.</p>
-<p>So if we can set <span class="math inline">\(\alpha\)</span> to be whatever we want, why choose 0.05 instead of 0.01 or even better 0.0000000000000001? Well, a small <span class="math inline">\(\alpha\)</span> means the test procedure requires the evidence against <span class="math inline">\(H_0\)</span> to be <strong>very strong</strong> before we can reject <span class="math inline">\(H_0\)</span>. This means we will almost never reject <span class="math inline">\(H_0\)</span> if <span class="math inline">\(\alpha\)</span> is very small. If we almost never reject <span class="math inline">\(H_0\)</span>, the probability of a Type II Error – failing to reject <span class="math inline">\(H_0\)</span> when we should – will <em>increase</em>! Thus, as <span class="math inline">\(\alpha\)</span> decreases, <span class="math inline">\(\beta\)</span> increases and as <span class="math inline">\(\alpha\)</span> increases, <span class="math inline">\(\beta\)</span> decreases. We, therefore, need to strike a balance in <span class="math inline">\(\alpha\)</span> and <span class="math inline">\(\beta\)</span> and the common values of 0.05, 0.01, and 0.10 usually lead to a nice balance.</p>
+<p>So if we can set <span class="math inline">\(\alpha\)</span> to be whatever we want, why choose 0.05 instead of 0.01 or even better 0.0000000000000001? Well, a small <span class="math inline">\(\alpha\)</span> means the test procedure requires the evidence against <span class="math inline">\(H_0\)</span> to be <strong>very strong</strong> before we can reject <span class="math inline">\(H_0\)</span>. This means we will almost never reject <span class="math inline">\(H_0\)</span> if <span class="math inline">\(\alpha\)</span> is very small. If we almost never reject <span class="math inline">\(H_0\)</span>, the probability of a Type II Error – failing to reject <span class="math inline">\(H_0\)</span> when we should – will <em>increase</em>! Thus, as <span class="math inline">\(\alpha\)</span> decreases, <span class="math inline">\(\beta\)</span> increases and as <span class="math inline">\(\alpha\)</span> increases, <span class="math inline">\(\beta\)</span> decreases. We, therefore, need to strike a balance in <span class="math inline">\(\alpha\)</span> and <span class="math inline">\(\beta\)</span> and the common values for <span class="math inline">\(\alpha\)</span> of 0.05, 0.01, and 0.10 usually lead to a nice balance.</p>
 <hr />
 <div class="learncheck">
 <p>
 <strong><em>Learning check</em></strong>
 </p>
 </div>
-<p><strong>(LC7.2)</strong> Reproduce the table above, but for a hypothesis test, instead of the one provided for a criminal trial.</p>
+<p><strong>(LC7.2)</strong> Reproduce the table above about errors, but for a hypothesis test, instead of the one provided for a criminal trial.</p>
 <hr />
 <div id="logic-of-hypothesis-testing" class="section level3">
 <h3><span class="header-section-number">7.4.1</span> Logic of Hypothesis Testing</h3>
 <ul>
-<li>Take a random sample (or samples) from a population (or two populations)</li>
+<li>Take a random sample (or samples) from a population (or multiple populations)</li>
 <li>If the sample data are consistent with the null hypothesis, do not reject the null hypothesis.</li>
 <li>If the sample data are inconsistent with the null hypothesis (in the direction of the alternative hypothesis), reject the null hypothesis and conclude that there is evidence the alternative hypothesis is true (based on the particular sample collected).</li>
 </ul>
@@ -538,11 +563,7 @@ <h3><span class="header-section-number">7.4.1</span> Logic of Hypothesis Testing
 </div>
 <div id="statistical-significance" class="section level2">
 <h2><span class="header-section-number">7.5</span> Statistical Significance</h2>
-<p>The idea that sample results are more extreme than we would reasonably expect to see by random chance if the null hypothesis were true is the fundamental idea behind statistical hypothesis tests. If data as extreme would be very unlikely if the null hypothesis were true, we say the data are <strong>statistically significant</strong>. Statistically significant data provide convincing evidence against the null hypothesis in favor of the alternative, and allow us to generalize our sample results to the claim about the population.</p>
-<hr />
-<p><strong>Definition: Statistical Significance</strong></p>
-<p>When results as extreme as the observed sample statistic are unlikely to occur by random chance alone (assuming the null hypothesis is true), we say the sample results/statistics are <em>statistically significant</em>. If our sample is statistically significant, we have convincing evidence against <span class="math inline">\(H_0\)</span> and in favor of <span class="math inline">\(H_a\)</span>.</p>
-<hr />
+<p>The idea that sample results are more extreme than we would reasonably expect to see by random chance if the null hypothesis were true is the fundamental idea behind statistical hypothesis tests. If data at least as extreme would be very unlikely if the null hypothesis were true, we say the data are <strong>statistically significant</strong>. Statistically significant data provide convincing evidence against the null hypothesis in favor of the alternative, and allow us to generalize our sample results to the claim about the population.</p>
 <div class="learncheck">
 <p>
 <strong><em>Learning check</em></strong>
@@ -562,10 +583,10 @@ <h2><span class="header-section-number">7.6</span> EXAMPLE: Revisiting the Lady
 Figure 7.3: Hypothesis Testing Framework
 </p>
 </div>
-<p>We will now walk-through how each of the steps to the diagram apply to determining whether the lady tasting tea was actually better than chance at determining whether or not milk was added first. We will see that the process of creating a null distribution is a statistical way to quantifying surprise.</p>
+<p>We will now walk through how each of the steps in the diagram applies to determining whether the lady tasting tea was actually better than chance at determining whether or not milk was added first. We will see that the process of creating a null distribution is a statistical way of quantifying surprise.</p>
 <div id="data" class="section level3">
 <h3><span class="header-section-number">7.6.1</span> Data</h3>
-<p>Let’s assume as we did in Chapter <a href="6-simulating-randomness-via-mosaic.html#sim"><strong>??</strong></a>, that the lady is correct in determining whether milk was added first or not in 9 out of 10 trials. Our data, therefore, may look something like</p>
+<p>Let’s assume as we did in Chapter <a href="6-sim.html#sim">6</a> that the lady is correct in determining whether milk was added first or not in 9 out of 10 trials. Our data, therefore, may look something like</p>
 <table>
 <tbody>
 <tr class="odd">
@@ -615,12 +636,13 @@ <h3><span class="header-section-number">7.6.4</span> Model of <span class="math
 </div>
 <div id="simulated-data" class="section level3">
 <h3><span class="header-section-number">7.6.5</span> Simulated Data</h3>
-<p>We now want to use this null hypothesis to simulate the test statistic assuming that the null hypothesis is true. Therefore, we want to figure out a way to simulate in 10 trials, getting either the choice Correct or Incorrect, assuming that the probability of success (getting it Correct) in any given trial is 0.5.</p>
+<p>We now want to use this null hypothesis to simulate the test statistic assuming that the null hypothesis is true. Therefore, we want to figure out a way to simulate 10 trials, getting either the choice Correct or Incorrect, assuming that the probability of success (getting it Correct) in any given trial is 0.5.</p>
 <p><strong>Tactile simulation</strong></p>
 <p>When you are presented with a hypothesis testing problem, frequently the most challenging portion is setting up how to simulate the data assuming the null hypothesis is true. To facilitate with this, setting up a tactile, hands on experiment can help.</p>
-<p>In this case, flipping a fair coin is a great way to simulate this process. To simulate 10 trials, we could flip the fair coin and record Heads as Correct and Tails as Incorrect.</p>
+<p>In this case, flipping a fair coin is a great way to simulate this process. This simulates how the sample could be collected assuming the null hypothesis is true. To simulate 10 trials, we could flip the fair coin and record Heads as Correct and Tails as Incorrect.</p>
 <p>Some simulated data using this coin flipping procedure may look like the following. Note that this data frame is not tidy, but is a convenient way to look at the results of the simulation in this wide format. The numbers on the fair left correspond to the number of the trial.</p>
 <table>
+<caption><span id="tab:sample-table">Table 7.1: </span>A table of three sets of 10 coin flips</caption>
 <thead>
 <tr class="header">
 <th></th>
@@ -697,12 +719,12 @@ <h3><span class="header-section-number">7.6.5</span> Simulated Data</h3>
 </div>
 <div id="distribution-of-delta-under-h_0" class="section level3">
 <h3><span class="header-section-number">7.6.6</span> Distribution of <span class="math inline">\(\delta\)</span> under <span class="math inline">\(H_0\)</span></h3>
-<p>We could continue this process say 10,000 times by flipping a coin in sets of 10 for 10,000 repetitions and counting and taking note of how many heads out of 10 we have for each set. It’s at this point that you realize that a computer can do this procedure much faster and more efficient than the tactile experiment with a coin.</p>
+<p>We could continue this process, say, 10,000 times by flipping a coin in sets of 10 for 10,000 repetitions and counting and taking note of how many heads out of 10 we have for each set. It’s at this point that you surely realize that a computer can do this procedure much faster and more efficiently than the tactile experiment with a coin.</p>
 <p>Recall that we’ve already created the distribution of 10,000 such coin flips and we’ve stored these values in the <code>heads</code> variable in the <code>simGuesses</code> data frame:</p>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">library</span>(ggplot2)
 <span class="kw">ggplot</span>(<span class="dt">data =</span> simGuesses, <span class="kw">aes</span>(<span class="dt">x =</span> <span class="kw">factor</span>(heads))) +
 <span class="st">  </span><span class="kw">geom_bar</span>()</code></pre></div>
-<p><img src="ismaykim_files/figure-html/unnamed-chunk-77-1.png" width="\textwidth" style="display: block; margin: auto;" /></p>
+<p><img src="ismaykim_files/figure-html/unnamed-chunk-81-1.png" width="\textwidth" style="display: block; margin: auto;" /></p>
 </div>
 <div id="the-p-value" class="section level3">
 <h3><span class="header-section-number">7.6.7</span> The p-value</h3>
@@ -711,682 +733,40 @@ <h3><span class="header-section-number">7.6.7</span> The p-value</h3>
 <p>The <strong>p-value</strong> is the probability of observing a sample statistic as extreme or more extreme than what was observed, assuming that the null hypothesis of a by chance operation is true.</p>
 <hr />
 <p>This definition may be a little intimidating the first time you read it, but it’s important to come back to this “The Lady Tasting Tea” problem whenever you encounter <span class="math inline">\(p\)</span>-values as you begin to learn about the concept. Here the <span class="math inline">\(p\)</span>-value corresponds to how many times in our <strong>null distribution</strong> of <code>heads</code> 9 or more heads occurred.</p>
-<p>We can use another neat feature of R to calculate the <span class="math inline">\(p\)</span>-value for this problem. Note that “more extreme” in this case corresponds to looking at values of 9 or greater since our alternative hypothesis invokes a right-tail test corresponding to a “greater than” hypothesis of <span class="math inline">\(H_a: \pi &gt; 0.5\)</span>. In other words, we are looking to see how likely it is for the lady to pick 9 or more correct instead of 9 or less correct. We’d like to go in the right direction.</p>
+<p>We can use another neat feature of R to calculate the <span class="math inline">\(p\)</span>-value for this problem. Note that “more extreme” in this case corresponds to looking at values of 9 or greater since our alternative hypothesis invokes a right-tail test corresponding to a “greater than” hypothesis of <span class="math inline">\(H_a: \tau &gt; 5\)</span>. In other words, we are looking to see how likely it is for the lady to pick 9 or more correct instead of 9 or less correct. We’d like to go in the right direction.</p>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">pvalue_tea &lt;-<span class="st"> </span>simGuesses %&gt;%
 <span class="st">  </span><span class="kw">filter</span>(heads &gt;=<span class="st"> </span><span class="dv">9</span>) %&gt;%
 <span class="st">  </span><span class="kw">nrow</span>() /<span class="st"> </span><span class="kw">nrow</span>(simGuesses)</code></pre></div>
 <p>Let’s walk through each step of this calculation:</p>
 <ol style="list-style-type: decimal">
 <li><p>First, <code>pvalue_tea</code> will be the name of our calculated <span class="math inline">\(p\)</span>-value and the assignment operator <code>&lt;-</code> directs us to this naming.</p></li>
-<li><p>We are working with the <code>simGuesses</code> data frame here so that comes immediately before the pipe operator.</p></li>
-<li><p>We would like to only focus on the rows in our <code>simGuesses</code> data frame that have <code>heads</code> values of 9 or 10. This represents simulated statistics “as extreme or more extreme” than what we observed (9 correct guesses out of 10). Let’s get a glimpse of what we have up to this point:</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">kable</span>(simGuesses %&gt;%<span class="st"> </span><span class="kw">filter</span>(heads &gt;=<span class="st"> </span><span class="dv">9</span>))    </code></pre></div>
-<table>
-<thead>
-<tr class="header">
-<th align="right">n</th>
-<th align="right">heads</th>
-<th align="right">tails</th>
-<th align="right">prop</th>
-</tr>
-</thead>
-<tbody>
-<tr class="odd">
-<td align="right">10</td>
-<td align="right">9</td>
-<td align="right">1</td>
-<td align="right">0.9</td>
-</tr>
-<tr class="even">
-<td align="right">10</td>
-<td align="right">10</td>
-<td align="right">0</td>
-<td align="right">1.0</td>
-</tr>
-<tr class="odd">
-<td align="right">10</td>
-<td align="right">9</td>
-<td align="right">1</td>
-<td align="right">0.9</td>
-</tr>
-<tr class="even">
-<td align="right">10</td>
-<td align="right">9</td>
-<td align="right">1</td>
-<td align="right">0.9</td>
-</tr>
-<tr class="odd">
-<td align="right">10</td>
-<td align="right">10</td>
-<td align="right">0</td>
-<td align="right">1.0</td>
-</tr>
-<tr class="even">
-<td align="right">10</td>
-<td align="right">9</td>
-<td align="right">1</td>
-<td align="right">0.9</td>
-</tr>
-<tr class="odd">
-<td align="right">10</td>
-<td align="right">9</td>
-<td align="right">1</td>
-<td align="right">0.9</td>
-</tr>
-<tr class="even">
-<td align="right">10</td>
-<td align="right">9</td>
-<td align="right">1</td>
-<td align="right">0.9</td>
-</tr>
-<tr class="odd">
-<td align="right">10</td>
-<td align="right">9</td>
-<td align="right">1</td>
-<td align="right">0.9</td>
-</tr>
-<tr class="even">
-<td align="right">10</td>
-<td align="right">9</td>
-<td align="right">1</td>
-<td align="right">0.9</td>
-</tr>
-<tr class="odd">
-<td align="right">10</td>
-<td align="right">9</td>
-<td align="right">1</td>
-<td align="right">0.9</td>
-</tr>
-<tr class="even">
-<td align="right">10</td>
-<td align="right">9</td>
-<td align="right">1</td>
-<td align="right">0.9</td>
-</tr>
-<tr class="odd">
-<td align="right">10</td>
-<td align="right">9</td>
-<td align="right">1</td>
-<td align="right">0.9</td>
-</tr>
-<tr class="even">
-<td align="right">10</td>
-<td align="right">9</td>
-<td align="right">1</td>
-<td align="right">0.9</td>
-</tr>
-<tr class="odd">
-<td align="right">10</td>
-<td align="right">9</td>
-<td align="right">1</td>
-<td align="right">0.9</td>
-</tr>
-<tr class="even">
-<td align="right">10</td>
-<td align="right">9</td>
-<td align="right">1</td>
-<td align="right">0.9</td>
-</tr>
-<tr class="odd">
-<td align="right">10</td>
-<td align="right">9</td>
-<td align="right">1</td>
-<td align="right">0.9</td>
-</tr>
-<tr class="even">
-<td align="right">10</td>
-<td align="right">9</td>
-<td align="right">1</td>
-<td align="right">0.9</td>
-</tr>
-<tr class="odd">
-<td align="right">10</td>
-<td align="right">9</td>
-<td align="right">1</td>
-<td align="right">0.9</td>
-</tr>
-<tr class="even">
-<td align="right">10</td>
-<td align="right">9</td>
-<td align="right">1</td>
-<td align="right">0.9</td>
-</tr>
-<tr class="odd">
-<td align="right">10</td>
-<td align="right">9</td>
-<td align="right">1</td>
-<td align="right">0.9</td>
-</tr>
-<tr class="even">
-<td align="right">10</td>
-<td align="right">9</td>
-<td align="right">1</td>
-<td align="right">0.9</td>
-</tr>
-<tr class="odd">
-<td align="right">10</td>
-<td align="right">10</td>
-<td align="right">0</td>
-<td align="right">1.0</td>
-</tr>
-<tr class="even">
-<td align="right">10</td>
-<td align="right">10</td>
-<td align="right">0</td>
-<td align="right">1.0</td>
-</tr>
-<tr class="odd">
-<td align="right">10</td>
-<td align="right">10</td>
-<td align="right">0</td>
-<td align="right">1.0</td>
-</tr>
-<tr class="even">
-<td align="right">10</td>
-<td align="right">9</td>
-<td align="right">1</td>
-<td align="right">0.9</td>
-</tr>
-<tr class="odd">
-<td align="right">10</td>
-<td align="right">9</td>
-<td align="right">1</td>
-<td align="right">0.9</td>
-</tr>
-<tr class="even">
-<td align="right">10</td>
-<td align="right">9</td>
-<td align="right">1</td>
-<td align="right">0.9</td>
-</tr>
-<tr class="odd">
-<td align="right">10</td>
-<td align="right">9</td>
-<td align="right">1</td>
-<td align="right">0.9</td>
-</tr>
-<tr class="even">
-<td align="right">10</td>
-<td align="right">9</td>
-<td align="right">1</td>
-<td align="right">0.9</td>
-</tr>
-<tr class="odd">
-<td align="right">10</td>
-<td align="right">9</td>
-<td align="right">1</td>
-<td align="right">0.9</td>
-</tr>
-<tr class="even">
-<td align="right">10</td>
-<td align="right">9</td>
-<td align="right">1</td>
-<td align="right">0.9</td>
-</tr>
-<tr class="odd">
-<td align="right">10</td>
-<td align="right">10</td>
-<td align="right">0</td>
-<td align="right">1.0</td>
-</tr>
-<tr class="even">
-<td align="right">10</td>
-<td align="right">9</td>
-<td align="right">1</td>
-<td align="right">0.9</td>
-</tr>
-<tr class="odd">
-<td align="right">10</td>
-<td align="right">9</td>
-<td align="right">1</td>
-<td align="right">0.9</td>
-</tr>
-<tr class="even">
-<td align="right">10</td>
-<td align="right">10</td>
-<td align="right">0</td>
-<td align="right">1.0</td>
-</tr>
-<tr class="odd">
-<td align="right">10</td>
-<td align="right">9</td>
-<td align="right">1</td>
-<td align="right">0.9</td>
-</tr>
-<tr class="even">
-<td align="right">10</td>
-<td align="right">9</td>
-<td align="right">1</td>
-<td align="right">0.9</td>
-</tr>
-<tr class="odd">
-<td align="right">10</td>
-<td align="right">9</td>
-<td align="right">1</td>
-<td align="right">0.9</td>
-</tr>
-<tr class="even">
-<td align="right">10</td>
-<td align="right">9</td>
-<td align="right">1</td>
-<td align="right">0.9</td>
-</tr>
-<tr class="odd">
-<td align="right">10</td>
-<td align="right">9</td>
-<td align="right">1</td>
-<td align="right">0.9</td>
-</tr>
-<tr class="even">
-<td align="right">10</td>
-<td align="right">9</td>
-<td align="right">1</td>
-<td align="right">0.9</td>
-</tr>
-<tr class="odd">
-<td align="right">10</td>
-<td align="right">9</td>
-<td align="right">1</td>
-<td align="right">0.9</td>
-</tr>
-<tr class="even">
-<td align="right">10</td>
-<td align="right">9</td>
-<td align="right">1</td>
-<td align="right">0.9</td>
-</tr>
-<tr class="odd">
-<td align="right">10</td>
-<td align="right">9</td>
-<td align="right">1</td>
-<td align="right">0.9</td>
-</tr>
-<tr class="even">
-<td align="right">10</td>
-<td align="right">9</td>
-<td align="right">1</td>
-<td align="right">0.9</td>
-</tr>
-<tr class="odd">
-<td align="right">10</td>
-<td align="right">9</td>
-<td align="right">1</td>
-<td align="right">0.9</td>
-</tr>
-<tr class="even">
-<td align="right">10</td>
-<td align="right">9</td>
-<td align="right">1</td>
-<td align="right">0.9</td>
-</tr>
-<tr class="odd">
-<td align="right">10</td>
-<td align="right">9</td>
-<td align="right">1</td>
-<td align="right">0.9</td>
-</tr>
-<tr class="even">
-<td align="right">10</td>
-<td align="right">9</td>
-<td align="right">1</td>
-<td align="right">0.9</td>
-</tr>
-<tr class="odd">
-<td align="right">10</td>
-<td align="right">9</td>
-<td align="right">1</td>
-<td align="right">0.9</td>
-</tr>
-<tr class="even">
-<td align="right">10</td>
-<td align="right">9</td>
-<td align="right">1</td>
-<td align="right">0.9</td>
-</tr>
-<tr class="odd">
-<td align="right">10</td>
-<td align="right">9</td>
-<td align="right">1</td>
-<td align="right">0.9</td>
-</tr>
-<tr class="even">
-<td align="right">10</td>
-<td align="right">9</td>
-<td align="right">1</td>
-<td align="right">0.9</td>
-</tr>
-<tr class="odd">
-<td align="right">10</td>
-<td align="right">9</td>
-<td align="right">1</td>
-<td align="right">0.9</td>
-</tr>
-<tr class="even">
-<td align="right">10</td>
-<td align="right">10</td>
-<td align="right">0</td>
-<td align="right">1.0</td>
-</tr>
-<tr class="odd">
-<td align="right">10</td>
-<td align="right">10</td>
-<td align="right">0</td>
-<td align="right">1.0</td>
-</tr>
-<tr class="even">
-<td align="right">10</td>
-<td align="right">9</td>
-<td align="right">1</td>
-<td align="right">0.9</td>
-</tr>
-<tr class="odd">
-<td align="right">10</td>
-<td align="right">9</td>
-<td align="right">1</td>
-<td align="right">0.9</td>
-</tr>
-<tr class="even">
-<td align="right">10</td>
-<td align="right">9</td>
-<td align="right">1</td>
-<td align="right">0.9</td>
-</tr>
-<tr class="odd">
-<td align="right">10</td>
-<td align="right">9</td>
-<td align="right">1</td>
-<td align="right">0.9</td>
-</tr>
-<tr class="even">
-<td align="right">10</td>
-<td align="right">9</td>
-<td align="right">1</td>
-<td align="right">0.9</td>
-</tr>
-<tr class="odd">
-<td align="right">10</td>
-<td align="right">9</td>
-<td align="right">1</td>
-<td align="right">0.9</td>
-</tr>
-<tr class="even">
-<td align="right">10</td>
-<td align="right">9</td>
-<td align="right">1</td>
-<td align="right">0.9</td>
-</tr>
-<tr class="odd">
-<td align="right">10</td>
-<td align="right">9</td>
-<td align="right">1</td>
-<td align="right">0.9</td>
-</tr>
-<tr class="even">
-<td align="right">10</td>
-<td align="right">10</td>
-<td align="right">0</td>
-<td align="right">1.0</td>
-</tr>
-<tr class="odd">
-<td align="right">10</td>
-<td align="right">9</td>
-<td align="right">1</td>
-<td align="right">0.9</td>
-</tr>
-<tr class="even">
-<td align="right">10</td>
-<td align="right">9</td>
-<td align="right">1</td>
-<td align="right">0.9</td>
-</tr>
-<tr class="odd">
-<td align="right">10</td>
-<td align="right">9</td>
-<td align="right">1</td>
-<td align="right">0.9</td>
-</tr>
-<tr class="even">
-<td align="right">10</td>
-<td align="right">9</td>
-<td align="right">1</td>
-<td align="right">0.9</td>
-</tr>
-<tr class="odd">
-<td align="right">10</td>
-<td align="right">9</td>
-<td align="right">1</td>
-<td align="right">0.9</td>
-</tr>
-<tr class="even">
-<td align="right">10</td>
-<td align="right">9</td>
-<td align="right">1</td>
-<td align="right">0.9</td>
-</tr>
-<tr class="odd">
-<td align="right">10</td>
-<td align="right">9</td>
-<td align="right">1</td>
-<td align="right">0.9</td>
-</tr>
-<tr class="even">
-<td align="right">10</td>
-<td align="right">9</td>
-<td align="right">1</td>
-<td align="right">0.9</td>
-</tr>
-<tr class="odd">
-<td align="right">10</td>
-<td align="right">9</td>
-<td align="right">1</td>
-<td align="right">0.9</td>
-</tr>
-<tr class="even">
-<td align="right">10</td>
-<td align="right">9</td>
-<td align="right">1</td>
-<td align="right">0.9</td>
-</tr>
-<tr class="odd">
-<td align="right">10</td>
-<td align="right">9</td>
-<td align="right">1</td>
-<td align="right">0.9</td>
-</tr>
-<tr class="even">
-<td align="right">10</td>
-<td align="right">9</td>
-<td align="right">1</td>
-<td align="right">0.9</td>
-</tr>
-<tr class="odd">
-<td align="right">10</td>
-<td align="right">10</td>
-<td align="right">0</td>
-<td align="right">1.0</td>
-</tr>
-<tr class="even">
-<td align="right">10</td>
-<td align="right">9</td>
-<td align="right">1</td>
-<td align="right">0.9</td>
-</tr>
-<tr class="odd">
-<td align="right">10</td>
-<td align="right">9</td>
-<td align="right">1</td>
-<td align="right">0.9</td>
-</tr>
-<tr class="even">
-<td align="right">10</td>
-<td align="right">10</td>
-<td align="right">0</td>
-<td align="right">1.0</td>
-</tr>
-<tr class="odd">
-<td align="right">10</td>
-<td align="right">9</td>
-<td align="right">1</td>
-<td align="right">0.9</td>
-</tr>
-<tr class="even">
-<td align="right">10</td>
-<td align="right">9</td>
-<td align="right">1</td>
-<td align="right">0.9</td>
-</tr>
-<tr class="odd">
-<td align="right">10</td>
-<td align="right">9</td>
-<td align="right">1</td>
-<td align="right">0.9</td>
-</tr>
-<tr class="even">
-<td align="right">10</td>
-<td align="right">10</td>
-<td align="right">0</td>
-<td align="right">1.0</td>
-</tr>
-<tr class="odd">
-<td align="right">10</td>
-<td align="right">10</td>
-<td align="right">0</td>
-<td align="right">1.0</td>
-</tr>
-<tr class="even">
-<td align="right">10</td>
-<td align="right">9</td>
-<td align="right">1</td>
-<td align="right">0.9</td>
-</tr>
-<tr class="odd">
-<td align="right">10</td>
-<td align="right">9</td>
-<td align="right">1</td>
-<td align="right">0.9</td>
-</tr>
-<tr class="even">
-<td align="right">10</td>
-<td align="right">9</td>
-<td align="right">1</td>
-<td align="right">0.9</td>
-</tr>
-<tr class="odd">
-<td align="right">10</td>
-<td align="right">9</td>
-<td align="right">1</td>
-<td align="right">0.9</td>
-</tr>
-<tr class="even">
-<td align="right">10</td>
-<td align="right">9</td>
-<td align="right">1</td>
-<td align="right">0.9</td>
-</tr>
-<tr class="odd">
-<td align="right">10</td>
-<td align="right">9</td>
-<td align="right">1</td>
-<td align="right">0.9</td>
-</tr>
-<tr class="even">
-<td align="right">10</td>
-<td align="right">9</td>
-<td align="right">1</td>
-<td align="right">0.9</td>
-</tr>
-<tr class="odd">
-<td align="right">10</td>
-<td align="right">9</td>
-<td align="right">1</td>
-<td align="right">0.9</td>
-</tr>
-<tr class="even">
-<td align="right">10</td>
-<td align="right">9</td>
-<td align="right">1</td>
-<td align="right">0.9</td>
-</tr>
-<tr class="odd">
-<td align="right">10</td>
-<td align="right">9</td>
-<td align="right">1</td>
-<td align="right">0.9</td>
-</tr>
-<tr class="even">
-<td align="right">10</td>
-<td align="right">9</td>
-<td align="right">1</td>
-<td align="right">0.9</td>
-</tr>
-<tr class="odd">
-<td align="right">10</td>
-<td align="right">9</td>
-<td align="right">1</td>
-<td align="right">0.9</td>
-</tr>
-<tr class="even">
-<td align="right">10</td>
-<td align="right">9</td>
-<td align="right">1</td>
-<td align="right">0.9</td>
-</tr>
-<tr class="odd">
-<td align="right">10</td>
-<td align="right">9</td>
-<td align="right">1</td>
-<td align="right">0.9</td>
-</tr>
-<tr class="even">
-<td align="right">10</td>
-<td align="right">9</td>
-<td align="right">1</td>
-<td align="right">0.9</td>
-</tr>
-<tr class="odd">
-<td align="right">10</td>
-<td align="right">9</td>
-<td align="right">1</td>
-<td align="right">0.9</td>
-</tr>
-<tr class="even">
-<td align="right">10</td>
-<td align="right">9</td>
-<td align="right">1</td>
-<td align="right">0.9</td>
-</tr>
-<tr class="odd">
-<td align="right">10</td>
-<td align="right">9</td>
-<td align="right">1</td>
-<td align="right">0.9</td>
-</tr>
-</tbody>
-</table></li>
+<li><p>We are working with the <code>simGuesses</code> data frame here so that comes immediately before the pipe operator.</p>
+</li>
+<li><p>We would like to only focus on the rows in our <code>simGuesses</code> data frame that have <code>heads</code> values of 9 or 10. This represents simulated statistics “as extreme or more extreme” than what we observed (9 correct guesses out of 10). To get a glimpse of what we have up to this point, run <code>simGuesses %&gt;% filter(heads &gt;= 9) %&gt;% View()</code>.</p></li>
 <li><p>Now that we have changed the focus to only those rows that have number of heads out of 10 flips corresponding to 9 or more, we count how many of those there are. The function <code>nrow</code> gives how many entries are in this filtered data frame and lastly we calculate the proportion that are at least as extreme as our observed value of 9 by dividing by the number of total simulations (10,000).</p></li>
 </ol>
-<p>We can see that the observed statistic of 9 correct guesses is not a likely outcome assuming the null hypothesis is true. Only around 1% of the outcomes in our 10,000 simulations fall at or above 9 successes. We have evidence supporting the conclusion that the person is actually better than just guessing at random at determining whether milk has been added first or not. To better visualize this we can also make use of pink shading on the histogram corresponding to the <span class="math inline">\(p\)</span>-value:</p>
+<p>We can see that the observed statistic of 9 correct guesses is not a likely outcome assuming the null hypothesis is true. Only around 1% of the outcomes in our 10,000 simulations fall at or above 9 successes. We have evidence supporting the conclusion that the person is actually better than just guessing at random at determining whether milk has been added first or not. To better visualize this we can also make use of blue shading on the histogram corresponding to the <span class="math inline">\(p\)</span>-value:</p>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">library</span>(ggplot2)
   <span class="kw">ggplot</span>(<span class="dt">data =</span> simGuesses, <span class="kw">aes</span>(<span class="dt">x =</span> <span class="kw">factor</span>(heads), <span class="dt">fill =</span> (heads &gt;=<span class="st"> </span><span class="dv">9</span>))) +
 <span class="st">  </span><span class="kw">geom_bar</span>() +
 <span class="st">  </span><span class="kw">labs</span>(<span class="dt">x =</span> <span class="st">&quot;heads&quot;</span>)</code></pre></div>
-<div class="figure" style="text-align: center"><span id="fig:unnamed-chunk-80"></span>
-<img src="ismaykim_files/figure-html/unnamed-chunk-80-1.png" alt="Barplot of heads with p-value highlighted" width="\textwidth" />
+<div class="figure" style="text-align: center"><span id="fig:unnamed-chunk-83"></span>
+<img src="ismaykim_files/figure-html/unnamed-chunk-83-1.png" alt="Barplot of heads with p-value highlighted" width="\textwidth" />
 <p class="caption">
 Figure 7.4: Barplot of heads with p-value highlighted
 </p>
 </div>
-<p>This helps us better see just how few of the values of <code>heads</code> are at our observed value or more extreme.</p>
-<p>We’ll see in Chapters <a href="7-hypo.html#hypo">7</a> and <a href="8-ci.html#ci">8</a> that this idea of a <span class="math inline">\(p\)</span>-value can be extended to the more traditional methods using normal and <span class="math inline">\(t\)</span> distributions in the traditional way that introductory statistics has been presented. These traditional methods were used because statisticians haven’t always been able to do 10,000 simulations on the computer within seconds. We’ll elaborate on this more in these later chapters.</p>
+<p>This helps us better see just how few of the values of <code>heads</code> are at our observed value or more extreme. This idea of a <span class="math inline">\(p\)</span>-value can be extended to the more traditional methods using normal and <span class="math inline">\(t\)</span> distributions in the traditional way that introductory statistics has been presented. These traditional methods were used because statisticians haven’t always been able to do 10,000 simulations on the computer within seconds. We’ll elaborate on this more in a few sections.</p>
 <hr />
 <div class="learncheck">
 <p>
 <strong><em>Learning check</em></strong>
 </p>
 </div>
-<p><strong>(LC7.6)</strong> What is meant by “pseudo-random number generation?”</p>
-<p><strong>(LC7.7)</strong> How can simulation be used to help us address the question of whether or not an observed result is statistically significant?</p>
-<p><strong>(LC7.8)</strong> In Chapter <a href="4-data-visualization-via-ggplot2.html#viz"><strong>??</strong></a>, we noted that barplots should be used when creating a plot of categorical variables. Why are we using barplots to make a plot of a numerical variable <code>heads</code> in this chapter?</p>
+<p><strong>(LC7.6)</strong> How could we make Table <a href="7-hypo.html#tab:sample-table">7.1</a> into a tidy data frame?</p>
+<p><strong>(LC7.7)</strong> What is meant by “pseudo-random number generation?”</p>
+<p><strong>(LC7.8)</strong> How can simulation be used to help us address the question of whether or not an observed result is statistically significant?</p>
+<p><strong>(LC7.9)</strong> In Chapter <a href="4-viz.html#viz">4</a>, we noted that barplots should be used when creating a plot of categorical variables. Why are we using barplots to make a plot of a numerical variable <code>heads</code> in this chapter?</p>
 <hr />
 </div>
 </div>
@@ -1394,13 +774,13 @@ <h3><span class="header-section-number">7.6.7</span> The p-value</h3>
 <h2><span class="header-section-number">7.7</span> EXAMPLE: Comparing two means</h2>
 <div id="randomizationpermutation" class="section level3">
 <h3><span class="header-section-number">7.7.1</span> Randomization/Permutation</h3>
-<p>We will now focus on building hypotheses looking at the difference between two population means in an example. We will denote population means using the Greek symbol <span class="math inline">\(\mu\)</span> (pronounced “mu”). Thus, we will be looking to see if one group “out-performs” another group. This is quite possibly the most common type of statistical inference and serves as a basis for many other types of analyses when comparing two groups.</p>
+<p>We will now focus on building hypotheses looking at the difference between two population means in an example. We will denote population means using the Greek symbol <span class="math inline">\(\mu\)</span> (pronounced “mu”). Thus, we will be looking to see if one group “out-performs” another group. This is quite possibly the most common type of statistical inference and serves as a basis for many other types of analyses when comparing the relationship between two variables.</p>
 <p>Our null hypothesis will be of the form <span class="math inline">\(H_0: \mu_1 = \mu_2\)</span>, which can also be written as <span class="math inline">\(H_0: \mu_1 - \mu_2 = 0\)</span>. Our alternative hypothesis will be of the form <span class="math inline">\(H_0: \mu_1 \star \mu_2\)</span> (or <span class="math inline">\(H_a: \mu_1 - \mu_2 \, \star \, 0\)</span>) where <span class="math inline">\(\star\)</span> = <span class="math inline">\(&lt;\)</span>, <span class="math inline">\(\ne\)</span>, or <span class="math inline">\(&gt;\)</span> depending on the context of the problem. You needn’t focus on these new symbols too much at this point. It will just be a shortcut way for us to describe our hypotheses.</p>
 <p>As we saw earlier, simulation is a valuable tool when conducting inferences based on one population variable. We will see that the process of <strong>randomization</strong> (also known as <strong>permutation</strong>) will be valuable in conducting tests comparing quantitative values from two groups.</p>
 </div>
 <div id="comparing-action-and-romance-movies" class="section level3">
 <h3><span class="header-section-number">7.7.2</span> Comparing Action and Romance Movies</h3>
-<p>The <code>movies</code> data set in the <code>ggplot2movies</code> package contains information on a large number of movies that have been rated by users of IMDB.com. We are interested in the question here of whether <code>Action</code> movies are rated higher on IMDB than <code>Romance</code> movies. We will first need to do a little bit of data manipulation using the ideas from Chapter <a href="5-data-manipulation-via-dplyr.html#manip"><strong>??</strong></a> to get the data in the form that we would like:</p>
+<p>The <code>movies</code> data set in the <code>ggplot2movies</code> package contains information on a large number of movies that have been rated by users of IMDB.com <span class="citation">(Wickham <a href="#ref-R-ggplot2movies">2015</a>)</span>. We are interested in the question here of whether <code>Action</code> movies are rated higher on IMDB than <code>Romance</code> movies. We will first need to do a little bit of data manipulation using the ideas from Chapter <a href="5-manip.html#manip">5</a> to get the data in the form that we would like:</p>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">library</span>(dplyr)
 <span class="kw">library</span>(ggplot2movies)
 (movies_trimmed &lt;-<span class="st"> </span>movies %&gt;%<span class="st"> </span><span class="kw">select</span>(title, year, rating, Action, Romance))</code></pre></div>
@@ -1435,15 +815,15 @@ <h3><span class="header-section-number">7.7.2</span> Comparing Action and Romanc
 <strong><em>Learning check</em></strong>
 </p>
 </div>
-<p><strong>(LC7.9)</strong> Why are the different genre variables stored as binary variables (1s and 0s) instead of just listing the <code>genre</code> as a column of values like “Action”, “Comedy”, etc.?</p>
-<p><strong>(LC7.10)</strong> What complications could come above with us excluding action romance movies? Should we question the results of our hypothesis test? Explain.</p>
+<p><strong>(LC7.10)</strong> Why are the different genre variables stored as binary variables (1s and 0s) instead of just listing the <code>genre</code> as a column of values like “Action”, “Comedy”, etc.?</p>
+<p><strong>(LC7.11)</strong> What complications could come about with us excluding action romance movies? Should we question the results of our hypothesis test? Explain.</p>
 <hr />
 <p>Let’s now visualize the distributions of <code>rating</code> across both levels of <code>genre</code>. Think about what type(s) of plot is/are appropriate here before you proceed:</p>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">library</span>(ggplot2)
 <span class="kw">ggplot</span>(<span class="dt">data =</span> movies_trimmed, <span class="kw">aes</span>(<span class="dt">x =</span> genre, <span class="dt">y =</span> rating)) +
 <span class="st">  </span><span class="kw">geom_boxplot</span>()</code></pre></div>
-<div class="figure" style="text-align: center"><span id="fig:unnamed-chunk-84"></span>
-<img src="ismaykim_files/figure-html/unnamed-chunk-84-1.png" alt="Rating vs genre in the population" width="\textwidth" />
+<div class="figure" style="text-align: center"><span id="fig:unnamed-chunk-87"></span>
+<img src="ismaykim_files/figure-html/unnamed-chunk-87-1.png" alt="Rating vs genre in the population" width="\textwidth" />
 <p class="caption">
 Figure 7.5: Rating vs genre in the population
 </p>
@@ -1462,7 +842,7 @@ <h3><span class="header-section-number">7.7.2</span> Comparing Action and Romanc
 </div>
 <div id="sampling-rightarrow-randomization" class="section level3">
 <h3><span class="header-section-number">7.7.3</span> Sampling <span class="math inline">\(\rightarrow\)</span> Randomization</h3>
-<p>We can use hypothesis testing to investigate ways to determine, for example, whether a <strong>treatment</strong> has an effect over a <strong>control</strong> and other ways to statistically analyze if one group performs better than, worse than, or different than another. We will also use confidence intervals to determine the size of the effect if it exists. You’ll see more on this in Chapter <a href="8-ci.html#ci">8</a>.</p>
+<p>We can use hypothesis testing to investigate ways to determine, for example, whether a <strong>treatment</strong> has an effect over a <strong>control</strong> and other ways to statistically analyze if one group performs better than, worse than, or different than another. We will also use confidence intervals to determine the size of the effect, if it exists. You’ll see more on this in Chapter <a href="8-ci.html#ci">8</a>.</p>
 <p>We are interested here in seeing how we can use a random sample of action movies and a random sample of romance movies from <code>movies</code> to determine if a statistical difference exists in the mean ratings of each group.</p>
 <hr />
 <div class="learncheck">
@@ -1470,7 +850,7 @@ <h3><span class="header-section-number">7.7.3</span> Sampling <span class="math
 <strong><em>Learning check</em></strong>
 </p>
 </div>
-<p><strong>(LC7.11)</strong> Define the relevant parameters here in terms of the populations of movies.</p>
+<p><strong>(LC7.12)</strong> Define the relevant parameters here in terms of the populations of movies.</p>
 <hr />
 </div>
 <div id="data-1" class="section level3">
@@ -1483,10 +863,10 @@ <h3><span class="header-section-number">7.7.4</span> Data</h3>
 <span class="st">  </span><span class="kw">group_by</span>(genre) %&gt;%
 <span class="st">  </span><span class="kw">sample_n</span>(<span class="dv">34</span>)</code></pre></div>
 <p>We can now observe the distributions of our two sample ratings for both groups. Remember that these plots should be rough approximations of our population distributions of movie ratings for <code>&quot;Action&quot;</code> and <code>&quot;Romance&quot;</code> in our population of all movies in the <code>movies</code> data frame.</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"> <span class="kw">ggplot</span>(<span class="dt">data =</span> movies_genre_sample, <span class="kw">aes</span>(<span class="dt">x =</span> genre, <span class="dt">y =</span> rating)) +
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">ggplot</span>(<span class="dt">data =</span> movies_genre_sample, <span class="kw">aes</span>(<span class="dt">x =</span> genre, <span class="dt">y =</span> rating)) +
 <span class="st">  </span><span class="kw">geom_boxplot</span>()</code></pre></div>
-<div class="figure" style="text-align: center"><span id="fig:unnamed-chunk-86"></span>
-<img src="ismaykim_files/figure-html/unnamed-chunk-86-1.png" alt="Genre vs rating for our sample" width="\textwidth" />
+<div class="figure" style="text-align: center"><span id="fig:unnamed-chunk-89"></span>
+<img src="ismaykim_files/figure-html/unnamed-chunk-89-1.png" alt="Genre vs rating for our sample" width="\textwidth" />
 <p class="caption">
 Figure 7.7: Genre vs rating for our sample
 </p>
@@ -1494,8 +874,8 @@ <h3><span class="header-section-number">7.7.4</span> Data</h3>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">ggplot</span>(<span class="dt">data =</span> movies_genre_sample, <span class="dt">mapping =</span> <span class="kw">aes</span>(<span class="dt">x =</span> rating)) +
 <span class="st">  </span><span class="kw">geom_histogram</span>(<span class="dt">binwidth =</span> <span class="dv">1</span>, <span class="dt">color =</span> <span class="st">&quot;white&quot;</span>, <span class="dt">fill =</span> <span class="st">&quot;dodgerblue&quot;</span>) +
 <span class="st">  </span><span class="kw">facet_grid</span>(genre ~<span class="st"> </span>.)</code></pre></div>
-<div class="figure" style="text-align: center"><span id="fig:unnamed-chunk-87"></span>
-<img src="ismaykim_files/figure-html/unnamed-chunk-87-1.png" alt="Genre vs rating for our sample as faceted histogram" width="\textwidth" />
+<div class="figure" style="text-align: center"><span id="fig:unnamed-chunk-90"></span>
+<img src="ismaykim_files/figure-html/unnamed-chunk-90-1.png" alt="Genre vs rating for our sample as faceted histogram" width="\textwidth" />
 <p class="caption">
 Figure 7.8: Genre vs rating for our sample as faceted histogram
 </p>
@@ -1506,7 +886,7 @@ <h3><span class="header-section-number">7.7.4</span> Data</h3>
 <strong><em>Learning check</em></strong>
 </p>
 </div>
-<p><strong>(LC7.12)</strong> What single value could we change to improve the approximation using the sample distribution on the population distribution?</p>
+<p><strong>(LC7.13)</strong> What single value could we change to improve how well the sample distribution approximates the population distribution?</p>
 <hr />
 <p>Do we have reason to believe, based on the sample distributions of <code>rating</code> over the two groups of <code>genre</code>, that there is a significant difference between the mean <code>rating</code> for action movies compared to romance movies? It’s hard to say just based on the plots. The boxplot does show that the median sample rating is higher for romance movies, but the histogram isn’t as clear. The two groups have somewhat differently shaped distributions but they are both over similar values of <code>rating</code>. It’s often useful to calculate the mean and standard deviation as well, conditioned on the two levels.</p>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">summary_ratings &lt;-<span class="st"> </span>movies_genre_sample %&gt;%<span class="st"> </span>
@@ -1514,19 +894,38 @@ <h3><span class="header-section-number">7.7.4</span> Data</h3>
 <span class="st">  </span><span class="kw">summarize</span>(<span class="dt">mean =</span> <span class="kw">mean</span>(rating),
             <span class="dt">std_dev =</span> <span class="kw">sd</span>(rating),
             <span class="dt">n =</span> <span class="kw">n</span>())
-summary_ratings</code></pre></div>
-<pre><code>## # A tibble: 2 × 4
-##     genre     mean  std_dev     n
-##     &lt;chr&gt;    &lt;dbl&gt;    &lt;dbl&gt; &lt;int&gt;
-## 1  Action 5.197059 1.464837    34
-## 2 Romance 6.026471 1.202096    34</code></pre>
+summary_ratings %&gt;%<span class="st"> </span><span class="kw">kable</span>()</code></pre></div>
+<table>
+<thead>
+<tr class="header">
+<th align="left">genre</th>
+<th align="right">mean</th>
+<th align="right">std_dev</th>
+<th align="right">n</th>
+</tr>
+</thead>
+<tbody>
+<tr class="odd">
+<td align="left">Action</td>
+<td align="right">5.197</td>
+<td align="right">1.465</td>
+<td align="right">34</td>
+</tr>
+<tr class="even">
+<td align="left">Romance</td>
+<td align="right">6.027</td>
+<td align="right">1.202</td>
+<td align="right">34</td>
+</tr>
+</tbody>
+</table>
 <hr />
 <div class="learncheck">
 <p>
 <strong><em>Learning check</em></strong>
 </p>
 </div>
-<p><strong>(LC7.13)</strong> Why did we not specify <code>na.rm = TRUE</code> here as we did in Chapter <a href="5-data-manipulation-via-dplyr.html#manip"><strong>??</strong></a>?</p>
+<p><strong>(LC7.14)</strong> Why did we not specify <code>na.rm = TRUE</code> here as we did in Chapter <a href="5-manip.html#manip">5</a>?</p>
 <hr />
 <p>We see that the sample mean rating for romance movies, <span class="math inline">\(\bar{x}_{r}\)</span>, is greater than the similar measure for action movies, <span class="math inline">\(\bar{x}_a\)</span>. But is it statistically significantly greater (thus, leading us to conclude that the means are statistically different)? The standard deviation can provide some insight here but with these standard deviations being so similar it’s still hard to say for sure.</p>
 <hr />
@@ -1535,7 +934,7 @@ <h3><span class="header-section-number">7.7.4</span> Data</h3>
 <strong><em>Learning check</em></strong>
 </p>
 </div>
-<p><strong>(LC7.14)</strong> Why might the standard deviation provide some insight about the means being statistically different or not?</p>
+<p><strong>(LC7.15)</strong> Why might the standard deviation provide some insight about the means being statistically different or not?</p>
 <hr />
 </div>
 <div id="model-of-h_0-1" class="section level3">
@@ -1555,7 +954,7 @@ <h3><span class="header-section-number">7.7.7</span> Observed effect <span class
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">mean_ratings &lt;-<span class="st"> </span>movies_genre_sample %&gt;%<span class="st"> </span><span class="kw">group_by</span>(genre) %&gt;%
 <span class="st">  </span><span class="kw">summarize</span>(<span class="dt">mean =</span> <span class="kw">mean</span>(rating))
 obs_diff &lt;-<span class="st"> </span><span class="kw">diff</span>(mean_ratings$mean)</code></pre></div>
-<p>We see here that the <code>diff</code> function calculates <span class="math inline">\(\bar{x}_r - \bar{x}_a = 6.0264706 - 5.1970588 = 0.8294118\)</span>. We will now proceed similarly to how we conducted the hypothesis test above for the Lady Tasting Tea using simulation. Our goal is figure out a random process with which to simulate the null hypothesis being true. Earlier in this chapter, we used flipping of a fair coin as the random process we were simulating with the null hypothesis being true (<span class="math inline">\(H_0: \tau = 5\)</span>).</p>
+<p>We see here that the <code>diff</code> function calculates <span class="math inline">\(\bar{x}_r - \bar{x}_a = 6.0265 - 5.1971 = 0.8294\)</span>. We will now proceed similarly to how we conducted the hypothesis test above for the Lady Tasting Tea using simulation. Our goal is to figure out a random process with which to simulate the null hypothesis being true. Earlier in this chapter, we used flipping of a fair coin as the random process we were simulating with the null hypothesis being true (<span class="math inline">\(H_0: \tau = 5\)</span>).</p>
 </div>
 <div id="simulated-data-1" class="section level3">
 <h3><span class="header-section-number">7.7.8</span> Simulated Data</h3>
@@ -1563,22 +962,22 @@ <h3><span class="header-section-number">7.7.8</span> Simulated Data</h3>
 <p>Here, with us assuming the two population means are equal (<span class="math inline">\(H_0: \mu_r - \mu_a = 0\)</span>), we can look at this from a tactile point of view by using index cards. There are <span class="math inline">\(n_r = 34\)</span> data elements corresponding to romance movies and <span class="math inline">\(n_a = 34\)</span> for action movies. We can write the 34 ratings from our sample for romance movies on one set of 34 index cards and the 34 ratings for action movies on another set of 34 index cards. (Note that the sample sizes need not be the same.)</p>
 <p>The next step is to put the two stacks of index cards together, creating a new set of 68 cards. If we assume that the two population means are equal, we are saying that there is no association between ratings and genre (romance vs action). We can use the index cards to create two <strong>new</strong> stacks for romance and action movies. First, we must shuffle all the cards thoroughly. After doing so, in this case with equal values of sample sizes, we split the deck in half.</p>
 <p>We then calculate the new sample mean rating of the romance deck, and also the new sample mean rating of the action deck. This creates one simulation of the samples that were collected originally. We next want to calculate a statistic from these two samples. Instead of actually doing the calculation using index cards, we can use R as we have before to simulate this process.</p>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">library</span>(mosaic)
+shuffled_ratings &lt;-<span class="st"> </span>movies_trimmed %&gt;%
+<span class="st">     </span><span class="kw">mutate</span>(<span class="dt">rating =</span> <span class="kw">shuffle</span>(rating)) %&gt;%<span class="st"> </span>
+<span class="st">     </span><span class="kw">group_by</span>(genre) %&gt;%
+<span class="st">     </span><span class="kw">summarize</span>(<span class="dt">mean =</span> <span class="kw">mean</span>(rating))
+<span class="kw">diff</span>(shuffled_ratings$mean)</code></pre></div>
+<pre><code>## [1] -0.02288</code></pre>
 <hr />
 <div class="learncheck">
 <p>
 <strong><em>Learning check</em></strong>
 </p>
 </div>
-<p><strong>(LC7.15)</strong> How would the tactile shuffling of index cards change if we had different samples of say 20 action movies and 60 romance movies? Describe each step that would change.</p>
-<p><strong>(LC7.16)</strong> Why are we taking the difference in the means of the cards in the new shuffled decks?</p>
+<p><strong>(LC7.16)</strong> How would the tactile shuffling of index cards change if we had different samples of say 20 action movies and 60 romance movies? Describe each step that would change.</p>
+<p><strong>(LC7.17)</strong> Why are we taking the difference in the means of the cards in the new shuffled decks?</p>
 <hr />
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">library</span>(mosaic)
-shuffled_ratings &lt;-<span class="st"> </span>movies_trimmed %&gt;%
-<span class="st">     </span><span class="kw">mutate</span>(<span class="dt">genre =</span> <span class="kw">shuffle</span>(genre)) %&gt;%<span class="st"> </span>
-<span class="st">     </span><span class="kw">group_by</span>(genre) %&gt;%
-<span class="st">     </span><span class="kw">summarize</span>(<span class="dt">mean =</span> <span class="kw">mean</span>(rating))
-<span class="kw">diff</span>(shuffled_ratings$mean)</code></pre></div>
-<pre><code>## [1] -0.0170207</code></pre>
 </div>
 <div id="distribution-of-delta-under-h_0-1" class="section level3">
 <h3><span class="header-section-number">7.7.9</span> Distribution of <span class="math inline">\(\delta\)</span> under <span class="math inline">\(H_0\)</span></h3>
@@ -1597,8 +996,8 @@ <h3><span class="header-section-number">7.7.9</span> Distribution of <span class
 <p>We can now plot the distribution of these simulated differences in means:</p>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">ggplot</span>(<span class="dt">data =</span> rand_distn, <span class="kw">aes</span>(<span class="dt">x =</span> diffmean)) +
 <span class="st">  </span><span class="kw">geom_histogram</span>(<span class="dt">color =</span> <span class="st">&quot;white&quot;</span>, <span class="dt">bins =</span> <span class="dv">20</span>)</code></pre></div>
-<div class="figure" style="text-align: center"><span id="fig:unnamed-chunk-93"></span>
-<img src="ismaykim_files/figure-html/unnamed-chunk-93-1.png" alt="Simulated differences in means histogram" width="\textwidth" />
+<div class="figure" style="text-align: center"><span id="fig:unnamed-chunk-96"></span>
+<img src="ismaykim_files/figure-html/unnamed-chunk-96-1.png" alt="Simulated differences in means histogram" width="\textwidth" />
 <p class="caption">
 Figure 7.9: Simulated differences in means histogram
 </p>
@@ -1606,22 +1005,22 @@ <h3><span class="header-section-number">7.7.9</span> Distribution of <span class
 </div>
 <div id="the-p-value-1" class="section level3">
 <h3><span class="header-section-number">7.7.10</span> The p-value</h3>
-<p>Remember that we are interested in seeing where our observed sample mean difference of 0.8294118 falls on this null/randomization distribution. We are interested in simply a difference here so “more extreme” corresponds to values in both tails on the distribution. Let’s shade our null distribution to show a visual representation of our <span class="math inline">\(p\)</span>-value:</p>
+<p>Remember that we are interested in seeing where our observed sample mean difference of 0.8294 falls on this null/randomization distribution. We are interested in simply a difference here so “more extreme” corresponds to values in both tails on the distribution. Let’s shade our null distribution to show a visual representation of our <span class="math inline">\(p\)</span>-value:</p>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">ggplot</span>(<span class="dt">data =</span> rand_distn, <span class="kw">aes</span>(<span class="dt">x =</span> diffmean, <span class="dt">fill =</span> (<span class="kw">abs</span>(diffmean) &gt;=<span class="st"> </span>obs_diff))) +
 <span class="st">  </span><span class="kw">geom_histogram</span>(<span class="dt">color =</span> <span class="st">&quot;white&quot;</span>, <span class="dt">bins =</span> <span class="dv">20</span>)</code></pre></div>
-<div class="figure" style="text-align: center"><span id="fig:unnamed-chunk-94"></span>
-<img src="ismaykim_files/figure-html/unnamed-chunk-94-1.png" alt="Shaded histogram to show p-value" width="\textwidth" />
+<div class="figure" style="text-align: center"><span id="fig:unnamed-chunk-97"></span>
+<img src="ismaykim_files/figure-html/unnamed-chunk-97-1.png" alt="Shaded histogram to show p-value" width="\textwidth" />
 <p class="caption">
 Figure 7.10: Shaded histogram to show p-value
 </p>
 </div>
-<p>You may initially think there is an error here, but remember that the observed difference in means was 0.8294118. It falls far outside the range of simulated differences. We can add a vertical line to represent both it and its negative (since this is a two-tailed test) instead:</p>
+<p>You may initially think there is an error here, but remember that the observed difference in means was 0.8294. It falls far outside the range of simulated differences. We can add a vertical line to represent both it and its negative (since this is a two-tailed test) instead:</p>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">ggplot</span>(<span class="dt">data =</span> rand_distn, <span class="kw">aes</span>(<span class="dt">x =</span> diffmean)) +
 <span class="st">  </span><span class="kw">geom_histogram</span>(<span class="dt">color =</span> <span class="st">&quot;white&quot;</span>, <span class="dt">bins =</span> <span class="dv">100</span>) +
 <span class="st">  </span><span class="kw">geom_vline</span>(<span class="dt">xintercept =</span> obs_diff, <span class="dt">color =</span> <span class="st">&quot;red&quot;</span>) +
 <span class="st">  </span><span class="kw">geom_vline</span>(<span class="dt">xintercept =</span> -obs_diff, <span class="dt">color =</span> <span class="st">&quot;red&quot;</span>)</code></pre></div>
-<div class="figure" style="text-align: center"><span id="fig:unnamed-chunk-95"></span>
-<img src="ismaykim_files/figure-html/unnamed-chunk-95-1.png" alt="Histogram with vertical lines corresponding to observed statistic" width="\textwidth" />
+<div class="figure" style="text-align: center"><span id="fig:unnamed-chunk-98"></span>
+<img src="ismaykim_files/figure-html/unnamed-chunk-98-1.png" alt="Histogram with vertical lines corresponding to observed statistic" width="\textwidth" />
 <p class="caption">
 Figure 7.11: Histogram with vertical lines corresponding to observed statistic
 </p>
@@ -1633,13 +1032,13 @@ <h3><span class="header-section-number">7.7.10</span> The p-value</h3>
 <strong><em>Learning check</em></strong>
 </p>
 </div>
-<p><strong>(LC7.17)</strong> Conduct the same analysis comparing action movies versus romantic movies using the median rating instead of the mean rating? Make sure to use the <code>%&gt;%</code> as much as possible. What was different and what was the same?</p>
-<p><strong>(LC7.18)</strong> What conclusions can you make from viewing the faceted histogram looking at <code>rating</code> versus <code>genre</code> that you couldn’t see when looking at the boxplot?</p>
-<p><strong>(LC7.19)</strong> Describe in a paragraph how we used Allen Downey’s diagram to conclude if a statistical difference existed between mean movie ratings for action and romance movies.</p>
-<p><strong>(LC7.20)</strong> Why are we relatively confident that the distributions of the sample ratings will be good approximations of the population distributions of ratings for the two genres?</p>
-<p><strong>(LC7.21)</strong> Using the definition of “<span class="math inline">\(p\)</span>-value”, write in words what the <span class="math inline">\(p\)</span>-value represents for the hypothesis test above comparing the mean rating of romance to action movies.</p>
-<p><strong>(LC7.22)</strong> What is the value of the <span class="math inline">\(p\)</span>-value for the hypothesis test comparing the mean rating of romance to action movies?</p>
-<p><strong>(LC7.23)</strong> Do the results of the hypothesis test match up with the original plots we made looking at the population of movies? Why or why not?</p>
+<p><strong>(LC7.18)</strong> Conduct the same analysis comparing action movies versus romantic movies using the median rating instead of the mean rating. Make sure to use the <code>%&gt;%</code> as much as possible. What was different and what was the same?</p>
+<p><strong>(LC7.19)</strong> What conclusions can you make from viewing the faceted histogram looking at <code>rating</code> versus <code>genre</code> that you couldn’t see when looking at the boxplot?</p>
+<p><strong>(LC7.20)</strong> Describe in a paragraph how we used Allen Downey’s diagram to conclude if a statistical difference existed between mean movie ratings for action and romance movies.</p>
+<p><strong>(LC7.21)</strong> Why are we relatively confident that the distributions of the sample ratings will be good approximations of the population distributions of ratings for the two genres?</p>
+<p><strong>(LC7.22)</strong> Using the definition of “<span class="math inline">\(p\)</span>-value”, write in words what the <span class="math inline">\(p\)</span>-value represents for the hypothesis test above comparing the mean rating of romance to action movies.</p>
+<p><strong>(LC7.23)</strong> What is the value of the <span class="math inline">\(p\)</span>-value for the hypothesis test comparing the mean rating of romance to action movies?</p>
+<p><strong>(LC7.24)</strong> Do the results of the hypothesis test match up with the original plots we made looking at the population of movies? Why or why not?</p>
 <hr />
 </div>
 <div id="summary-5" class="section level3">
@@ -1659,10 +1058,10 @@ <h2><span class="header-section-number">7.8</span> Building theory-based methods
 <p>These traditional methods have been used for many decades back to the time when researchers didn’t have access to computers that could run 10,000 simulations in under a minute. They had to base their methods on probability theory instead. Many fields and researchers continue to use these methods and that is the biggest reason for their inclusion here. It’s important to remember that a <span class="math inline">\(t\)</span>-test or a <span class="math inline">\(z\)</span>-test is really just an approximation of what you have seen in this chapter already using simulation and randomization. The focus here is on understanding how the shape of the <span class="math inline">\(t\)</span>-curve comes about without digging deeply into the mathematical underpinnings.</p>
 <div id="example-t-test-for-two-independent-samples" class="section level3">
 <h3><span class="header-section-number">7.8.1</span> EXAMPLE: <span class="math inline">\(t\)</span>-test for two independent samples</h3>
-<p>What is commonly done in statistics is the process of normalization. What this entails is calculating the mean and standard deviation of a variable. Then you subtract the mean from each value of your variable and divide by the standard deviation. The most common normalization is known as the <span class="math inline">\(z\)</span>-score. The formula for a <span class="math inline">\(z\)</span>-score is <span class="math display">\[Z = \frac{x - \mu}{\sigma}\]</span>, where <span class="math inline">\(x\)</span> represent the value of a variable, <span class="math inline">\(\mu\)</span> represents the mean of the variable, and <span class="math inline">\(\sigma\)</span> represents the standard deviation of the variable. <span class="math inline">\(z\)</span>-scores are normally distributed with mean 0 and standard deviation 1. They have the common, bell-shaped pattern.</p>
-<p><img src="ismaykim_files/figure-html/unnamed-chunk-96-1.png" width="\textwidth" style="display: block; margin: auto;" /></p>
+<p>What is commonly done in statistics is the process of normalization. What this entails is calculating the mean and standard deviation of a variable. Then you subtract the mean from each value of your variable and divide by the standard deviation. The most common normalization is known as the <span class="math inline">\(z\)</span>-score. The formula for a <span class="math inline">\(z\)</span>-score is <span class="math display">\[Z = \frac{x - \mu}{\sigma},\]</span> where <span class="math inline">\(x\)</span> represents the value of a variable, <span class="math inline">\(\mu\)</span> represents the mean of the variable, and <span class="math inline">\(\sigma\)</span> represents the standard deviation of the variable. Thus, if your variable has 10 elements, each one has a corresponding <span class="math inline">\(z\)</span>-score that gives how many standard deviations away that value is from its mean. The <span class="math inline">\(z\)</span>-scores of a normally distributed variable are themselves normally distributed with mean 0 and standard deviation 1. They have the common, bell-shaped pattern seen below.</p>
+<p><img src="ismaykim_files/figure-html/unnamed-chunk-99-1.png" width="\textwidth" style="display: block; margin: auto;" /></p>
 <p>Recall that we hardly ever know the mean and standard deviation of the population of interest. This is almost always the case when considering the means of two independent groups. To help account for us not knowing the population parameter values, we can use the sample statistics instead, but this comes with a bit of a price in terms of complexity.</p>
-<p>Another form of normalization occurs when we need to use the sample standard deviations as estimates for the unknown population standard deviations. This normalization is often called the <span class="math inline">\(t\)</span>-score. For the two independent samples case like what we have for comparing action movies to romance movies, the formula is <span class="math display">\[T =\dfrac{ (\bar{x}_1 - \bar{x}_2) - (\mu_1 - \mu_2)}{ \sqrt{\dfrac{s_1^2}{n_1} + \dfrac{s_2^2}{n_2}}  }\]</span></p>
+<p>Another form of normalization occurs when we need to use the sample standard deviations as estimates for the unknown population standard deviations. This normalization is often called the <span class="math inline">\(t\)</span>-score. For the two independent samples case like what we have for comparing action movies to romance movies, the formula is <span class="math display">\[T =\dfrac{ (\bar{x}_1 - \bar{x}_2) - (\mu_1 - \mu_2)}{ \sqrt{\dfrac{{s_1}^2}{n_1} + \dfrac{{s_2}^2}{n_2}}  }\]</span></p>
 <p>There is a lot to try to unpack here.</p>
 <ul>
 <li><span class="math inline">\(\bar{x}_1\)</span> is the sample mean response of the first group</li>
@@ -1676,17 +1075,17 @@ <h3><span class="header-section-number">7.8.1</span> EXAMPLE: <span class="math
 </ul>
 <p>Assuming that the null hypothesis is true (<span class="math inline">\(H_0: \mu_1 - \mu_2 = 0\)</span>), <span class="math inline">\(T\)</span> is said to be distributed following a <span class="math inline">\(t\)</span> distribution with degrees of freedom equal to the smaller value of <span class="math inline">\(n_1 - 1\)</span> and <span class="math inline">\(n_2 - 1\)</span>. The “degrees of freedom” can be thought of as measuring how different the <span class="math inline">\(t\)</span> distribution will be as compared to a normal distribution. Small sample sizes lead to small degrees of freedom and, thus, <span class="math inline">\(t\)</span> distributions that have more values in the tails of their distributions. Large sample sizes lead to large degrees of freedom and, thus, <span class="math inline">\(t\)</span> distributions that closely align with the standard normal, bell-shaped curve.</p>
 <p>So, assuming <span class="math inline">\(H_0\)</span> is true, our formula simplifies a bit:</p>
-<p><span class="math display">\[T =\dfrac{ \bar{x}_1 - \bar{x}_2}{ \sqrt{\dfrac{s_1^2}{n_1} + \dfrac{s_2^2}{n_2}}  }\]</span></p>
+<p><span class="math display">\[T =\dfrac{ \bar{x}_1 - \bar{x}_2}{ \sqrt{\dfrac{{s_1}^2}{n_1} + \dfrac{{s_2}^2}{n_2}}  }.\]</span></p>
 <p>We have already built an approximation for what we think the distribution of <span class="math inline">\(\delta = \bar{x}_1 - \bar{x}_2\)</span> looks like using randomization above. Recall this distribution:</p>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">ggplot</span>(<span class="dt">data =</span> rand_distn, <span class="kw">aes</span>(<span class="dt">x =</span> diffmean)) +
 <span class="st">  </span><span class="kw">geom_histogram</span>(<span class="dt">color =</span> <span class="st">&quot;white&quot;</span>, <span class="dt">bins =</span> <span class="dv">20</span>)</code></pre></div>
-<div class="figure" style="text-align: center"><span id="fig:unnamed-chunk-97"></span>
-<img src="ismaykim_files/figure-html/unnamed-chunk-97-1.png" alt="Simulated differences in means histogram" width="\textwidth" />
+<div class="figure" style="text-align: center"><span id="fig:unnamed-chunk-100"></span>
+<img src="ismaykim_files/figure-html/unnamed-chunk-100-1.png" alt="Simulated differences in means histogram" width="\textwidth" />
 <p class="caption">
 Figure 7.12: Simulated differences in means histogram
 </p>
 </div>
-<p>If we’d like to have a guess as to what the distribution of <span class="math inline">\(T\)</span> might look like instead, we need only to divide every value in <code>rand_distn</code> by <span class="math inline">\(\sqrt{\dfrac{s_1^2}{n_1} + \dfrac{s_2^2}{n_2}}\)</span>. As we did before, we will assign Romance to be group 1 and Action to be group 2. (This was done since Romance comes second alphabetically and the reason why we have a number mismatch below with 1 and 2.) Remember that we’ve already calculated these values:</p>
+<p>If we’d like to have a guess as to what the distribution of <span class="math inline">\(T\)</span> might look like instead, we need only to divide every value in <code>rand_distn</code> by <span class="math display">\[\sqrt{\dfrac{{s_1}^2}{n_1} + \dfrac{{s_2}^2}{n_2}}.\]</span> As we did before, we will assign Romance to be group 1 and Action to be group 2. (This was done since Romance comes second alphabetically, and it is the reason why we have a number mismatch below with 1 and 2.) Remember that we’ve already calculated these values:</p>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">kable</span>(summary_ratings)</code></pre></div>
 <table>
 <thead>
@@ -1700,14 +1099,14 @@ <h3><span class="header-section-number">7.8.1</span> EXAMPLE: <span class="math
 <tbody>
 <tr class="odd">
 <td align="left">Action</td>
-<td align="right">5.197059</td>
-<td align="right">1.464837</td>
+<td align="right">5.197</td>
+<td align="right">1.465</td>
 <td align="right">34</td>
 </tr>
 <tr class="even">
 <td align="left">Romance</td>
-<td align="right">6.026471</td>
-<td align="right">1.202096</td>
+<td align="right">6.027</td>
+<td align="right">1.202</td>
 <td align="right">34</td>
 </tr>
 </tbody>
@@ -1717,43 +1116,44 @@ <h3><span class="header-section-number">7.8.1</span> EXAMPLE: <span class="math
 s2 &lt;-<span class="st"> </span>summary_ratings$std_dev[<span class="dv">1</span>]
 n1 &lt;-<span class="st"> </span>summary_ratings$n[<span class="dv">2</span>]
 n2 &lt;-<span class="st"> </span>summary_ratings$n[<span class="dv">1</span>]</code></pre></div>
-<p>Here, we have <span class="math inline">\(s_1 = 1.2020964\)</span>, <span class="math inline">\(s_2 = 1.4648374\)</span>, <span class="math inline">\(n_1 = 34\)</span>, and <span class="math inline">\(n_2 = 34\)</span>.</p>
+<p>Here, we have <span class="math inline">\(s_1 = 1.2021\)</span>, <span class="math inline">\(s_2 = 1.4648\)</span>, <span class="math inline">\(n_1 = 34\)</span>, and <span class="math inline">\(n_2 = 34\)</span>.</p>
 <p>We can calculate the denominator via</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">denom_T &lt;-<span class="st"> </span><span class="kw">sqrt</span>( (s1^<span class="dv">2</span> /<span class="st"> </span>n1) +<span class="st"> </span>(s2^<span class="dv">2</span> /<span class="st"> </span>n2) )</code></pre></div>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">(denom_T &lt;-<span class="st"> </span><span class="kw">sqrt</span>( (s1^<span class="dv">2</span> /<span class="st"> </span>n1) +<span class="st"> </span>(s2^<span class="dv">2</span> /<span class="st"> </span>n2) ))</code></pre></div>
+<pre><code>## [1] 0.325</code></pre>
 <p>Now if we divide all of the values of <code>diffmean</code> in <code>rand_distn</code> by <code>denom_T</code> we can have a simulated distribution of <span class="math inline">\(T\)</span> test statistics instead:</p>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">rand_distn &lt;-<span class="st"> </span>rand_distn %&gt;%<span class="st"> </span>
-<span class="st">  </span><span class="kw">mutate</span>(<span class="dt">t_stat =</span> diffmean /<span class="st"> </span>denom_T)
+<span class="st">  </span><span class="kw">mutate</span>(<span class="dt">t_stat =</span> diffmean /<span class="st"> </span>denom_T)
 <span class="kw">ggplot</span>(<span class="dt">data =</span> rand_distn, <span class="kw">aes</span>(<span class="dt">x =</span> t_stat)) +
 <span class="st">  </span><span class="kw">geom_histogram</span>(<span class="dt">color =</span> <span class="st">&quot;white&quot;</span>, <span class="dt">bins =</span> <span class="dv">20</span>)</code></pre></div>
-<div class="figure" style="text-align: center"><span id="fig:unnamed-chunk-101"></span>
-<img src="ismaykim_files/figure-html/unnamed-chunk-101-1.png" alt="Simulated T statistics histogram" width="\textwidth" />
+<div class="figure" style="text-align: center"><span id="fig:unnamed-chunk-104"></span>
+<img src="ismaykim_files/figure-html/unnamed-chunk-104-1.png" alt="Simulated T statistics histogram" width="\textwidth" />
 <p class="caption">
 Figure 7.13: Simulated T statistics histogram
 </p>
 </div>
 <p>We see that the shape of this distribution is the same as that of <code>diffmean</code>. The scale has changed though with <code>t_stat</code> having more spread than <code>diffmean</code>.</p>
-<p>A traditional <span class="math inline">\(t\)</span>-test doesn’t look at this simulated distribution, but instead it looks at the <span class="math inline">\(t\)</span>-curve with degrees of freedom equal to 33 (the minimum of <span class="math inline">\(n_1 = 34 - 1 = 33\)</span> and <span class="math inline">\(n_2 = 34 - 1 = 33\)</span>). We now overlay what this <span class="math inline">\(t\)</span>-curve looks like on top of the histogram showing the simulated <span class="math inline">\(T\)</span> statistics.</p>
+<p>A traditional <span class="math inline">\(t\)</span>-test doesn’t look at this simulated distribution, but instead it looks at the <span class="math inline">\(t\)</span>-curve with degrees of freedom equal to 33 (the minimum of <span class="math inline">\(n_1 - 1 = 34 - 1 = 33\)</span> and <span class="math inline">\(n_2 - 1 = 34 - 1 = 33\)</span>). This curve is frequently called a <em>density</em> curve and this is the reason why we specify the use of <code>y = ..density..</code> here in the <code>geom_histogram</code>. We now overlay what this <span class="math inline">\(t\)</span>-curve looks like on top of the histogram showing the simulated <span class="math inline">\(T\)</span> statistics.</p>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">ggplot</span>(<span class="dt">data =</span> rand_distn, <span class="dt">mapping =</span> <span class="kw">aes</span>(<span class="dt">x =</span> t_stat)) +
-<span class="st">  </span><span class="kw">geom_histogram</span>(<span class="kw">aes</span>(<span class="dt">y =</span> ..density..), <span class="dt">color =</span> <span class="st">&quot;white&quot;</span>, <span class="dt">binwidth =</span> <span class="fl">0.1</span>) +
+<span class="st">  </span><span class="kw">geom_histogram</span>(<span class="kw">aes</span>(<span class="dt">y =</span> ..density..), <span class="dt">color =</span> <span class="st">&quot;white&quot;</span>, <span class="dt">binwidth =</span> <span class="fl">0.3</span>) +
 <span class="st">  </span><span class="kw">stat_function</span>(<span class="dt">fun =</span> dt,
     <span class="dt">args =</span> <span class="kw">list</span>(<span class="dt">df =</span> <span class="kw">min</span>(n1 -<span class="st"> </span><span class="dv">1</span>, n2 -<span class="st"> </span><span class="dv">1</span>)), 
     <span class="dt">color =</span> <span class="st">&quot;royalblue&quot;</span>, <span class="dt">size =</span> <span class="dv">2</span>)</code></pre></div>
-<p><img src="ismaykim_files/figure-html/unnamed-chunk-102-1.png" width="\textwidth" style="display: block; margin: auto;" /></p>
+<p><img src="ismaykim_files/figure-html/unnamed-chunk-105-1.png" width="\textwidth" style="display: block; margin: auto;" /></p>
 <p>We can see that the curve does a good job of approximating the randomization distribution here. (More on when to expect for this to be the case when we discuss conditions for the <span class="math inline">\(t\)</span>-test in a bit.) To calculate the <span class="math inline">\(p\)</span>-value in this case, we need to figure out how much of the total area under the <span class="math inline">\(t\)</span>-curve is at our observed <span class="math inline">\(T\)</span>-statistic or more, plus also adding the area under the curve at the negative value of the observed <span class="math inline">\(T\)</span>-statistic or below. (Remember this is a two-tailed test so we are looking for a difference–values in the tails of either direction.) Just as we converted all of the simulated values to <span class="math inline">\(T\)</span>-statistics, we must also do so for our observed effect <span class="math inline">\(\delta^*\)</span>:</p>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">(t_obs &lt;-<span class="st"> </span>obs_diff /<span class="st"> </span>denom_T)</code></pre></div>
-<pre><code>## [1] 2.552202</code></pre>
-<p>So graphically we are interested in finding the percentage of values that are at or above 2.5522017 or at or below -2.5522017.</p>
+<pre><code>## [1] 2.552</code></pre>
+<p>So graphically we are interested in finding the percentage of values that are at or above 2.5522 or at or below -2.5522.</p>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">ggplot</span>(<span class="dt">data =</span> rand_distn, <span class="dt">mapping =</span> <span class="kw">aes</span>(<span class="dt">x =</span> t_stat)) +
 <span class="st">  </span><span class="kw">stat_function</span>(<span class="dt">fun =</span> dt,
     <span class="dt">args =</span> <span class="kw">list</span>(<span class="dt">df =</span> <span class="kw">min</span>(n1 -<span class="st"> </span><span class="dv">1</span>, n2 -<span class="st"> </span><span class="dv">1</span>)), 
     <span class="dt">color =</span> <span class="st">&quot;royalblue&quot;</span>, <span class="dt">size =</span> <span class="dv">2</span>) +
 <span class="st">  </span><span class="kw">geom_vline</span>(<span class="dt">xintercept =</span> t_obs, <span class="dt">color =</span> <span class="st">&quot;red&quot;</span>) +
 <span class="st">  </span><span class="kw">geom_vline</span>(<span class="dt">xintercept =</span> -t_obs, <span class="dt">color =</span> <span class="st">&quot;red&quot;</span>)</code></pre></div>
-<p><img src="ismaykim_files/figure-html/unnamed-chunk-104-1.png" width="\textwidth" style="display: block; margin: auto;" /></p>
+<p><img src="ismaykim_files/figure-html/unnamed-chunk-107-1.png" width="\textwidth" style="display: block; margin: auto;" /></p>
 <p>At this point, you should make a guess as to what a reasonable value may be for the <span class="math inline">\(p\)</span>-value. Let’s say the <span class="math inline">\(p\)</span>-value is 0.01 or so. To actually perform this calculation by hand, you’d need to do some calculus. Let’s have R do it for us instead using the <code>pt</code> function.</p>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">pt</span>(t_obs, <span class="dt">df =</span> <span class="kw">min</span>(n1 -<span class="st"> </span><span class="dv">1</span>, n2 -<span class="st"> </span><span class="dv">1</span>), <span class="dt">lower.tail =</span> <span class="ot">FALSE</span>) +
 <span class="st">  </span><span class="kw">pt</span>(-t_obs, <span class="dt">df =</span> <span class="kw">min</span>(n1 -<span class="st"> </span><span class="dv">1</span>, n2 -<span class="st"> </span><span class="dv">1</span>), <span class="dt">lower.tail =</span> <span class="ot">TRUE</span>)</code></pre></div>
-<pre><code>## [1] 0.01551859</code></pre>
+<pre><code>## [1] 0.01552</code></pre>
 </div>
 <div id="conditions-for-t-test" class="section level3">
 <h3><span class="header-section-number">7.8.2</span> Conditions for t-test</h3>
@@ -1767,14 +1167,26 @@ <h3><span class="header-section-number">7.8.2</span> Conditions for t-test</h3>
 <p>Condition 2: Recall from Figure <a href="7-hypo.html#fig:movie-hist">7.6</a>, that we know how the populations are distributed. Both of them are close to normally distributed. If we are a little concerned about this assumption, we also do have samples of size larger than 30 (<span class="math inline">\(n_1 = n_2 = 34\)</span>).</p>
 <p>Condition 3: This is met since there is no natural pairing of a movie in the Action group to a movie in the Romance group.</p>
 <p>Since all three conditions are met, we can be reasonably certain that the theory-based test will match the results of the randomization-based test using shuffling. Remember that theory-based tests can produce some incorrect results if these assumptions are not carefully checked. The only assumption for randomization and computational-based methods is that the sample is selected at random. They are our preference and we strongly believe they should be yours as well, but it’s important to also see how the theory-based tests can be done and used as an approximation for the computational techniques until at least more researchers are using these techniques that utilize the power of computers.</p>
-<p>An R script file of all R code used in this chapter is available <a href="http://ismayc.github.io/moderndiver-book/07-hypo.R">here</a>.</p>
 </div>
 </div>
-<div id="whats-to-come-4" class="section level2">
-<h2><span class="header-section-number">7.9</span> What’s to come?</h2>
-<p>This chapter examined the basics of hypothesis testing with terminology and also an example of how to apply the “There is Only One Test” diagram to the Lady Tasting Tea example presented in Chapter <a href="6-simulating-randomness-via-mosaic.html#sim"><strong>??</strong></a> and to an example on comparing the IMDB ratings of action movies and romance movies. We’ll see in Chapter <a href="8-ci.html#ci">8</a> how we can provide a range of possible values for an unknown population parameter instead of just running a Yes/No decision from a hypothesis test.</p>
-<p>We will see in Chapter <a href="9-regression-via-broom.html#regress"><strong>??</strong></a> many of the same ideas we have seen with hypothesis testing and confidence intervals in the last two chapters. Regression is frequently associated both correctly and incorrectly with statistics and data analysis, so you’ll need to make sure you understand when it is appropriate and when it is not.</p>
+<div id="conclusion-3" class="section level2">
+<h2><span class="header-section-number">7.9</span> Conclusion</h2>
+<div id="script-of-r-code-3" class="section level3">
+<h3><span class="header-section-number">7.9.1</span> Script of R code</h3>
+<p>An R script file of all R code used in this chapter is available <a href="http://ismayc.github.io/moderndiver-book/07-hypo.R">here</a>.</p>
+</div>
+<div id="whats-to-come-4" class="section level3">
+<h3><span class="header-section-number">7.9.2</span> What’s to come?</h3>
+<p>This chapter examined the basics of hypothesis testing with terminology and also an example of how to apply the “There is Only One Test” diagram to the Lady Tasting Tea example presented in Chapter <a href="6-sim.html#sim">6</a> and to an example on comparing the IMDB ratings of action movies and romance movies. We’ll see in Chapter <a href="8-ci.html#ci">8</a> how we can provide a range of possible values for an unknown population parameter instead of just running a Yes/No decision from a hypothesis test.</p>
+<p>We will see in Chapter <a href="9-regress.html#regress">9</a> many of the same ideas we have seen with hypothesis testing and confidence intervals in the last two chapters. Regression is frequently associated both correctly and incorrectly with statistics and data analysis, so you’ll need to make sure you understand when it is appropriate and when it is not.</p>
 
+</div>
+</div>
+</div>
+<h3>References</h3>
+<div id="refs" class="references">
+<div id="ref-R-ggplot2movies">
+<p>Wickham, Hadley. 2015. <em>Ggplot2movies: Movies Data</em>. <a href="https://CRAN.R-project.org/package=ggplot2movies" class="uri">https://CRAN.R-project.org/package=ggplot2movies</a>.</p>
 </div>
 </div>
             </section>
@@ -1782,7 +1194,7 @@ <h2><span class="header-section-number">7.9</span> What’s to come?</h2>
           </div>
         </div>
       </div>
-<a href="6-simulating-randomness-via-mosaic.html" class="navigation navigation-prev " aria-label="Previous page"><i class="fa fa-angle-left"></i></a>
+<a href="6-sim.html" class="navigation navigation-prev " aria-label="Previous page"><i class="fa fa-angle-left"></i></a>
 <a href="8-ci.html" class="navigation navigation-next " aria-label="Next page"><i class="fa fa-angle-right"></i></a>
 
 <script src="libs/gitbook-2.6.7/js/app.min.js"></script>
diff --git a/docs/8-ci.html b/docs/8-ci.html
index d0df1a895..91a1c8c2e 100644
--- a/docs/8-ci.html
+++ b/docs/8-ci.html
@@ -26,7 +26,7 @@
 <meta name="author" content="Chester Ismay and Albert Y. Kim">
 
 
-<meta name="date" content="2017-01-07">
+<meta name="date" content="2017-01-10">
 
   <meta name="viewport" content="width=device-width, initial-scale=1">
   <meta name="apple-mobile-web-app-capable" content="yes">
@@ -34,7 +34,7 @@
   
   
 <link rel="prev" href="7-hypo.html">
-<link rel="next" href="9-regression-via-broom.html">
+<link rel="next" href="9-regress.html">
 
 <script src="libs/jquery-2.2.3/jquery.min.js"></script>
 <link href="libs/gitbook-2.6.7/css/style.css" rel="stylesheet" />
@@ -119,9 +119,9 @@
 
 <ul class="summary">
 <li class="chapter" data-level="1" data-path="index.html"><a href="index.html"><i class="fa fa-check"></i><b>1</b> Preamble</a><ul>
-<li class="chapter" data-level="1.1" data-path="index.html"><a href="index.html#principles-of-this-book"><i class="fa fa-check"></i><b>1.1</b> Principles of this Book</a></li>
+<li class="chapter" data-level="1.1" data-path="index.html"><a href="index.html#principles-of-this-book---for-instructors"><i class="fa fa-check"></i><b>1.1</b> Principles of this Book - For Instructors</a></li>
 <li class="chapter" data-level="1.2" data-path="index.html"><a href="index.html#contribute"><i class="fa fa-check"></i><b>1.2</b> Contribute</a></li>
-<li class="chapter" data-level="1.3" data-path="index.html"><a href="index.html#getting-started"><i class="fa fa-check"></i><b>1.3</b> Getting Started</a></li>
+<li class="chapter" data-level="1.3" data-path="index.html"><a href="index.html#getting-started---for-students"><i class="fa fa-check"></i><b>1.3</b> Getting Started - For Students</a></li>
 <li class="chapter" data-level="" data-path="index.html"><a href="index.html#colophon"><i class="fa fa-check"></i>Colophon</a></li>
 </ul></li>
 <li class="chapter" data-level="2" data-path="2-intro.html"><a href="2-intro.html"><i class="fa fa-check"></i><b>2</b> Introduction</a><ul>
@@ -133,87 +133,100 @@
 </ul></li>
 <li class="part"><span><b>I Data Exploration</b></span></li>
 <li class="chapter" data-level="3" data-path="3-tidy.html"><a href="3-tidy.html"><i class="fa fa-check"></i><b>3</b> Tidy Data</a><ul>
+<li class="chapter" data-level="" data-path="3-tidy.html"><a href="3-tidy.html#needed-packages"><i class="fa fa-check"></i>Needed packages</a></li>
 <li class="chapter" data-level="3.1" data-path="3-tidy.html"><a href="3-tidy.html#what-is-tidy-data"><i class="fa fa-check"></i><b>3.1</b> What is tidy data?</a></li>
-<li class="chapter" data-level="3.2" data-path="3-tidy.html"><a href="3-tidy.html#the-nycflights13-datasets"><i class="fa fa-check"></i><b>3.2</b> The <code>nycflights13</code> datasets</a></li>
-<li class="chapter" data-level="3.3" data-path="3-tidy.html"><a href="3-tidy.html#how-is-flights-tidy"><i class="fa fa-check"></i><b>3.3</b> How is <code>flights</code> tidy?</a></li>
+<li class="chapter" data-level="3.2" data-path="3-tidy.html"><a href="3-tidy.html#datasets-in-the-nycflights13-package"><i class="fa fa-check"></i><b>3.2</b> Datasets in the <code>nycflights13</code> package</a></li>
+<li class="chapter" data-level="3.3" data-path="3-tidy.html"><a href="3-tidy.html#how-is-flights-tidy"><i class="fa fa-check"></i><b>3.3</b> How is <code>flights</code> tidy?</a><ul>
+<li class="chapter" data-level="3.3.1" data-path="3-tidy.html"><a href="3-tidy.html#identification-variables"><i class="fa fa-check"></i><b>3.3.1</b> Identification variables</a></li>
+</ul></li>
 <li class="chapter" data-level="3.4" data-path="3-tidy.html"><a href="3-tidy.html#normal-forms-of-data"><i class="fa fa-check"></i><b>3.4</b> Normal forms of data</a></li>
 <li class="chapter" data-level="3.5" data-path="3-tidy.html"><a href="3-tidy.html#whats-to-come"><i class="fa fa-check"></i><b>3.5</b> What’s to come?</a></li>
 </ul></li>
-<li class="chapter" data-level="4" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html"><i class="fa fa-check"></i><b>4</b> Data Visualization via <code id="viz">ggplot2</code></a><ul>
-<li class="chapter" data-level="" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#needed-packages"><i class="fa fa-check"></i>Needed packages</a></li>
-<li class="chapter" data-level="4.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#grammarofgraphics"><i class="fa fa-check"></i><b>4.1</b> The Grammar of Graphics</a><ul>
-<li class="chapter" data-level="4.1.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#components-of-grammar"><i class="fa fa-check"></i><b>4.1.1</b> Components of Grammar</a></li>
-<li class="chapter" data-level="4.1.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#napoleans-march-on-moscow"><i class="fa fa-check"></i><b>4.1.2</b> Napolean’s March on Moscow</a></li>
-<li class="chapter" data-level="4.1.3" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#other-components-of-the-grammar"><i class="fa fa-check"></i><b>4.1.3</b> Other Components of the Grammar</a></li>
-<li class="chapter" data-level="4.1.4" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#the-ggplot2-package"><i class="fa fa-check"></i><b>4.1.4</b> The <code>ggplot2</code> Package</a></li>
-</ul></li>
-<li class="chapter" data-level="4.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#five-named-graphs---the-5ng"><i class="fa fa-check"></i><b>4.2</b> Five Named Graphs - The 5NG</a></li>
-<li class="chapter" data-level="4.3" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#scatterplots"><i class="fa fa-check"></i><b>4.3</b> 5NG#1: Scatter-plots</a><ul>
-<li class="chapter" data-level="4.3.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#scatter-plots-via-geom_point"><i class="fa fa-check"></i><b>4.3.1</b> Scatter-plots via <code id="geompoint">geom_point</code></a></li>
-<li class="chapter" data-level="4.3.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#over-plotting"><i class="fa fa-check"></i><b>4.3.2</b> Over-Plotting</a></li>
-<li class="chapter" data-level="4.3.3" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#summary"><i class="fa fa-check"></i><b>4.3.3</b> Summary</a></li>
-</ul></li>
-<li class="chapter" data-level="4.4" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#linegraphs"><i class="fa fa-check"></i><b>4.4</b> 5NG#2: Line-graphs</a><ul>
-<li class="chapter" data-level="4.4.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#line-graphs-via-geom_line"><i class="fa fa-check"></i><b>4.4.1</b> Line-graphs via <code id="geomline">geom_line</code></a></li>
-<li class="chapter" data-level="4.4.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#summary-1"><i class="fa fa-check"></i><b>4.4.2</b> Summary</a></li>
-</ul></li>
-<li class="chapter" data-level="4.5" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#histograms"><i class="fa fa-check"></i><b>4.5</b> 5NG#3: Histograms</a><ul>
-<li class="chapter" data-level="4.5.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#histograms-via-geom_histogram"><i class="fa fa-check"></i><b>4.5.1</b> Histograms via <code id="geomhistogram">geom_histogram</code></a></li>
-<li class="chapter" data-level="4.5.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#adjustbins"><i class="fa fa-check"></i><b>4.5.2</b> Adjusting the Bins</a></li>
-<li class="chapter" data-level="4.5.3" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#summary-2"><i class="fa fa-check"></i><b>4.5.3</b> Summary</a></li>
-</ul></li>
-<li class="chapter" data-level="4.6" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#facets"><i class="fa fa-check"></i><b>4.6</b> Facets</a></li>
-<li class="chapter" data-level="4.7" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#ng4-boxplots"><i class="fa fa-check"></i><b>4.7</b> 5NG#4: Boxplots</a><ul>
-<li class="chapter" data-level="4.7.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#boxplots-via-geom_boxplot"><i class="fa fa-check"></i><b>4.7.1</b> Boxplots via <code id="geomboxplot">geom_boxplot</code></a></li>
-<li class="chapter" data-level="4.7.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#summary-3"><i class="fa fa-check"></i><b>4.7.2</b> Summary</a></li>
-</ul></li>
-<li class="chapter" data-level="4.8" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#geombar"><i class="fa fa-check"></i><b>4.8</b> 5NG#5: Barplots</a><ul>
-<li class="chapter" data-level="4.8.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#barplots-via-geom_bar"><i class="fa fa-check"></i><b>4.8.1</b> Barplots via <code>geom_bar</code></a></li>
-<li class="chapter" data-level="4.8.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#must-avoid-pie-charts"><i class="fa fa-check"></i><b>4.8.2</b> Must avoid pie charts!</a></li>
-<li class="chapter" data-level="4.8.3" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#using-barplots-to-compare-two-variables"><i class="fa fa-check"></i><b>4.8.3</b> Using barplots to compare two variables</a></li>
-<li class="chapter" data-level="4.8.4" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#summary-4"><i class="fa fa-check"></i><b>4.8.4</b> Summary</a></li>
-</ul></li>
-<li class="chapter" data-level="4.9" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#conclusion"><i class="fa fa-check"></i><b>4.9</b> Conclusion</a><ul>
-<li class="chapter" data-level="4.9.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#whats-to-come-1"><i class="fa fa-check"></i><b>4.9.1</b> What’s to come?</a></li>
-<li class="chapter" data-level="4.9.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#script-of-r-code"><i class="fa fa-check"></i><b>4.9.2</b> Script of R code</a></li>
-</ul></li>
-</ul></li>
-<li class="chapter" data-level="5" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html"><i class="fa fa-check"></i><b>5</b> Data Manipulation via <code id="manip">dplyr</code></a><ul>
-<li class="chapter" data-level="" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#needed-packages-1"><i class="fa fa-check"></i>Needed packages</a></li>
-<li class="chapter" data-level="5.1" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#the-pipe"><i class="fa fa-check"></i><b>5.1</b> The pipe <code>%&gt;%</code></a></li>
-<li class="chapter" data-level="5.2" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#four-main-verbs---the-4mv"><i class="fa fa-check"></i><b>5.2</b> Four Main Verbs - The 4MV</a><ul>
-<li class="chapter" data-level="5.2.1" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#filter-observations-using-filter"><i class="fa fa-check"></i><b>5.2.1</b> Filter observations using <code id="filter">filter</code></a></li>
-<li class="chapter" data-level="5.2.2" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#summarize-variables-using-summarize"><i class="fa fa-check"></i><b>5.2.2</b> Summarize variables using <code>summarize</code></a></li>
-<li class="chapter" data-level="5.2.3" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#create-new-variableschange-old-variables-using-mutate"><i class="fa fa-check"></i><b>5.2.3</b> Create new variables/change old variables using <code>mutate</code></a></li>
-<li class="chapter" data-level="5.2.4" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#reorder-the-data-frame-using-arrange"><i class="fa fa-check"></i><b>5.2.4</b> Reorder the data frame using <code id="arrange">arrange</code></a></li>
-</ul></li>
-<li class="chapter" data-level="5.3" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#other-verbs"><i class="fa fa-check"></i><b>5.3</b> Other verbs</a><ul>
-<li class="chapter" data-level="5.3.1" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#select-variables-using-select"><i class="fa fa-check"></i><b>5.3.1</b> Select variables using <code id="select">select</code></a></li>
-<li class="chapter" data-level="5.3.2" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#rename-variables-using-rename"><i class="fa fa-check"></i><b>5.3.2</b> Rename variables using <code id="rename">rename</code></a></li>
-<li class="chapter" data-level="5.3.3" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#find-the-top-number-of-values-using-top_n"><i class="fa fa-check"></i><b>5.3.3</b> Find the top number of values using <code>top_n</code></a></li>
-</ul></li>
-<li class="chapter" data-level="5.4" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#joiningmerging-data-frames"><i class="fa fa-check"></i><b>5.4</b> Joining/merging data frames</a></li>
-<li class="chapter" data-level="5.5" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#script-of-r-code-1"><i class="fa fa-check"></i><b>5.5</b> Script of R code</a></li>
-<li class="chapter" data-level="5.6" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#whats-to-come-2"><i class="fa fa-check"></i><b>5.6</b> What’s to come?</a></li>
+<li class="chapter" data-level="4" data-path="4-viz.html"><a href="4-viz.html"><i class="fa fa-check"></i><b>4</b> Data Visualization via ggplot2</a><ul>
+<li class="chapter" data-level="" data-path="4-viz.html"><a href="4-viz.html#needed-packages-1"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="4.1" data-path="4-viz.html"><a href="4-viz.html#grammarofgraphics"><i class="fa fa-check"></i><b>4.1</b> The Grammar of Graphics</a><ul>
+<li class="chapter" data-level="4.1.1" data-path="4-viz.html"><a href="4-viz.html#components-of-grammar"><i class="fa fa-check"></i><b>4.1.1</b> Components of Grammar</a></li>
+<li class="chapter" data-level="4.1.2" data-path="4-viz.html"><a href="4-viz.html#napoleans-march-on-moscow"><i class="fa fa-check"></i><b>4.1.2</b> Napoleon’s March on Moscow</a></li>
+<li class="chapter" data-level="4.1.3" data-path="4-viz.html"><a href="4-viz.html#other-components-of-the-grammar"><i class="fa fa-check"></i><b>4.1.3</b> Other Components of the Grammar</a></li>
+<li class="chapter" data-level="4.1.4" data-path="4-viz.html"><a href="4-viz.html#the-ggplot2-package"><i class="fa fa-check"></i><b>4.1.4</b> The ggplot2 Package</a></li>
+</ul></li>
+<li class="chapter" data-level="4.2" data-path="4-viz.html"><a href="4-viz.html#five-named-graphs---the-5ng"><i class="fa fa-check"></i><b>4.2</b> Five Named Graphs - The 5NG</a></li>
+<li class="chapter" data-level="4.3" data-path="4-viz.html"><a href="4-viz.html#scatterplots"><i class="fa fa-check"></i><b>4.3</b> 5NG#1: Scatter-plots</a><ul>
+<li class="chapter" data-level="4.3.1" data-path="4-viz.html"><a href="4-viz.html#geompoint"><i class="fa fa-check"></i><b>4.3.1</b> Scatter-plots via geom_point</a></li>
+<li class="chapter" data-level="4.3.2" data-path="4-viz.html"><a href="4-viz.html#over-plotting"><i class="fa fa-check"></i><b>4.3.2</b> Over-Plotting</a></li>
+<li class="chapter" data-level="4.3.3" data-path="4-viz.html"><a href="4-viz.html#summary"><i class="fa fa-check"></i><b>4.3.3</b> Summary</a></li>
+</ul></li>
+<li class="chapter" data-level="4.4" data-path="4-viz.html"><a href="4-viz.html#linegraphs"><i class="fa fa-check"></i><b>4.4</b> 5NG#2: Line-graphs</a><ul>
+<li class="chapter" data-level="4.4.1" data-path="4-viz.html"><a href="4-viz.html#geomline"><i class="fa fa-check"></i><b>4.4.1</b> Line-graphs via geom_line</a></li>
+<li class="chapter" data-level="4.4.2" data-path="4-viz.html"><a href="4-viz.html#summary-1"><i class="fa fa-check"></i><b>4.4.2</b> Summary</a></li>
+</ul></li>
+<li class="chapter" data-level="4.5" data-path="4-viz.html"><a href="4-viz.html#histograms"><i class="fa fa-check"></i><b>4.5</b> 5NG#3: Histograms</a><ul>
+<li class="chapter" data-level="4.5.1" data-path="4-viz.html"><a href="4-viz.html#geomhistogram"><i class="fa fa-check"></i><b>4.5.1</b> Histograms via geom_histogram</a></li>
+<li class="chapter" data-level="4.5.2" data-path="4-viz.html"><a href="4-viz.html#adjustbins"><i class="fa fa-check"></i><b>4.5.2</b> Adjusting the Bins</a></li>
+<li class="chapter" data-level="4.5.3" data-path="4-viz.html"><a href="4-viz.html#summary-2"><i class="fa fa-check"></i><b>4.5.3</b> Summary</a></li>
+</ul></li>
+<li class="chapter" data-level="4.6" data-path="4-viz.html"><a href="4-viz.html#facets"><i class="fa fa-check"></i><b>4.6</b> Facets</a></li>
+<li class="chapter" data-level="4.7" data-path="4-viz.html"><a href="4-viz.html#ng4-boxplots"><i class="fa fa-check"></i><b>4.7</b> 5NG#4: Boxplots</a><ul>
+<li class="chapter" data-level="4.7.1" data-path="4-viz.html"><a href="4-viz.html#geomboxplot"><i class="fa fa-check"></i><b>4.7.1</b> Boxplots via geom_boxplot</a></li>
+<li class="chapter" data-level="4.7.2" data-path="4-viz.html"><a href="4-viz.html#summary-3"><i class="fa fa-check"></i><b>4.7.2</b> Summary</a></li>
+</ul></li>
+<li class="chapter" data-level="4.8" data-path="4-viz.html"><a href="4-viz.html#geombar"><i class="fa fa-check"></i><b>4.8</b> 5NG#5: Barplots</a><ul>
+<li class="chapter" data-level="4.8.1" data-path="4-viz.html"><a href="4-viz.html#barplots-via-geom_bar"><i class="fa fa-check"></i><b>4.8.1</b> Barplots via geom_bar</a></li>
+<li class="chapter" data-level="4.8.2" data-path="4-viz.html"><a href="4-viz.html#must-avoid-pie-charts"><i class="fa fa-check"></i><b>4.8.2</b> Must avoid pie charts!</a></li>
+<li class="chapter" data-level="4.8.3" data-path="4-viz.html"><a href="4-viz.html#using-barplots-to-compare-two-variables"><i class="fa fa-check"></i><b>4.8.3</b> Using barplots to compare two variables</a></li>
+<li class="chapter" data-level="4.8.4" data-path="4-viz.html"><a href="4-viz.html#summary-4"><i class="fa fa-check"></i><b>4.8.4</b> Summary</a></li>
+</ul></li>
+<li class="chapter" data-level="4.9" data-path="4-viz.html"><a href="4-viz.html#conclusion"><i class="fa fa-check"></i><b>4.9</b> Conclusion</a><ul>
+<li class="chapter" data-level="4.9.1" data-path="4-viz.html"><a href="4-viz.html#resources"><i class="fa fa-check"></i><b>4.9.1</b> Resources</a></li>
+<li class="chapter" data-level="4.9.2" data-path="4-viz.html"><a href="4-viz.html#script-of-r-code"><i class="fa fa-check"></i><b>4.9.2</b> Script of R code</a></li>
+<li class="chapter" data-level="4.9.3" data-path="4-viz.html"><a href="4-viz.html#whats-to-come-1"><i class="fa fa-check"></i><b>4.9.3</b> What’s to come?</a></li>
+</ul></li>
+</ul></li>
+<li class="chapter" data-level="5" data-path="5-manip.html"><a href="5-manip.html"><i class="fa fa-check"></i><b>5</b> Data Manipulation via dplyr</a><ul>
+<li class="chapter" data-level="" data-path="5-manip.html"><a href="5-manip.html#needed-packages-2"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="5.1" data-path="5-manip.html"><a href="5-manip.html#the-pipe"><i class="fa fa-check"></i><b>5.1</b> The pipe <code>%&gt;%</code></a></li>
+<li class="chapter" data-level="5.2" data-path="5-manip.html"><a href="5-manip.html#five-main-verbs---the-5mv"><i class="fa fa-check"></i><b>5.2</b> Five Main Verbs - The 5MV</a><ul>
+<li class="chapter" data-level="5.2.1" data-path="5-manip.html"><a href="5-manip.html#filter"><i class="fa fa-check"></i><b>5.2.1</b> 5MV#1: Filter observations using filter</a></li>
+<li class="chapter" data-level="5.2.2" data-path="5-manip.html"><a href="5-manip.html#mv2-summarize-variables-using-summarize"><i class="fa fa-check"></i><b>5.2.2</b> 5MV#2: Summarize variables using summarize</a></li>
+<li class="chapter" data-level="5.2.3" data-path="5-manip.html"><a href="5-manip.html#mv3-group-rows-using-group_by"><i class="fa fa-check"></i><b>5.2.3</b> 5MV#3: Group rows using group_by</a></li>
+<li class="chapter" data-level="5.2.4" data-path="5-manip.html"><a href="5-manip.html#mv4-create-new-variableschange-old-variables-using-mutate"><i class="fa fa-check"></i><b>5.2.4</b> 5MV#4: Create new variables/change old variables using mutate</a></li>
+<li class="chapter" data-level="5.2.5" data-path="5-manip.html"><a href="5-manip.html#arrange"><i class="fa fa-check"></i><b>5.2.5</b> 5MV#5: Reorder the data frame using arrange</a></li>
+</ul></li>
+<li class="chapter" data-level="5.3" data-path="5-manip.html"><a href="5-manip.html#joining-data-frames"><i class="fa fa-check"></i><b>5.3</b> Joining data frames</a><ul>
+<li class="chapter" data-level="5.3.1" data-path="5-manip.html"><a href="5-manip.html#joining-by-key-variables"><i class="fa fa-check"></i><b>5.3.1</b> Joining by Key Variables</a></li>
+<li class="chapter" data-level="5.3.2" data-path="5-manip.html"><a href="5-manip.html#joining-by-key-variables-with-different-names"><i class="fa fa-check"></i><b>5.3.2</b> Joining by Key Variables with Different Names</a></li>
+</ul></li>
+<li class="chapter" data-level="5.4" data-path="5-manip.html"><a href="5-manip.html#optional-other-verbs"><i class="fa fa-check"></i><b>5.4</b> Optional: Other verbs</a><ul>
+<li class="chapter" data-level="5.4.1" data-path="5-manip.html"><a href="5-manip.html#select"><i class="fa fa-check"></i><b>5.4.1</b> Select variables using select</a></li>
+<li class="chapter" data-level="5.4.2" data-path="5-manip.html"><a href="5-manip.html#rename"><i class="fa fa-check"></i><b>5.4.2</b> Rename variables using rename</a></li>
+<li class="chapter" data-level="5.4.3" data-path="5-manip.html"><a href="5-manip.html#find-the-top-number-of-values-using-top_n"><i class="fa fa-check"></i><b>5.4.3</b> Find the top number of values using top_n</a></li>
+</ul></li>
+<li class="chapter" data-level="5.5" data-path="5-manip.html"><a href="5-manip.html#conclusion-1"><i class="fa fa-check"></i><b>5.5</b> Conclusion</a><ul>
+<li class="chapter" data-level="5.5.1" data-path="5-manip.html"><a href="5-manip.html#resources-1"><i class="fa fa-check"></i><b>5.5.1</b> Resources</a></li>
+<li class="chapter" data-level="5.5.2" data-path="5-manip.html"><a href="5-manip.html#script-of-r-code-1"><i class="fa fa-check"></i><b>5.5.2</b> Script of R code</a></li>
+<li class="chapter" data-level="5.5.3" data-path="5-manip.html"><a href="5-manip.html#whats-to-come-2"><i class="fa fa-check"></i><b>5.5.3</b> What’s to come?</a></li>
+</ul></li>
 </ul></li>
 <li class="part"><span><b>II Inference</b></span></li>
-<li class="chapter" data-level="6" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html"><i class="fa fa-check"></i><b>6</b> Simulating Randomness via <code id="sim">mosaic</code></a><ul>
-<li class="chapter" data-level="" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#needed-packages-2"><i class="fa fa-check"></i>Needed packages</a></li>
-<li class="chapter" data-level="6.1" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#random-sampling"><i class="fa fa-check"></i><b>6.1</b> Random sampling</a><ul>
-<li class="chapter" data-level="6.1.1" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#tasting-soup"><i class="fa fa-check"></i><b>6.1.1</b> Tasting soup</a></li>
-<li class="chapter" data-level="6.1.2" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#common-terms"><i class="fa fa-check"></i><b>6.1.2</b> Common terms</a></li>
+<li class="chapter" data-level="6" data-path="6-sim.html"><a href="6-sim.html"><i class="fa fa-check"></i><b>6</b> Simulating Randomness via mosaic</a><ul>
+<li class="chapter" data-level="" data-path="6-sim.html"><a href="6-sim.html#needed-packages-3"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="6.1" data-path="6-sim.html"><a href="6-sim.html#random-sampling"><i class="fa fa-check"></i><b>6.1</b> Random sampling</a><ul>
+<li class="chapter" data-level="6.1.1" data-path="6-sim.html"><a href="6-sim.html#tasting-soup"><i class="fa fa-check"></i><b>6.1.1</b> Tasting soup</a></li>
+<li class="chapter" data-level="6.1.2" data-path="6-sim.html"><a href="6-sim.html#common-terms"><i class="fa fa-check"></i><b>6.1.2</b> Common terms</a></li>
+</ul></li>
+<li class="chapter" data-level="6.2" data-path="6-sim.html"><a href="6-sim.html#visualizing-sampling"><i class="fa fa-check"></i><b>6.2</b> Visualizing sampling</a><ul>
+<li class="chapter" data-level="6.2.1" data-path="6-sim.html"><a href="6-sim.html#sampling-distribution"><i class="fa fa-check"></i><b>6.2.1</b> Sampling distribution</a></li>
+<li class="chapter" data-level="6.2.2" data-path="6-sim.html"><a href="6-sim.html#repeated-sampling-via-do"><i class="fa fa-check"></i><b>6.2.2</b> Repeated sampling via <code>do</code></a></li>
 </ul></li>
-<li class="chapter" data-level="6.2" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#visualizing-sampling"><i class="fa fa-check"></i><b>6.2</b> Visualizing sampling</a><ul>
-<li class="chapter" data-level="6.2.1" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#sampling-distribution"><i class="fa fa-check"></i><b>6.2.1</b> Sampling distribution</a></li>
-<li class="chapter" data-level="6.2.2" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#repeated-sampling-via-do"><i class="fa fa-check"></i><b>6.2.2</b> Repeated sampling via <code>do</code></a></li>
+<li class="chapter" data-level="6.3" data-path="6-sim.html"><a href="6-sim.html#simulation"><i class="fa fa-check"></i><b>6.3</b> Simulation</a></li>
+<li class="chapter" data-level="6.4" data-path="6-sim.html"><a href="6-sim.html#review-of-mosaic-simulation-functions"><i class="fa fa-check"></i><b>6.4</b> Review of <code>mosaic</code> simulation functions</a></li>
+<li class="chapter" data-level="6.5" data-path="6-sim.html"><a href="6-sim.html#conclusion-2"><i class="fa fa-check"></i><b>6.5</b> Conclusion</a><ul>
+<li class="chapter" data-level="6.5.1" data-path="6-sim.html"><a href="6-sim.html#script-of-r-code-2"><i class="fa fa-check"></i><b>6.5.1</b> Script of R code</a></li>
+<li class="chapter" data-level="6.5.2" data-path="6-sim.html"><a href="6-sim.html#whats-to-come-3"><i class="fa fa-check"></i><b>6.5.2</b> What’s to come?</a></li>
 </ul></li>
-<li class="chapter" data-level="6.3" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#simulation"><i class="fa fa-check"></i><b>6.3</b> Simulation</a></li>
-<li class="chapter" data-level="6.4" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#review-of-mosaic-simulation-functions"><i class="fa fa-check"></i><b>6.4</b> Review of <code>mosaic</code> simulation functions</a></li>
-<li class="chapter" data-level="6.5" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#script-of-r-code-2"><i class="fa fa-check"></i><b>6.5</b> Script of R code</a></li>
-<li class="chapter" data-level="6.6" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#whats-to-come-3"><i class="fa fa-check"></i><b>6.6</b> What’s to come?</a></li>
 </ul></li>
 <li class="chapter" data-level="7" data-path="7-hypo.html"><a href="7-hypo.html"><i class="fa fa-check"></i><b>7</b> Hypothesis Testing</a><ul>
-<li class="chapter" data-level="" data-path="7-hypo.html"><a href="7-hypo.html#needed-packages-3"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="" data-path="7-hypo.html"><a href="7-hypo.html#needed-packages-4"><i class="fa fa-check"></i>Needed packages</a></li>
 <li class="chapter" data-level="7.1" data-path="7-hypo.html"><a href="7-hypo.html#when-inference-is-not-needed"><i class="fa fa-check"></i><b>7.1</b> When Inference Is Not Needed</a></li>
 <li class="chapter" data-level="7.2" data-path="7-hypo.html"><a href="7-hypo.html#basics-of-hypothesis-testing"><i class="fa fa-check"></i><b>7.2</b> Basics of Hypothesis Testing</a></li>
 <li class="chapter" data-level="7.3" data-path="7-hypo.html"><a href="7-hypo.html#trial"><i class="fa fa-check"></i><b>7.3</b> Criminal trial analogy</a><ul>
@@ -249,44 +262,50 @@
 <li class="chapter" data-level="7.8.1" data-path="7-hypo.html"><a href="7-hypo.html#example-t-test-for-two-independent-samples"><i class="fa fa-check"></i><b>7.8.1</b> EXAMPLE: <span class="math inline">\(t\)</span>-test for two independent samples</a></li>
 <li class="chapter" data-level="7.8.2" data-path="7-hypo.html"><a href="7-hypo.html#conditions-for-t-test"><i class="fa fa-check"></i><b>7.8.2</b> Conditions for t-test</a></li>
 </ul></li>
-<li class="chapter" data-level="7.9" data-path="7-hypo.html"><a href="7-hypo.html#whats-to-come-4"><i class="fa fa-check"></i><b>7.9</b> What’s to come?</a></li>
+<li class="chapter" data-level="7.9" data-path="7-hypo.html"><a href="7-hypo.html#conclusion-3"><i class="fa fa-check"></i><b>7.9</b> Conclusion</a><ul>
+<li class="chapter" data-level="7.9.1" data-path="7-hypo.html"><a href="7-hypo.html#script-of-r-code-3"><i class="fa fa-check"></i><b>7.9.1</b> Script of R code</a></li>
+<li class="chapter" data-level="7.9.2" data-path="7-hypo.html"><a href="7-hypo.html#whats-to-come-4"><i class="fa fa-check"></i><b>7.9.2</b> What’s to come?</a></li>
+</ul></li>
 </ul></li>
 <li class="chapter" data-level="8" data-path="8-ci.html"><a href="8-ci.html"><i class="fa fa-check"></i><b>8</b> Confidence Intervals</a><ul>
-<li class="chapter" data-level="" data-path="8-ci.html"><a href="8-ci.html#needed-packages-4"><i class="fa fa-check"></i>Needed packages</a></li>
-<li class="chapter" data-level="8.1" data-path="8-ci.html"><a href="8-ci.html#bootstrapping"><i class="fa fa-check"></i><b>8.1</b> Bootstrapping</a></li>
-<li class="chapter" data-level="8.2" data-path="8-ci.html"><a href="8-ci.html#mean_rating"><i class="fa fa-check"></i><b>8.2</b> mean_rating</a><ul>
-<li class="chapter" data-level="8.2.1" data-path="8-ci.html"><a href="8-ci.html#review-of-bootstrapping"><i class="fa fa-check"></i><b>8.2.1</b> Review of Bootstrapping</a></li>
-</ul></li>
-<li class="chapter" data-level="8.3" data-path="8-ci.html"><a href="8-ci.html#relation-to-hypothesis-testing"><i class="fa fa-check"></i><b>8.3</b> Relation to hypothesis testing</a></li>
-<li class="chapter" data-level="8.4" data-path="8-ci.html"><a href="8-ci.html#effect-size"><i class="fa fa-check"></i><b>8.4</b> Effect size</a></li>
-<li class="chapter" data-level="8.5" data-path="8-ci.html"><a href="8-ci.html#script-of-r-code-3"><i class="fa fa-check"></i><b>8.5</b> Script of R code</a></li>
-<li class="chapter" data-level="8.6" data-path="8-ci.html"><a href="8-ci.html#whats-to-come-5"><i class="fa fa-check"></i><b>8.6</b> What’s to come?</a></li>
-</ul></li>
-<li class="chapter" data-level="9" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html"><i class="fa fa-check"></i><b>9</b> Regression via <code id="regress">broom</code></a><ul>
-<li class="chapter" data-level="" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#needed-packages-5"><i class="fa fa-check"></i>Needed packages</a></li>
-<li class="chapter" data-level="9.1" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#example-alaskan-airlines-delays"><i class="fa fa-check"></i><b>9.1</b> EXAMPLE: Alaskan Airlines delays</a></li>
-<li class="chapter" data-level="9.2" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#correlation"><i class="fa fa-check"></i><b>9.2</b> Correlation</a><ul>
-<li class="chapter" data-level="9.2.1" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#correlation-does-not-imply-causation"><i class="fa fa-check"></i><b>9.2.1</b> Correlation does not imply causation</a></li>
-</ul></li>
-<li class="chapter" data-level="9.3" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#linear-regression"><i class="fa fa-check"></i><b>9.3</b> Linear regression</a><ul>
-<li class="chapter" data-level="9.3.1" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#understanding-linear-regression-basics"><i class="fa fa-check"></i><b>9.3.1</b> Understanding linear regression basics</a></li>
-<li class="chapter" data-level="9.3.2" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#the-equation-of-the-line"><i class="fa fa-check"></i><b>9.3.2</b> The equation of the line</a></li>
-<li class="chapter" data-level="9.3.3" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#interpretting-the-slope"><i class="fa fa-check"></i><b>9.3.3</b> Interpretting the slope</a></li>
-<li class="chapter" data-level="9.3.4" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#predicting-values"><i class="fa fa-check"></i><b>9.3.4</b> Predicting values</a></li>
-</ul></li>
-<li class="chapter" data-level="9.4" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#inference-for-regression"><i class="fa fa-check"></i><b>9.4</b> Inference for regression</a><ul>
-<li class="chapter" data-level="9.4.1" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#data-2"><i class="fa fa-check"></i><b>9.4.1</b> Data</a></li>
-<li class="chapter" data-level="9.4.2" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#test-statistic-delta-2"><i class="fa fa-check"></i><b>9.4.2</b> Test Statistic <span class="math inline">\(\delta\)</span></a></li>
-<li class="chapter" data-level="9.4.3" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#observed-effect-delta-2"><i class="fa fa-check"></i><b>9.4.3</b> Observed effect <span class="math inline">\(\delta^*\)</span></a></li>
-<li class="chapter" data-level="9.4.4" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#model-of-h_0-2"><i class="fa fa-check"></i><b>9.4.4</b> Model of <span class="math inline">\(H_0\)</span></a></li>
-<li class="chapter" data-level="9.4.5" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#simulated-data-2"><i class="fa fa-check"></i><b>9.4.5</b> Simulated Data</a></li>
-<li class="chapter" data-level="9.4.6" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#distribution-of-delta-under-h_0-2"><i class="fa fa-check"></i><b>9.4.6</b> Distribution of <span class="math inline">\(\delta\)</span> under <span class="math inline">\(H_0\)</span></a></li>
-<li class="chapter" data-level="9.4.7" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#the-p-value-2"><i class="fa fa-check"></i><b>9.4.7</b> The p-value</a></li>
-</ul></li>
-<li class="chapter" data-level="9.5" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#resid"><i class="fa fa-check"></i><b>9.5</b> Residual analysis</a></li>
-<li class="chapter" data-level="9.6" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#conditions-for-regression"><i class="fa fa-check"></i><b>9.6</b> Conditions for regression</a></li>
-<li class="chapter" data-level="9.7" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#script-of-r-code-4"><i class="fa fa-check"></i><b>9.7</b> Script of R code</a></li>
-<li class="chapter" data-level="9.8" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#whats-to-come-6"><i class="fa fa-check"></i><b>9.8</b> What’s to come?</a></li>
+<li class="chapter" data-level="" data-path="8-ci.html"><a href="8-ci.html#needed-packages-5"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="8.1" data-path="8-ci.html"><a href="8-ci.html#bootstrapping"><i class="fa fa-check"></i><b>8.1</b> Bootstrapping</a><ul>
+<li class="chapter" data-level="8.1.1" data-path="8-ci.html"><a href="8-ci.html#review-of-bootstrapping"><i class="fa fa-check"></i><b>8.1.1</b> Review of Bootstrapping</a></li>
+</ul></li>
+<li class="chapter" data-level="8.2" data-path="8-ci.html"><a href="8-ci.html#relation-to-hypothesis-testing"><i class="fa fa-check"></i><b>8.2</b> Relation to hypothesis testing</a></li>
+<li class="chapter" data-level="8.3" data-path="8-ci.html"><a href="8-ci.html#effect-size"><i class="fa fa-check"></i><b>8.3</b> Effect size</a></li>
+<li class="chapter" data-level="8.4" data-path="8-ci.html"><a href="8-ci.html#conclusion-4"><i class="fa fa-check"></i><b>8.4</b> Conclusion</a><ul>
+<li class="chapter" data-level="8.4.1" data-path="8-ci.html"><a href="8-ci.html#script-of-r-code-4"><i class="fa fa-check"></i><b>8.4.1</b> Script of R code</a></li>
+<li class="chapter" data-level="8.4.2" data-path="8-ci.html"><a href="8-ci.html#whats-to-come-5"><i class="fa fa-check"></i><b>8.4.2</b> What’s to come?</a></li>
+</ul></li>
+</ul></li>
+<li class="chapter" data-level="9" data-path="9-regress.html"><a href="9-regress.html"><i class="fa fa-check"></i><b>9</b> Regression via broom</a><ul>
+<li class="chapter" data-level="" data-path="9-regress.html"><a href="9-regress.html#needed-packages-6"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="9.1" data-path="9-regress.html"><a href="9-regress.html#example-alaskan-airlines-delays"><i class="fa fa-check"></i><b>9.1</b> EXAMPLE: Alaskan Airlines delays</a></li>
+<li class="chapter" data-level="9.2" data-path="9-regress.html"><a href="9-regress.html#correlation"><i class="fa fa-check"></i><b>9.2</b> Correlation</a><ul>
+<li class="chapter" data-level="9.2.1" data-path="9-regress.html"><a href="9-regress.html#correlation-does-not-imply-causation"><i class="fa fa-check"></i><b>9.2.1</b> Correlation does not imply causation</a></li>
+</ul></li>
+<li class="chapter" data-level="9.3" data-path="9-regress.html"><a href="9-regress.html#linear-regression"><i class="fa fa-check"></i><b>9.3</b> Linear regression</a><ul>
+<li class="chapter" data-level="9.3.1" data-path="9-regress.html"><a href="9-regress.html#understanding-linear-regression-basics"><i class="fa fa-check"></i><b>9.3.1</b> Understanding linear regression basics</a></li>
+<li class="chapter" data-level="9.3.2" data-path="9-regress.html"><a href="9-regress.html#the-equation-of-the-line"><i class="fa fa-check"></i><b>9.3.2</b> The equation of the line</a></li>
+<li class="chapter" data-level="9.3.3" data-path="9-regress.html"><a href="9-regress.html#interpreting-the-slope"><i class="fa fa-check"></i><b>9.3.3</b> Interpreting the slope</a></li>
+<li class="chapter" data-level="9.3.4" data-path="9-regress.html"><a href="9-regress.html#predicting-values"><i class="fa fa-check"></i><b>9.3.4</b> Predicting values</a></li>
+</ul></li>
+<li class="chapter" data-level="9.4" data-path="9-regress.html"><a href="9-regress.html#inference-for-regression"><i class="fa fa-check"></i><b>9.4</b> Inference for regression</a><ul>
+<li class="chapter" data-level="9.4.1" data-path="9-regress.html"><a href="9-regress.html#data-2"><i class="fa fa-check"></i><b>9.4.1</b> Data</a></li>
+<li class="chapter" data-level="9.4.2" data-path="9-regress.html"><a href="9-regress.html#test-statistic-delta-2"><i class="fa fa-check"></i><b>9.4.2</b> Test Statistic <span class="math inline">\(\delta\)</span></a></li>
+<li class="chapter" data-level="9.4.3" data-path="9-regress.html"><a href="9-regress.html#observed-effect-delta-2"><i class="fa fa-check"></i><b>9.4.3</b> Observed effect <span class="math inline">\(\delta^*\)</span></a></li>
+<li class="chapter" data-level="9.4.4" data-path="9-regress.html"><a href="9-regress.html#model-of-h_0-2"><i class="fa fa-check"></i><b>9.4.4</b> Model of <span class="math inline">\(H_0\)</span></a></li>
+<li class="chapter" data-level="9.4.5" data-path="9-regress.html"><a href="9-regress.html#simulated-data-2"><i class="fa fa-check"></i><b>9.4.5</b> Simulated Data</a></li>
+<li class="chapter" data-level="9.4.6" data-path="9-regress.html"><a href="9-regress.html#distribution-of-delta-under-h_0-2"><i class="fa fa-check"></i><b>9.4.6</b> Distribution of <span class="math inline">\(\delta\)</span> under <span class="math inline">\(H_0\)</span></a></li>
+<li class="chapter" data-level="9.4.7" data-path="9-regress.html"><a href="9-regress.html#the-p-value-2"><i class="fa fa-check"></i><b>9.4.7</b> The p-value</a></li>
+</ul></li>
+<li class="chapter" data-level="9.5" data-path="9-regress.html"><a href="9-regress.html#resid"><i class="fa fa-check"></i><b>9.5</b> Residual analysis</a></li>
+<li class="chapter" data-level="9.6" data-path="9-regress.html"><a href="9-regress.html#conditions-for-regression"><i class="fa fa-check"></i><b>9.6</b> Conditions for regression</a></li>
+<li class="chapter" data-level="9.7" data-path="9-regress.html"><a href="9-regress.html#conclusion-5"><i class="fa fa-check"></i><b>9.7</b> Conclusion</a><ul>
+<li class="chapter" data-level="9.7.1" data-path="9-regress.html"><a href="9-regress.html#script-of-r-code-5"><i class="fa fa-check"></i><b>9.7.1</b> Script of R code</a></li>
+<li class="chapter" data-level="9.7.2" data-path="9-regress.html"><a href="9-regress.html#whats-to-come-6"><i class="fa fa-check"></i><b>9.7.2</b> What’s to come?</a></li>
+</ul></li>
 </ul></li>
 <li class="part"><span><b>III Conclusion</b></span></li>
 <li class="chapter" data-level="10" data-path="10-effective-data-storytelling.html"><a href="10-effective-data-storytelling.html"><i class="fa fa-check"></i><b>10</b> Effective Data Storytelling</a><ul>
@@ -304,7 +323,8 @@
 </ul></li>
 </ul></li>
 <li class="chapter" data-level="B" data-path="B-appendixB.html"><a href="B-appendixB.html"><i class="fa fa-check"></i><b>B</b> Inference Examples</a><ul>
-<li class="chapter" data-level="B.1" data-path="B-appendixB.html"><a href="B-appendixB.html#needed-packages-6"><i class="fa fa-check"></i><b>B.1</b> Needed packages</a></li>
+<li class="chapter" data-level="" data-path="B-appendixB.html"><a href="B-appendixB.html#needed-packages-7"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="B.1" data-path="B-appendixB.html"><a href="B-appendixB.html#inference-mind-map"><i class="fa fa-check"></i><b>B.1</b> Inference Mind Map</a></li>
 <li class="chapter" data-level="B.2" data-path="B-appendixB.html"><a href="B-appendixB.html#one-mean"><i class="fa fa-check"></i><b>B.2</b> One Mean</a><ul>
 <li class="chapter" data-level="B.2.1" data-path="B-appendixB.html"><a href="B-appendixB.html#problem-statement"><i class="fa fa-check"></i><b>B.2.1</b> Problem Statement</a></li>
 <li class="chapter" data-level="B.2.2" data-path="B-appendixB.html"><a href="B-appendixB.html#competing-hypotheses"><i class="fa fa-check"></i><b>B.2.2</b> Competing Hypotheses</a></li>
@@ -352,6 +372,7 @@
 </ul></li>
 </ul></li>
 <li class="chapter" data-level="C" data-path="C-appendixC.html"><a href="C-appendixC.html"><i class="fa fa-check"></i><b>C</b> Reach for the Starts</a><ul>
+<li class="chapter" data-level="" data-path="C-appendixC.html"><a href="C-appendixC.html#needed-packages-8"><i class="fa fa-check"></i>Needed packages</a></li>
 <li class="chapter" data-level="C.1" data-path="C-appendixC.html"><a href="C-appendixC.html#sorted-barplots"><i class="fa fa-check"></i><b>C.1</b> Sorted barplots</a></li>
 <li class="chapter" data-level="C.2" data-path="C-appendixC.html"><a href="C-appendixC.html#interactive-graphics"><i class="fa fa-check"></i><b>C.2</b> Interactive graphics</a><ul>
 <li class="chapter" data-level="C.2.1" data-path="C-appendixC.html"><a href="C-appendixC.html#interactive-line-graphs"><i class="fa fa-check"></i><b>C.2.1</b> Interactive line-graphs</a></li>
@@ -382,8 +403,8 @@ <h1><span class="header-section-number">8</span> Confidence Intervals</h1>
 <p>A <em>confidence interval</em> gives a range of plausible values for a parameter. It depends on a specified <em>confidence level</em> with higher confidence levels corresponding to wider confidence intervals and lower confidence levels corresponding to narrower confidence intervals. Common confidence levels include 90%, 95%, and 99%.</p>
 <hr />
 <p>Usually we don’t just begin chapters with a definition, but <em>confidence intervals</em> are simple to define and play an important role in the sciences and any field that uses data. You can think of a confidence interval as playing the role of a net when fishing. Instead of just trying to catch a fish with a single spear (estimating an unknown parameter by using a single point estimate/statistic), we can use a net to try to provide a range of possible locations for the fish (use a range of possible values based around our statistic to make a plausible guess as to the location of the parameter).</p>
-<div id="needed-packages-4" class="section level2 unnumbered">
-<h2>Needed packages</h2>
+<div id="needed-packages-5" class="section level3 unnumbered">
+<h3>Needed packages</h3>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">library</span>(dplyr)
 <span class="kw">library</span>(ggplot2)
 <span class="kw">library</span>(mosaic)
@@ -395,21 +416,19 @@ <h2>Needed packages</h2>
 </div>
 <div id="bootstrapping" class="section level2">
 <h2><span class="header-section-number">8.1</span> Bootstrapping</h2>
-<p>Just as we did in Chapter <a href="7-hypo.html#hypo">7</a> with the Lady Tasting Tea when making hypotheses about a population total with which we would like to test which one is more plausible, we can also use simulation to infer conclusions about a population quantitative statistic such as the mean. In this case, we will focus on constructing confidence intervals to produce plausible values for a population mean. (We can do a similar analysis for a population median or other summary measure as well.)</p>
+<p>Just as we did in Chapter <a href="7-hypo.html#hypo">7</a> with the Lady Tasting Tea, when we formed competing hypotheses about a population total and tested which one was more plausible, we can also use computation to infer conclusions about a quantitative population parameter such as the mean. In this case, we will focus on constructing confidence intervals to produce plausible values for a population mean. (We can do a similar analysis for a population median or other summary measure as well.)</p>
 <p>Traditionally, the way to construct confidence intervals for a mean is to assume a normal distribution for the population or to invoke the Central Limit Theorem and get, what often appears to be magic, results. (This is similar to what was done in Section <a href="7-hypo.html#theory-hypo">7.8</a>.) These methods are often not intuitive, especially for those that lack a strong mathematical background. They also come with their fair share of assumptions and often turn Statistics, a field that is full of tons of useful applications to many different fields and disciplines, into a robotic procedural-based topic. It doesn’t have to be that way!</p>
-<p>In this section, we will introduce the concept of <strong>bootstrapping</strong>. It will be a useful tool that will allow us to estimate the variability of our statistic from sample to sample. One neat feature of bootstrapping is that it enables us to approximate the sampling distribution and estimate the distribution’s standard deviation using ONLY the information in the one selected (original) sample.</p>
-<p>It sounds just as plagued with the magical type qualities of traditional theory-based inference on initial glance but we will see that it provides an intuitive and useful way to make inferences, especially when the samples are of medium to large size.</p>
+<p>In this section, we will introduce the concept of <strong>bootstrapping</strong>. It will be a useful tool that will allow us to estimate the variability of our statistic from sample to sample. One neat feature of bootstrapping is that it enables us to approximate the sampling distribution and estimate the distribution’s standard deviation using ONLY the information in the one selected (original) sample. At first glance, it may sound just as plagued by the seemingly magical qualities of traditional theory-based inference, but we will see that it provides an intuitive and useful way to make inferences, especially when the samples are of medium to large size.</p>
 <p>To introduce the concept of bootstrapping, we again will use the <code>movies</code> data set in the <code>ggplot2movies</code> data frame. Remember that we load this data frame into R in much the same way as we loaded <code>flights</code> and <code>weather</code> from the <code>nycflights13</code> package.</p>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">library</span>(ggplot2movies)
 <span class="kw">data</span>(movies, <span class="dt">package =</span> <span class="st">&quot;ggplot2movies&quot;</span>)</code></pre></div>
-<p>Recall that you can also glance at this data frame using the <code>View</code> function and look at the help documentation for <code>movies</code> using the <code>?</code> function.</p>
-<p>We will explore many other features of this data set in the chapters to come, but here we will be focusing on the <code>rating</code> variable corresponding to the average IMDB user rating.</p>
-<p>You may notice that this data set is quite large: 58,788 movies have data collected about them here. This will correspond to our population of ALL movies. Remember from Chapter <a href="6-simulating-randomness-via-mosaic.html#sim"><strong>??</strong></a> that our population is rarely known. We use this data set as our population here to show you the power of bootstrapping in estimating population parameters. We’ll see how <strong>confidence intervals</strong> built using the bootstrap distribution do at including our population parameter of interest. Here we can actually calculate these values since our population is known, but remember that in general this isn’t the case.</p>
+<p>Recall that you can also glance at this data frame using the <code>View</code> function and look at the help documentation for <code>movies</code> using the <code>?</code> function. We will explore many other features of this data set in the chapters to come, but here we will be focusing on the <code>rating</code> variable corresponding to the average IMDB user rating.</p>
+<p>You may notice that this data set is quite large: 58,788 movies have data collected about them here. This will correspond to our population of ALL movies. Remember from Chapter <a href="6-sim.html#sim">6</a> that our population is rarely known. We use this data set as our population here to show you the power of bootstrapping in estimating population parameters. We’ll see how well <strong>confidence intervals</strong> built using the bootstrap distribution capture our population parameter of interest. Here we can actually calculate these values since our population is known, but remember that in general this isn’t the case.</p>
 <p>Let’s take a look at what the distribution of our population <code>ratings</code> looks like. We’ll see that we will use the distribution of our sample(s) as an estimate of this population histogram.</p>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">movies %&gt;%<span class="st"> </span><span class="kw">ggplot</span>(<span class="kw">aes</span>(<span class="dt">x =</span> rating)) +
 <span class="st">  </span><span class="kw">geom_histogram</span>(<span class="dt">color =</span> <span class="st">&quot;white&quot;</span>, <span class="dt">bins =</span> <span class="dv">20</span>)</code></pre></div>
-<div class="figure" style="text-align: center"><span id="fig:unnamed-chunk-109"></span>
-<img src="ismaykim_files/figure-html/unnamed-chunk-109-1.png" alt="Population ratings histogram" width="\textwidth" />
+<div class="figure" style="text-align: center"><span id="fig:unnamed-chunk-112"></span>
+<img src="ismaykim_files/figure-html/unnamed-chunk-112-1.png" alt="Population ratings histogram" width="\textwidth" />
 <p class="caption">
 Figure 8.1: Population ratings histogram
 </p>
@@ -421,18 +440,19 @@ <h2><span class="header-section-number">8.1</span> Bootstrapping</h2>
 </p>
 </div>
 <p><strong>(LC8.1)</strong> Why was a histogram chosen as the plot to make for the <code>rating</code> variable above?</p>
-<p><strong>(LC8.2)</strong> Why does the shape of the <code>rating</code> histogram tell us about how IMDB users rate movies? What stands out about the plot?</p>
+<p><strong>(LC8.2)</strong> What does the shape of the <code>rating</code> histogram tell us about how IMDB users rate movies? What stands out about the plot?</p>
 <hr />
 <p>It’s important to think about what our goal is here. We would like to produce a confidence interval for the population mean <code>rating</code>. We will have to pretend for a moment that we don’t have all 58,788 movies. Let’s say that we only have a random sample of 50 movies from this data set instead. In order to get a random sample, we can use the <code>resample</code> function in the <code>mosaic</code> package with <code>replace = FALSE</code>. We could also use the <code>sample_n</code> function from <code>dplyr</code>.</p>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">set.seed</span>(<span class="dv">2017</span>)
 <span class="kw">library</span>(mosaic)
+<span class="kw">library</span>(dplyr)
 movies_sample &lt;-<span class="st"> </span>movies %&gt;%<span class="st"> </span><span class="kw">resample</span>(<span class="dt">size =</span> <span class="dv">50</span>, <span class="dt">replace =</span> <span class="ot">FALSE</span>)</code></pre></div>
 <p>The <code>resample</code> function has filtered the data frame <code>movies</code> “at random” to choose only 50 rows from the larger <code>movies</code> data frame. We store information on these 50 movies in the <code>movies_sample</code> data frame.</p>
 <p>Let’s now explore what the <code>rating</code> variable looks like for these 50 movies:</p>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">movies_sample %&gt;%<span class="st"> </span><span class="kw">ggplot</span>(<span class="kw">aes</span>(<span class="dt">x =</span> rating)) +
 <span class="st">  </span><span class="kw">geom_histogram</span>(<span class="dt">color =</span> <span class="st">&quot;white&quot;</span>, <span class="dt">bins =</span> <span class="dv">20</span>)</code></pre></div>
-<div class="figure" style="text-align: center"><span id="fig:unnamed-chunk-111"></span>
-<img src="ismaykim_files/figure-html/unnamed-chunk-111-1.png" alt="Sample ratings histogram" width="\textwidth" />
+<div class="figure" style="text-align: center"><span id="fig:unnamed-chunk-114"></span>
+<img src="ismaykim_files/figure-html/unnamed-chunk-114-1.png" alt="Sample ratings histogram" width="\textwidth" />
 <p class="caption">
 Figure 8.2: Sample ratings histogram
 </p>
@@ -446,8 +466,7 @@ <h2><span class="header-section-number">8.1</span> Bootstrapping</h2>
 <p>Note the use of the <code>( )</code> at the beginning and the end of this creation of the <code>movies_sample_mean</code> object. If you’d like to print out your newly created object, you can enclose it in the parentheses as we have here.</p>
 <p>This value of 5.894 is just one guess at the population mean. The idea behind <em>bootstrapping</em> is to sample <strong>with replacement</strong> from the original sample to create new <strong>resamples</strong> of the same size as our original sample.</p>
 <p>Returning to our example, let’s investigate what one such resample of the <code>movies_sample</code> data set accomplishes. We can create one resample/bootstrap sample by using the <code>resample</code> function in the <code>mosaic</code> package.</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">library</span>(mosaic)
-boot1 &lt;-<span class="st"> </span><span class="kw">resample</span>(movies_sample) %&gt;%
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">boot1 &lt;-<span class="st"> </span><span class="kw">resample</span>(movies_sample) %&gt;%
 <span class="st">  </span><span class="kw">arrange</span>(orig.id)</code></pre></div>
 <p>The important thing to note here is the original row numbers from the <code>movies_sample</code> data frame in the far right column called <code>orig.ids</code>. Since we are sampling with replacement, there is a strong likelihood that some of the 50 observational units are going to be selected again.</p>
 <p>You may be asking yourself what does this mean and how does this lead us to creating a distribution for the sample mean. Recall that the original sample mean of our data was calculated using the <code>summarize</code> function above.</p>
@@ -484,13 +503,13 @@ <h2><span class="header-section-number">8.1</span> Bootstrapping</h2>
 ## 9  6.098
 ## 10 5.608</code></pre>
 <p>You should see some variability begin to tease its way out here. Many of the simulated means will be close to our original sample mean but many will stray pretty far away. This occurs because outliers may have been selected a couple of times in the resampling or small values were selected more than larger. There are myriad reasons why this might be the case.</p>
-<p>So what’s the next step now? Just as we repeated the repetitions thousands of times with the “Lady Tasting Tea” example, we can do a similar thing here.</p>
+<p>So what’s the next step now? Just as we repeated the simulation thousands of times with the “Lady Tasting Tea” example, we can do a similar thing here:</p>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">trials &lt;-<span class="st"> </span><span class="kw">do</span>(<span class="dv">10000</span>) *<span class="st"> </span><span class="kw">summarize</span>(<span class="kw">resample</span>(movies_sample), 
                                 <span class="dt">mean =</span> <span class="kw">mean</span>(rating))
 <span class="kw">ggplot</span>(<span class="dt">data =</span> trials, <span class="dt">mapping =</span> <span class="kw">aes</span>(<span class="dt">x =</span> mean)) +
 <span class="st">  </span><span class="kw">geom_histogram</span>(<span class="dt">bins =</span> <span class="dv">30</span>, <span class="dt">color =</span> <span class="st">&quot;white&quot;</span>)</code></pre></div>
-<div class="figure" style="text-align: center"><span id="fig:unnamed-chunk-116"></span>
-<img src="ismaykim_files/figure-html/unnamed-chunk-116-1.png" alt="Bootstrapped means histogram" width="\textwidth" />
+<div class="figure" style="text-align: center"><span id="fig:unnamed-chunk-119"></span>
+<img src="ismaykim_files/figure-html/unnamed-chunk-119-1.png" alt="Bootstrapped means histogram" width="\textwidth" />
 <p class="caption">
 Figure 8.3: Bootstrapped means histogram
 </p>
@@ -501,7 +520,7 @@ <h2><span class="header-section-number">8.1</span> Bootstrapping</h2>
 ## 1 mean 5.456 6.296  0.95 percentile    5.894</code></pre>
 <p>It’s always important at this point to interpret the results of this confidence interval calculation. In this context, we can say something like the following:</p>
 <blockquote>
-<p>Based on the sample data and bootstrapping techniques, we can be 95% confident that the true mean rating of ALL IMDB ratings is between 5.456 and 6.296.</p>
+<p>Based on the sample data and bootstrapping techniques, we can be 95% confident that the true mean rating of <strong>ALL</strong> IMDB movies is between 5.456 and 6.296.</p>
 </blockquote>
 <p>This statement may seem a little confusing to you. Another way to think about this is that this confidence interval was constructed using the sample data by a procedure that is <strong>95% reliable.</strong> We will get invalid results 5% of the time. Just as we had a trade-off with <span class="math inline">\(\alpha\)</span> and <span class="math inline">\(\beta\)</span> with hypothesis tests, we have a similar trade-off here with setting the confidence level.</p>
 <p>To further reiterate this point, the graphic below from <span class="citation">Diez, Barr, and Çetinkaya-Rundel (<a href="#ref-isrs2014">2014</a>)</span> shows us that if we repeated a confidence interval process 25 times with 25 different samples, we would expect about 95% of them to actually contain the population parameter of interest. This parameter is marked with a dotted vertical line. We can see that only one confidence interval does not overlap with this value. (The one marked in red.) Therefore 24 in 25 (96%), which is quite close to our 95% reliability, do include the population parameter.</p>
@@ -512,24 +531,23 @@ <h2><span class="header-section-number">8.1</span> Bootstrapping</h2>
 </p>
 </div>
 <p>Remember that we are pretending like we don’t know what the mean IMDB rating for ALL movies is. Our population here is all of the movies listed in the <code>movies</code> data frame from <code>ggplot2movies</code>. So does our bootstrapped confidence interval here contain the actual mean value?</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">movies %&gt;%<span class="st"> </span><span class="kw">summarize</span>(<span class="dt">mean_rating =</span> <span class="kw">mean</span>(rating)) %&gt;%<span class="st"> </span>
-<span class="st">  </span><span class="kw">kable</span>()</code></pre></div>
-</div>
-<div id="mean_rating" class="section level2">
-<h2><span class="header-section-number">8.2</span> mean_rating</h2>
-<pre><code> 5.93285</code></pre>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">movies %&gt;%<span class="st"> </span><span class="kw">summarize</span>(<span class="dt">mean_rating =</span> <span class="kw">mean</span>(rating))</code></pre></div>
+<pre><code>## # A tibble: 1 × 1
+##   mean_rating
+##         &lt;dbl&gt;
+## 1       5.933</code></pre>
 <p>We see here that the population mean does fall in our range of plausible values generated from the bootstrapped samples.</p>
-<p>We can also get an idea of how the theory-based inference techniques would have approximated this confidence interval by using the formula <span class="math display">\[\bar{x} \pm (2 * SE),\]</span> where <span class="math inline">\(\bar{x}\)</span> is our original sample mean and <span class="math inline">\(SE\)</span> stands for <strong>standard error</strong> and corresponds to the standard deviation of the bootstrap distribution. The value of 2 here corresponds to it being a 95% confidence interval. This formula assumes that the bootstrap distribution is symmetric and bell-shaped. This is often the case with bootstrap distributions, especially those in which the original distribution of the sample is not highly skewed.</p>
+<p>We can also get an idea of how the theory-based inference techniques would have approximated this confidence interval by using the formula <span class="math display">\[\bar{x} \pm (2 * SE),\]</span> where <span class="math inline">\(\bar{x}\)</span> is our original sample mean and <span class="math inline">\(SE\)</span> stands for <strong>standard error</strong> and corresponds to the standard deviation of the bootstrap distribution. The value of 2 here corresponds to it being a 95% confidence interval. (About 95% of the values in a normal distribution fall within 2 standard deviations of the mean.) This formula assumes that the bootstrap distribution is symmetric and bell-shaped. This is often the case with bootstrap distributions, especially those in which the original distribution of the sample is not highly skewed.</p>
 <hr />
 <p><strong>Definition: standard error</strong></p>
-<p>The <em>standard error</em> is the standard deviation of the sampling distribution. The sampling distribution may be approximated by the bootstrap distribution or the null distribution depending on the context. Traditional theory-based methodologies for inference also have formulas for standard errors assuming some conditions are met.</p>
+<p>The <em>standard error</em> is the standard deviation of the sampling distribution. The sampling distribution may be approximated by the bootstrap distribution or the null distribution depending on the context. Traditional theory-based methodologies for inference also have formulas for standard errors, assuming some conditions are met.</p>
 <hr />
 <p>To compute this type of confidence interval, we only need to make a slight modification to the <code>confint</code> function seen above. (The expression after the <span class="math inline">\(\pm\)</span> sign is known as the <strong>margin of error</strong>.)</p>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">(cise_mean_rating &lt;-<span class="st"> </span><span class="kw">confint</span>(trials, <span class="dt">level =</span> <span class="fl">0.95</span>, <span class="dt">method =</span> <span class="st">&quot;stderr&quot;</span>))</code></pre></div>
-<pre><code>##   name    lower    upper level method estimate margin.of.error
-## 1 mean 5.468465 6.314277  0.95 stderr    5.894       0.4229063</code></pre>
+<pre><code>##   name lower upper level method estimate margin.of.error
+## 1 mean 5.468 6.314  0.95 stderr    5.894          0.4229</code></pre>
 <blockquote>
-<p>Based on the sample data and bootstrapping techniques, we can be 95% confident that the true mean rating of ALL IMDB ratings is between 5.4684649 and 6.3142775.</p>
+<p>Based on the sample data and bootstrapping techniques, we can be 95% confident that the true mean rating of ALL IMDB movies is between 5.468 and 6.314.</p>
 </blockquote>
 <hr />
 <div class="learncheck">
@@ -539,11 +557,11 @@ <h2><span class="header-section-number">8.2</span> mean_rating</h2>
 </div>
 <p><strong>(LC8.6)</strong> Reproduce the bootstrapping above using a sample of size 50 instead of 25. What changes do you see?</p>
 <p><strong>(LC8.7)</strong> Reproduce the bootstrapping above using a sample of size 5 instead of 25. What changes do you see?</p>
-<p><strong>(LC8.8)</strong> How does the sample size affect the analysis?</p>
+<p><strong>(LC8.8)</strong> How does the sample size affect the analysis above?</p>
 <p><strong>(LC8.9)</strong> Why must bootstrap samples be the same size as the original sample?</p>
 <hr />
 <div id="review-of-bootstrapping" class="section level3">
-<h3><span class="header-section-number">8.2.1</span> Review of Bootstrapping</h3>
+<h3><span class="header-section-number">8.1.1</span> Review of Bootstrapping</h3>
 <p>We can summarize the process to generate a bootstrap distribution here in a series of steps that clearly identify the terminology we will use <span class="citation">(R. Lock et al. <a href="#ref-lock2012">2012</a>)</span>.</p>
 <ul>
 <li>Generate <code>bootstrap samples</code> by sampling with replacement from the original sample, using the same sample size.</li>
@@ -560,16 +578,16 @@ <h3><span class="header-section-number">8.2.1</span> Review of Bootstrapping</h3
 </div>
 </div>
 <div id="relation-to-hypothesis-testing" class="section level2">
-<h2><span class="header-section-number">8.3</span> Relation to hypothesis testing</h2>
-<p>Recall that we found a statistically significant difference in the sample mean of romance movie ratings compared to the sample mean of action movie ratings. We concluded Chapter <a href="7-hypo.html#hypo">7</a> by attempted to understand just how much greater we could expect the <em>population</em> mean romance movie rating to be as compared to the <em>population</em> mean action movie rating. In order to do so, we will calculate a confidence interval for the difference <span class="math inline">\(\mu_r - \mu_a\)</span>. We’ll then go back to our population parameter values and see if our confidence interval contains our parameter value.</p>
+<h2><span class="header-section-number">8.2</span> Relation to hypothesis testing</h2>
+<p>Recall that we found a statistically significant difference in the sample mean of romance movie ratings compared to the sample mean of action movie ratings. We concluded Chapter <a href="7-hypo.html#hypo">7</a> by attempting to understand just how much greater we could expect the <em>population</em> mean romance movie rating to be compared to the <em>population</em> mean action movie rating. In order to do so, we will calculate a confidence interval for the difference <span class="math inline">\(\mu_r - \mu_a\)</span>. We’ll then go back to our population parameter values and see if our confidence interval contains our parameter value.</p>
 <p>We could use bootstrapping in a way similar to that done above, except now on a difference in sample means, to create a distribution and then use the <code>confint</code> function with the option of <code>quantile</code> to determine a confidence interval for the plausible values of the difference in population means. This is an excellent programming activity and the reader is urged to try to do so.</p>
 <p>Recall what the randomization/null distribution looked like for our simulated shuffled sample means:</p>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">library</span>(ggplot2)
 <span class="kw">library</span>(dplyr)
 <span class="kw">ggplot</span>(<span class="dt">data =</span> rand_distn, <span class="dt">mapping =</span> <span class="kw">aes</span>(<span class="dt">x =</span> diffmean)) +
 <span class="st">  </span><span class="kw">geom_histogram</span>(<span class="dt">color =</span> <span class="st">&quot;white&quot;</span>, <span class="dt">bins =</span> <span class="dv">20</span>)</code></pre></div>
-<div class="figure" style="text-align: center"><span id="fig:unnamed-chunk-120"></span>
-<img src="ismaykim_files/figure-html/unnamed-chunk-120-1.png" alt="Simulated shuffled sample means histogram" width="\textwidth" />
+<div class="figure" style="text-align: center"><span id="fig:unnamed-chunk-123"></span>
+<img src="ismaykim_files/figure-html/unnamed-chunk-123-1.png" alt="Simulated shuffled sample means histogram" width="\textwidth" />
 <p class="caption">
 Figure 8.6: Simulated shuffled sample means histogram
 </p>
@@ -577,21 +595,21 @@ <h2><span class="header-section-number">8.3</span> Relation to hypothesis testin
 <p>With this null distribution being quite symmetric and bell-shaped, the standard error method introduced above likely provides a good estimate of a range of plausible values for <span class="math inline">\(\mu_r - \mu_a\)</span>. Another nice option here is that we can use the standard deviation of the null/randomization distribution we just found with our hypothesis test.</p>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">(std_err &lt;-<span class="st"> </span>rand_distn %&gt;%<span class="st"> </span><span class="kw">summarize</span>(<span class="dt">se =</span> <span class="kw">sd</span>(diffmean)))</code></pre></div>
 <pre><code>## # A tibble: 1 × 1
-##           se
-##        &lt;dbl&gt;
-## 1 0.03182225</code></pre>
-<p>Remembering that we can use the general formula of <span class="math inline">\(statistic \pm (2 * SE)\)</span> we get the following result for plausible values of the difference in population means at the 95% level.</p>
+##        se
+##     &lt;dbl&gt;
+## 1 0.03182</code></pre>
+<p>We can use the general formula of <span class="math inline">\(statistic \pm (2 * SE)\)</span> for a confidence interval to obtain the following result for plausible values of the difference in population means at the 95% level.</p>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">(lower &lt;-<span class="st"> </span>obs_diff -<span class="st"> </span>(<span class="dv">2</span> *<span class="st"> </span>std_err))</code></pre></div>
-<pre><code>##          se
-## 1 0.7657673</code></pre>
+<pre><code>##       se
+## 1 0.7658</code></pre>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">(upper &lt;-<span class="st"> </span>obs_diff +<span class="st"> </span>(<span class="dv">2</span> *<span class="st"> </span>std_err))</code></pre></div>
-<pre><code>##          se
-## 1 0.8930563</code></pre>
+<pre><code>##       se
+## 1 0.8931</code></pre>
 <p>We can, therefore, say that we are 95% confident that the population mean rating for romance movies is between 0.766 and 0.893 points higher than for that of action movies.</p>
-<p>The important thing to check here is whether 0 is contained in the confidence interval. If it is, it is plausible that the difference in the two population means between the two groups is 0. This means that the null hypothesis is plausible. The results of the hypothesis test and the confidence interval should match as they do here. We rejected the null hypothesis with hypothesis testing and we have evidence here than the mean rating for romance movies is higher than for action movies.</p>
+<p>The important thing to check here is whether 0 is contained in the confidence interval. If it is, it is plausible that the difference between the two population means is 0, which would mean that the null hypothesis is plausible. The results of the hypothesis test and the confidence interval should match as they do here. We rejected the null hypothesis with hypothesis testing and, since 0 is not in our confidence interval, we also have evidence here that the mean rating for romance movies is higher than for action movies.</p>
 </div>
 <div id="effect-size" class="section level2">
-<h2><span class="header-section-number">8.4</span> Effect size</h2>
+<h2><span class="header-section-number">8.3</span> Effect size</h2>
 <p>The phrase <strong>effect size</strong> has been thrown around recently as an alternative to <span class="math inline">\(p\)</span>-values. In combination with the confidence interval, it can be often more valuable than just looking at the results of a hypothesis test. It depends on the scientific discipline exactly what is meant by “effect size” but, in general, it refers to <em>the magnitude of the difference between group measurements</em>. For our two sample problem involving movies, it is the observed difference in sample means <code>obs_diff</code>.</p>
 <p>It’s worthy of mention here that confidence intervals are always centered at the observed statistic. In other words, if you are looking at a confidence interval and someone asks you what the “effect size” is you can simply find the midpoint of the stated confidence interval.</p>
 <hr />
@@ -606,7 +624,7 @@ <h2><span class="header-section-number">8.4</span> Effect size</h2>
 <p><strong>(LC8.13)</strong> Why is a 95% confidence interval wider than a 90% confidence interval? Explain by using a concrete example from everyday life about what is meant by “confidence.”</p>
 <p><strong>(LC8.14)</strong> How would confidence intervals correspond to one-sided hypothesis tests?</p>
 <p><strong>(LC8.15)</strong> There is a relationship between the significance level and the confidence level. What do you think it is?</p>
-<p><strong>(LC8.16)</strong> The moment the phrase “standard error” is mentioned, there seems to be someone that says “The standard error is <span class="math inline">\(s\)</span> divided by the square root of <span class="math inline">\(n\)</span>.” This standard error formula is correct and used in the theory-based procedure for an inference on one mean. But… does it always work? For <code>samp1</code>, <code>samp2</code>, and <code>samp3</code> below, do the following:</p>
+<p><strong>(LC8.16)</strong> The moment the phrase “standard error” is mentioned, there seems to be someone who says “The standard error is <span class="math inline">\(s\)</span> divided by the square root of <span class="math inline">\(n\)</span>.” This standard error formula is used in the theory-based procedure for an inference on one mean. But… does it always work? For <code>samp1</code>, <code>samp2</code>, and <code>samp3</code> below, do the following:</p>
 <ol style="list-style-type: decimal">
 <li>produce a bootstrap distribution based on the sample</li>
 <li>calculate the standard deviation of the bootstrap distribution</li>
@@ -618,14 +636,17 @@ <h2><span class="header-section-number">8.4</span> Effect size</h2>
 <p>Describe how <span class="math inline">\(s / \sqrt{n}\)</span> does in approximating the standard error for these three samples and their corresponding bootstrap distributions.</p>
 <hr />
 </div>
-<div id="script-of-r-code-3" class="section level2">
-<h2><span class="header-section-number">8.5</span> Script of R code</h2>
+<div id="conclusion-4" class="section level2">
+<h2><span class="header-section-number">8.4</span> Conclusion</h2>
+<div id="script-of-r-code-4" class="section level3">
+<h3><span class="header-section-number">8.4.1</span> Script of R code</h3>
 <p>An R script file of all R code used in this chapter is available <a href="http://ismayc.github.io/moderndiver-book/08-ci.R">here</a>.</p>
 </div>
-<div id="whats-to-come-5" class="section level2">
-<h2><span class="header-section-number">8.6</span> What’s to come?</h2>
-<p>We will see in Chapter <a href="9-regression-via-broom.html#regress"><strong>??</strong></a> many of the same ideas we have seen with hypothesis testing and confidence intervals in the last two chapters. Regression is frequently associated both correctly and incorrectly with statistics and data analysis, so you’ll need to make sure you understand when it is appropriate and when it is not.</p>
+<div id="whats-to-come-5" class="section level3">
+<h3><span class="header-section-number">8.4.2</span> What’s to come?</h3>
+<p>We will see in Chapter <a href="9-regress.html#regress">9</a> many of the same ideas we have seen with hypothesis testing and confidence intervals in the last two chapters. Regression is frequently associated both correctly and incorrectly with statistics and data analysis, so you’ll need to make sure you understand when it is appropriate and when it is not.</p>
 
+</div>
 </div>
 </div>
 <h3>References</h3>
@@ -643,7 +664,7 @@ <h3>References</h3>
         </div>
       </div>
 <a href="7-hypo.html" class="navigation navigation-prev " aria-label="Previous page"><i class="fa fa-angle-left"></i></a>
-<a href="9-regression-via-broom.html" class="navigation navigation-next " aria-label="Next page""><i class="fa fa-angle-right"></i></a>
+<a href="9-regress.html" class="navigation navigation-next " aria-label="Next page"><i class="fa fa-angle-right"></i></a>
 
 <script src="libs/gitbook-2.6.7/js/app.min.js"></script>
 <script src="libs/gitbook-2.6.7/js/lunr.js"></script>
diff --git a/docs/9-regression-via-broom.html b/docs/9-regress.html
similarity index 67%
rename from docs/9-regression-via-broom.html
rename to docs/9-regress.html
index 8c1b88324..364c4b34e 100644
--- a/docs/9-regression-via-broom.html
+++ b/docs/9-regress.html
@@ -26,7 +26,7 @@
 <meta name="author" content="Chester Ismay and Albert Y. Kim">
 
 
-<meta name="date" content="2017-01-07">
+<meta name="date" content="2017-01-10">
 
   <meta name="viewport" content="width=device-width, initial-scale=1">
   <meta name="apple-mobile-web-app-capable" content="yes">
@@ -119,9 +119,9 @@
 
 <ul class="summary">
 <li class="chapter" data-level="1" data-path="index.html"><a href="index.html"><i class="fa fa-check"></i><b>1</b> Preamble</a><ul>
-<li class="chapter" data-level="1.1" data-path="index.html"><a href="index.html#principles-of-this-book"><i class="fa fa-check"></i><b>1.1</b> Principles of this Book</a></li>
+<li class="chapter" data-level="1.1" data-path="index.html"><a href="index.html#principles-of-this-book---for-instructors"><i class="fa fa-check"></i><b>1.1</b> Principles of this Book - For Instructors</a></li>
 <li class="chapter" data-level="1.2" data-path="index.html"><a href="index.html#contribute"><i class="fa fa-check"></i><b>1.2</b> Contribute</a></li>
-<li class="chapter" data-level="1.3" data-path="index.html"><a href="index.html#getting-started"><i class="fa fa-check"></i><b>1.3</b> Getting Started</a></li>
+<li class="chapter" data-level="1.3" data-path="index.html"><a href="index.html#getting-started---for-students"><i class="fa fa-check"></i><b>1.3</b> Getting Started - For Students</a></li>
 <li class="chapter" data-level="" data-path="index.html"><a href="index.html#colophon"><i class="fa fa-check"></i>Colophon</a></li>
 </ul></li>
 <li class="chapter" data-level="2" data-path="2-intro.html"><a href="2-intro.html"><i class="fa fa-check"></i><b>2</b> Introduction</a><ul>
@@ -133,87 +133,100 @@
 </ul></li>
 <li class="part"><span><b>I Data Exploration</b></span></li>
 <li class="chapter" data-level="3" data-path="3-tidy.html"><a href="3-tidy.html"><i class="fa fa-check"></i><b>3</b> Tidy Data</a><ul>
+<li class="chapter" data-level="" data-path="3-tidy.html"><a href="3-tidy.html#needed-packages"><i class="fa fa-check"></i>Needed packages</a></li>
 <li class="chapter" data-level="3.1" data-path="3-tidy.html"><a href="3-tidy.html#what-is-tidy-data"><i class="fa fa-check"></i><b>3.1</b> What is tidy data?</a></li>
-<li class="chapter" data-level="3.2" data-path="3-tidy.html"><a href="3-tidy.html#the-nycflights13-datasets"><i class="fa fa-check"></i><b>3.2</b> The <code>nycflights13</code> datasets</a></li>
-<li class="chapter" data-level="3.3" data-path="3-tidy.html"><a href="3-tidy.html#how-is-flights-tidy"><i class="fa fa-check"></i><b>3.3</b> How is <code>flights</code> tidy?</a></li>
+<li class="chapter" data-level="3.2" data-path="3-tidy.html"><a href="3-tidy.html#datasets-in-the-nycflights13-package"><i class="fa fa-check"></i><b>3.2</b> Datasets in the <code>nycflights13</code> package</a></li>
+<li class="chapter" data-level="3.3" data-path="3-tidy.html"><a href="3-tidy.html#how-is-flights-tidy"><i class="fa fa-check"></i><b>3.3</b> How is <code>flights</code> tidy?</a><ul>
+<li class="chapter" data-level="3.3.1" data-path="3-tidy.html"><a href="3-tidy.html#identification-variables"><i class="fa fa-check"></i><b>3.3.1</b> Identification variables</a></li>
+</ul></li>
 <li class="chapter" data-level="3.4" data-path="3-tidy.html"><a href="3-tidy.html#normal-forms-of-data"><i class="fa fa-check"></i><b>3.4</b> Normal forms of data</a></li>
 <li class="chapter" data-level="3.5" data-path="3-tidy.html"><a href="3-tidy.html#whats-to-come"><i class="fa fa-check"></i><b>3.5</b> What’s to come?</a></li>
 </ul></li>
-<li class="chapter" data-level="4" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html"><i class="fa fa-check"></i><b>4</b> Data Visualization via <code id="viz">ggplot2</code></a><ul>
-<li class="chapter" data-level="" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#needed-packages"><i class="fa fa-check"></i>Needed packages</a></li>
-<li class="chapter" data-level="4.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#grammarofgraphics"><i class="fa fa-check"></i><b>4.1</b> The Grammar of Graphics</a><ul>
-<li class="chapter" data-level="4.1.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#components-of-grammar"><i class="fa fa-check"></i><b>4.1.1</b> Components of Grammar</a></li>
-<li class="chapter" data-level="4.1.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#napoleans-march-on-moscow"><i class="fa fa-check"></i><b>4.1.2</b> Napolean’s March on Moscow</a></li>
-<li class="chapter" data-level="4.1.3" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#other-components-of-the-grammar"><i class="fa fa-check"></i><b>4.1.3</b> Other Components of the Grammar</a></li>
-<li class="chapter" data-level="4.1.4" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#the-ggplot2-package"><i class="fa fa-check"></i><b>4.1.4</b> The <code>ggplot2</code> Package</a></li>
-</ul></li>
-<li class="chapter" data-level="4.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#five-named-graphs---the-5ng"><i class="fa fa-check"></i><b>4.2</b> Five Named Graphs - The 5NG</a></li>
-<li class="chapter" data-level="4.3" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#scatterplots"><i class="fa fa-check"></i><b>4.3</b> 5NG#1: Scatter-plots</a><ul>
-<li class="chapter" data-level="4.3.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#scatter-plots-via-geom_point"><i class="fa fa-check"></i><b>4.3.1</b> Scatter-plots via <code id="geompoint">geom_point</code></a></li>
-<li class="chapter" data-level="4.3.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#over-plotting"><i class="fa fa-check"></i><b>4.3.2</b> Over-Plotting</a></li>
-<li class="chapter" data-level="4.3.3" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#summary"><i class="fa fa-check"></i><b>4.3.3</b> Summary</a></li>
-</ul></li>
-<li class="chapter" data-level="4.4" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#linegraphs"><i class="fa fa-check"></i><b>4.4</b> 5NG#2: Line-graphs</a><ul>
-<li class="chapter" data-level="4.4.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#line-graphs-via-geom_line"><i class="fa fa-check"></i><b>4.4.1</b> Line-graphs via <code id="geomline">geom_line</code></a></li>
-<li class="chapter" data-level="4.4.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#summary-1"><i class="fa fa-check"></i><b>4.4.2</b> Summary</a></li>
-</ul></li>
-<li class="chapter" data-level="4.5" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#histograms"><i class="fa fa-check"></i><b>4.5</b> 5NG#3: Histograms</a><ul>
-<li class="chapter" data-level="4.5.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#histograms-via-geom_histogram"><i class="fa fa-check"></i><b>4.5.1</b> Histograms via <code id="geomhistogram">geom_histogram</code></a></li>
-<li class="chapter" data-level="4.5.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#adjustbins"><i class="fa fa-check"></i><b>4.5.2</b> Adjusting the Bins</a></li>
-<li class="chapter" data-level="4.5.3" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#summary-2"><i class="fa fa-check"></i><b>4.5.3</b> Summary</a></li>
-</ul></li>
-<li class="chapter" data-level="4.6" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#facets"><i class="fa fa-check"></i><b>4.6</b> Facets</a></li>
-<li class="chapter" data-level="4.7" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#ng4-boxplots"><i class="fa fa-check"></i><b>4.7</b> 5NG#4: Boxplots</a><ul>
-<li class="chapter" data-level="4.7.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#boxplots-via-geom_boxplot"><i class="fa fa-check"></i><b>4.7.1</b> Boxplots via <code id="geomboxplot">geom_boxplot</code></a></li>
-<li class="chapter" data-level="4.7.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#summary-3"><i class="fa fa-check"></i><b>4.7.2</b> Summary</a></li>
-</ul></li>
-<li class="chapter" data-level="4.8" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#geombar"><i class="fa fa-check"></i><b>4.8</b> 5NG#5: Barplots</a><ul>
-<li class="chapter" data-level="4.8.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#barplots-via-geom_bar"><i class="fa fa-check"></i><b>4.8.1</b> Barplots via <code>geom_bar</code></a></li>
-<li class="chapter" data-level="4.8.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#must-avoid-pie-charts"><i class="fa fa-check"></i><b>4.8.2</b> Must avoid pie charts!</a></li>
-<li class="chapter" data-level="4.8.3" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#using-barplots-to-compare-two-variables"><i class="fa fa-check"></i><b>4.8.3</b> Using barplots to compare two variables</a></li>
-<li class="chapter" data-level="4.8.4" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#summary-4"><i class="fa fa-check"></i><b>4.8.4</b> Summary</a></li>
-</ul></li>
-<li class="chapter" data-level="4.9" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#conclusion"><i class="fa fa-check"></i><b>4.9</b> Conclusion</a><ul>
-<li class="chapter" data-level="4.9.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#whats-to-come-1"><i class="fa fa-check"></i><b>4.9.1</b> What’s to come?</a></li>
-<li class="chapter" data-level="4.9.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#script-of-r-code"><i class="fa fa-check"></i><b>4.9.2</b> Script of R code</a></li>
-</ul></li>
-</ul></li>
-<li class="chapter" data-level="5" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html"><i class="fa fa-check"></i><b>5</b> Data Manipulation via <code id="manip">dplyr</code></a><ul>
-<li class="chapter" data-level="" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#needed-packages-1"><i class="fa fa-check"></i>Needed packages</a></li>
-<li class="chapter" data-level="5.1" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#the-pipe"><i class="fa fa-check"></i><b>5.1</b> The pipe <code>%&gt;%</code></a></li>
-<li class="chapter" data-level="5.2" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#four-main-verbs---the-4mv"><i class="fa fa-check"></i><b>5.2</b> Four Main Verbs - The 4MV</a><ul>
-<li class="chapter" data-level="5.2.1" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#filter-observations-using-filter"><i class="fa fa-check"></i><b>5.2.1</b> Filter observations using <code id="filter">filter</code></a></li>
-<li class="chapter" data-level="5.2.2" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#summarize-variables-using-summarize"><i class="fa fa-check"></i><b>5.2.2</b> Summarize variables using <code>summarize</code></a></li>
-<li class="chapter" data-level="5.2.3" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#create-new-variableschange-old-variables-using-mutate"><i class="fa fa-check"></i><b>5.2.3</b> Create new variables/change old variables using <code>mutate</code></a></li>
-<li class="chapter" data-level="5.2.4" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#reorder-the-data-frame-using-arrange"><i class="fa fa-check"></i><b>5.2.4</b> Reorder the data frame using <code id="arrange">arrange</code></a></li>
-</ul></li>
-<li class="chapter" data-level="5.3" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#other-verbs"><i class="fa fa-check"></i><b>5.3</b> Other verbs</a><ul>
-<li class="chapter" data-level="5.3.1" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#select-variables-using-select"><i class="fa fa-check"></i><b>5.3.1</b> Select variables using <code id="select">select</code></a></li>
-<li class="chapter" data-level="5.3.2" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#rename-variables-using-rename"><i class="fa fa-check"></i><b>5.3.2</b> Rename variables using <code id="rename">rename</code></a></li>
-<li class="chapter" data-level="5.3.3" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#find-the-top-number-of-values-using-top_n"><i class="fa fa-check"></i><b>5.3.3</b> Find the top number of values using <code>top_n</code></a></li>
-</ul></li>
-<li class="chapter" data-level="5.4" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#joiningmerging-data-frames"><i class="fa fa-check"></i><b>5.4</b> Joining/merging data frames</a></li>
-<li class="chapter" data-level="5.5" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#script-of-r-code-1"><i class="fa fa-check"></i><b>5.5</b> Script of R code</a></li>
-<li class="chapter" data-level="5.6" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#whats-to-come-2"><i class="fa fa-check"></i><b>5.6</b> What’s to come?</a></li>
+<li class="chapter" data-level="4" data-path="4-viz.html"><a href="4-viz.html"><i class="fa fa-check"></i><b>4</b> Data Visualization via ggplot2</a><ul>
+<li class="chapter" data-level="" data-path="4-viz.html"><a href="4-viz.html#needed-packages-1"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="4.1" data-path="4-viz.html"><a href="4-viz.html#grammarofgraphics"><i class="fa fa-check"></i><b>4.1</b> The Grammar of Graphics</a><ul>
+<li class="chapter" data-level="4.1.1" data-path="4-viz.html"><a href="4-viz.html#components-of-grammar"><i class="fa fa-check"></i><b>4.1.1</b> Components of Grammar</a></li>
+<li class="chapter" data-level="4.1.2" data-path="4-viz.html"><a href="4-viz.html#napoleans-march-on-moscow"><i class="fa fa-check"></i><b>4.1.2</b> Napoleon’s March on Moscow</a></li>
+<li class="chapter" data-level="4.1.3" data-path="4-viz.html"><a href="4-viz.html#other-components-of-the-grammar"><i class="fa fa-check"></i><b>4.1.3</b> Other Components of the Grammar</a></li>
+<li class="chapter" data-level="4.1.4" data-path="4-viz.html"><a href="4-viz.html#the-ggplot2-package"><i class="fa fa-check"></i><b>4.1.4</b> The ggplot2 Package</a></li>
+</ul></li>
+<li class="chapter" data-level="4.2" data-path="4-viz.html"><a href="4-viz.html#five-named-graphs---the-5ng"><i class="fa fa-check"></i><b>4.2</b> Five Named Graphs - The 5NG</a></li>
+<li class="chapter" data-level="4.3" data-path="4-viz.html"><a href="4-viz.html#scatterplots"><i class="fa fa-check"></i><b>4.3</b> 5NG#1: Scatter-plots</a><ul>
+<li class="chapter" data-level="4.3.1" data-path="4-viz.html"><a href="4-viz.html#geompoint"><i class="fa fa-check"></i><b>4.3.1</b> Scatter-plots via geom_point</a></li>
+<li class="chapter" data-level="4.3.2" data-path="4-viz.html"><a href="4-viz.html#over-plotting"><i class="fa fa-check"></i><b>4.3.2</b> Over-Plotting</a></li>
+<li class="chapter" data-level="4.3.3" data-path="4-viz.html"><a href="4-viz.html#summary"><i class="fa fa-check"></i><b>4.3.3</b> Summary</a></li>
+</ul></li>
+<li class="chapter" data-level="4.4" data-path="4-viz.html"><a href="4-viz.html#linegraphs"><i class="fa fa-check"></i><b>4.4</b> 5NG#2: Line-graphs</a><ul>
+<li class="chapter" data-level="4.4.1" data-path="4-viz.html"><a href="4-viz.html#geomline"><i class="fa fa-check"></i><b>4.4.1</b> Line-graphs via geom_line</a></li>
+<li class="chapter" data-level="4.4.2" data-path="4-viz.html"><a href="4-viz.html#summary-1"><i class="fa fa-check"></i><b>4.4.2</b> Summary</a></li>
+</ul></li>
+<li class="chapter" data-level="4.5" data-path="4-viz.html"><a href="4-viz.html#histograms"><i class="fa fa-check"></i><b>4.5</b> 5NG#3: Histograms</a><ul>
+<li class="chapter" data-level="4.5.1" data-path="4-viz.html"><a href="4-viz.html#geomhistogram"><i class="fa fa-check"></i><b>4.5.1</b> Histograms via geom_histogram</a></li>
+<li class="chapter" data-level="4.5.2" data-path="4-viz.html"><a href="4-viz.html#adjustbins"><i class="fa fa-check"></i><b>4.5.2</b> Adjusting the Bins</a></li>
+<li class="chapter" data-level="4.5.3" data-path="4-viz.html"><a href="4-viz.html#summary-2"><i class="fa fa-check"></i><b>4.5.3</b> Summary</a></li>
+</ul></li>
+<li class="chapter" data-level="4.6" data-path="4-viz.html"><a href="4-viz.html#facets"><i class="fa fa-check"></i><b>4.6</b> Facets</a></li>
+<li class="chapter" data-level="4.7" data-path="4-viz.html"><a href="4-viz.html#ng4-boxplots"><i class="fa fa-check"></i><b>4.7</b> 5NG#4: Boxplots</a><ul>
+<li class="chapter" data-level="4.7.1" data-path="4-viz.html"><a href="4-viz.html#geomboxplot"><i class="fa fa-check"></i><b>4.7.1</b> Boxplots via geom_boxplot</a></li>
+<li class="chapter" data-level="4.7.2" data-path="4-viz.html"><a href="4-viz.html#summary-3"><i class="fa fa-check"></i><b>4.7.2</b> Summary</a></li>
+</ul></li>
+<li class="chapter" data-level="4.8" data-path="4-viz.html"><a href="4-viz.html#geombar"><i class="fa fa-check"></i><b>4.8</b> 5NG#5: Barplots</a><ul>
+<li class="chapter" data-level="4.8.1" data-path="4-viz.html"><a href="4-viz.html#barplots-via-geom_bar"><i class="fa fa-check"></i><b>4.8.1</b> Barplots via geom_bar</a></li>
+<li class="chapter" data-level="4.8.2" data-path="4-viz.html"><a href="4-viz.html#must-avoid-pie-charts"><i class="fa fa-check"></i><b>4.8.2</b> Must avoid pie charts!</a></li>
+<li class="chapter" data-level="4.8.3" data-path="4-viz.html"><a href="4-viz.html#using-barplots-to-compare-two-variables"><i class="fa fa-check"></i><b>4.8.3</b> Using barplots to compare two variables</a></li>
+<li class="chapter" data-level="4.8.4" data-path="4-viz.html"><a href="4-viz.html#summary-4"><i class="fa fa-check"></i><b>4.8.4</b> Summary</a></li>
+</ul></li>
+<li class="chapter" data-level="4.9" data-path="4-viz.html"><a href="4-viz.html#conclusion"><i class="fa fa-check"></i><b>4.9</b> Conclusion</a><ul>
+<li class="chapter" data-level="4.9.1" data-path="4-viz.html"><a href="4-viz.html#resources"><i class="fa fa-check"></i><b>4.9.1</b> Resources</a></li>
+<li class="chapter" data-level="4.9.2" data-path="4-viz.html"><a href="4-viz.html#script-of-r-code"><i class="fa fa-check"></i><b>4.9.2</b> Script of R code</a></li>
+<li class="chapter" data-level="4.9.3" data-path="4-viz.html"><a href="4-viz.html#whats-to-come-1"><i class="fa fa-check"></i><b>4.9.3</b> What’s to come?</a></li>
+</ul></li>
+</ul></li>
+<li class="chapter" data-level="5" data-path="5-manip.html"><a href="5-manip.html"><i class="fa fa-check"></i><b>5</b> Data Manipulation via dplyr</a><ul>
+<li class="chapter" data-level="" data-path="5-manip.html"><a href="5-manip.html#needed-packages-2"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="5.1" data-path="5-manip.html"><a href="5-manip.html#the-pipe"><i class="fa fa-check"></i><b>5.1</b> The pipe <code>%&gt;%</code></a></li>
+<li class="chapter" data-level="5.2" data-path="5-manip.html"><a href="5-manip.html#five-main-verbs---the-5mv"><i class="fa fa-check"></i><b>5.2</b> Five Main Verbs - The 5MV</a><ul>
+<li class="chapter" data-level="5.2.1" data-path="5-manip.html"><a href="5-manip.html#filter"><i class="fa fa-check"></i><b>5.2.1</b> 5MV#1: Filter observations using filter</a></li>
+<li class="chapter" data-level="5.2.2" data-path="5-manip.html"><a href="5-manip.html#mv2-summarize-variables-using-summarize"><i class="fa fa-check"></i><b>5.2.2</b> 5MV#2: Summarize variables using summarize</a></li>
+<li class="chapter" data-level="5.2.3" data-path="5-manip.html"><a href="5-manip.html#mv3-group-rows-using-group_by"><i class="fa fa-check"></i><b>5.2.3</b> 5MV#3: Group rows using group_by</a></li>
+<li class="chapter" data-level="5.2.4" data-path="5-manip.html"><a href="5-manip.html#mv4-create-new-variableschange-old-variables-using-mutate"><i class="fa fa-check"></i><b>5.2.4</b> 5MV#4: Create new variables/change old variables using mutate</a></li>
+<li class="chapter" data-level="5.2.5" data-path="5-manip.html"><a href="5-manip.html#arrange"><i class="fa fa-check"></i><b>5.2.5</b> 5MV#5: Reorder the data frame using arrange</a></li>
+</ul></li>
+<li class="chapter" data-level="5.3" data-path="5-manip.html"><a href="5-manip.html#joining-data-frames"><i class="fa fa-check"></i><b>5.3</b> Joining data frames</a><ul>
+<li class="chapter" data-level="5.3.1" data-path="5-manip.html"><a href="5-manip.html#joining-by-key-variables"><i class="fa fa-check"></i><b>5.3.1</b> Joining by Key Variables</a></li>
+<li class="chapter" data-level="5.3.2" data-path="5-manip.html"><a href="5-manip.html#joining-by-key-variables-with-different-names"><i class="fa fa-check"></i><b>5.3.2</b> Joining by Key Variables with Different Names</a></li>
+</ul></li>
+<li class="chapter" data-level="5.4" data-path="5-manip.html"><a href="5-manip.html#optional-other-verbs"><i class="fa fa-check"></i><b>5.4</b> Optional: Other verbs</a><ul>
+<li class="chapter" data-level="5.4.1" data-path="5-manip.html"><a href="5-manip.html#select"><i class="fa fa-check"></i><b>5.4.1</b> Select variables using select</a></li>
+<li class="chapter" data-level="5.4.2" data-path="5-manip.html"><a href="5-manip.html#rename"><i class="fa fa-check"></i><b>5.4.2</b> Rename variables using rename</a></li>
+<li class="chapter" data-level="5.4.3" data-path="5-manip.html"><a href="5-manip.html#find-the-top-number-of-values-using-top_n"><i class="fa fa-check"></i><b>5.4.3</b> Find the top number of values using top_n</a></li>
+</ul></li>
+<li class="chapter" data-level="5.5" data-path="5-manip.html"><a href="5-manip.html#conclusion-1"><i class="fa fa-check"></i><b>5.5</b> Conclusion</a><ul>
+<li class="chapter" data-level="5.5.1" data-path="5-manip.html"><a href="5-manip.html#resources-1"><i class="fa fa-check"></i><b>5.5.1</b> Resources</a></li>
+<li class="chapter" data-level="5.5.2" data-path="5-manip.html"><a href="5-manip.html#script-of-r-code-1"><i class="fa fa-check"></i><b>5.5.2</b> Script of R code</a></li>
+<li class="chapter" data-level="5.5.3" data-path="5-manip.html"><a href="5-manip.html#whats-to-come-2"><i class="fa fa-check"></i><b>5.5.3</b> What’s to come?</a></li>
+</ul></li>
 </ul></li>
 <li class="part"><span><b>II Inference</b></span></li>
-<li class="chapter" data-level="6" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html"><i class="fa fa-check"></i><b>6</b> Simulating Randomness via <code id="sim">mosaic</code></a><ul>
-<li class="chapter" data-level="" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#needed-packages-2"><i class="fa fa-check"></i>Needed packages</a></li>
-<li class="chapter" data-level="6.1" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#random-sampling"><i class="fa fa-check"></i><b>6.1</b> Random sampling</a><ul>
-<li class="chapter" data-level="6.1.1" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#tasting-soup"><i class="fa fa-check"></i><b>6.1.1</b> Tasting soup</a></li>
-<li class="chapter" data-level="6.1.2" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#common-terms"><i class="fa fa-check"></i><b>6.1.2</b> Common terms</a></li>
+<li class="chapter" data-level="6" data-path="6-sim.html"><a href="6-sim.html"><i class="fa fa-check"></i><b>6</b> Simulating Randomness via mosaic</a><ul>
+<li class="chapter" data-level="" data-path="6-sim.html"><a href="6-sim.html#needed-packages-3"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="6.1" data-path="6-sim.html"><a href="6-sim.html#random-sampling"><i class="fa fa-check"></i><b>6.1</b> Random sampling</a><ul>
+<li class="chapter" data-level="6.1.1" data-path="6-sim.html"><a href="6-sim.html#tasting-soup"><i class="fa fa-check"></i><b>6.1.1</b> Tasting soup</a></li>
+<li class="chapter" data-level="6.1.2" data-path="6-sim.html"><a href="6-sim.html#common-terms"><i class="fa fa-check"></i><b>6.1.2</b> Common terms</a></li>
+</ul></li>
+<li class="chapter" data-level="6.2" data-path="6-sim.html"><a href="6-sim.html#visualizing-sampling"><i class="fa fa-check"></i><b>6.2</b> Visualizing sampling</a><ul>
+<li class="chapter" data-level="6.2.1" data-path="6-sim.html"><a href="6-sim.html#sampling-distribution"><i class="fa fa-check"></i><b>6.2.1</b> Sampling distribution</a></li>
+<li class="chapter" data-level="6.2.2" data-path="6-sim.html"><a href="6-sim.html#repeated-sampling-via-do"><i class="fa fa-check"></i><b>6.2.2</b> Repeated sampling via <code>do</code></a></li>
 </ul></li>
-<li class="chapter" data-level="6.2" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#visualizing-sampling"><i class="fa fa-check"></i><b>6.2</b> Visualizing sampling</a><ul>
-<li class="chapter" data-level="6.2.1" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#sampling-distribution"><i class="fa fa-check"></i><b>6.2.1</b> Sampling distribution</a></li>
-<li class="chapter" data-level="6.2.2" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#repeated-sampling-via-do"><i class="fa fa-check"></i><b>6.2.2</b> Repeated sampling via <code>do</code></a></li>
+<li class="chapter" data-level="6.3" data-path="6-sim.html"><a href="6-sim.html#simulation"><i class="fa fa-check"></i><b>6.3</b> Simulation</a></li>
+<li class="chapter" data-level="6.4" data-path="6-sim.html"><a href="6-sim.html#review-of-mosaic-simulation-functions"><i class="fa fa-check"></i><b>6.4</b> Review of <code>mosaic</code> simulation functions</a></li>
+<li class="chapter" data-level="6.5" data-path="6-sim.html"><a href="6-sim.html#conclusion-2"><i class="fa fa-check"></i><b>6.5</b> Conclusion</a><ul>
+<li class="chapter" data-level="6.5.1" data-path="6-sim.html"><a href="6-sim.html#script-of-r-code-2"><i class="fa fa-check"></i><b>6.5.1</b> Script of R code</a></li>
+<li class="chapter" data-level="6.5.2" data-path="6-sim.html"><a href="6-sim.html#whats-to-come-3"><i class="fa fa-check"></i><b>6.5.2</b> What’s to come?</a></li>
 </ul></li>
-<li class="chapter" data-level="6.3" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#simulation"><i class="fa fa-check"></i><b>6.3</b> Simulation</a></li>
-<li class="chapter" data-level="6.4" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#review-of-mosaic-simulation-functions"><i class="fa fa-check"></i><b>6.4</b> Review of <code>mosaic</code> simulation functions</a></li>
-<li class="chapter" data-level="6.5" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#script-of-r-code-2"><i class="fa fa-check"></i><b>6.5</b> Script of R code</a></li>
-<li class="chapter" data-level="6.6" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#whats-to-come-3"><i class="fa fa-check"></i><b>6.6</b> What’s to come?</a></li>
 </ul></li>
 <li class="chapter" data-level="7" data-path="7-hypo.html"><a href="7-hypo.html"><i class="fa fa-check"></i><b>7</b> Hypothesis Testing</a><ul>
-<li class="chapter" data-level="" data-path="7-hypo.html"><a href="7-hypo.html#needed-packages-3"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="" data-path="7-hypo.html"><a href="7-hypo.html#needed-packages-4"><i class="fa fa-check"></i>Needed packages</a></li>
 <li class="chapter" data-level="7.1" data-path="7-hypo.html"><a href="7-hypo.html#when-inference-is-not-needed"><i class="fa fa-check"></i><b>7.1</b> When Inference Is Not Needed</a></li>
 <li class="chapter" data-level="7.2" data-path="7-hypo.html"><a href="7-hypo.html#basics-of-hypothesis-testing"><i class="fa fa-check"></i><b>7.2</b> Basics of Hypothesis Testing</a></li>
 <li class="chapter" data-level="7.3" data-path="7-hypo.html"><a href="7-hypo.html#trial"><i class="fa fa-check"></i><b>7.3</b> Criminal trial analogy</a><ul>
@@ -249,44 +262,50 @@
 <li class="chapter" data-level="7.8.1" data-path="7-hypo.html"><a href="7-hypo.html#example-t-test-for-two-independent-samples"><i class="fa fa-check"></i><b>7.8.1</b> EXAMPLE: <span class="math inline">\(t\)</span>-test for two independent samples</a></li>
 <li class="chapter" data-level="7.8.2" data-path="7-hypo.html"><a href="7-hypo.html#conditions-for-t-test"><i class="fa fa-check"></i><b>7.8.2</b> Conditions for t-test</a></li>
 </ul></li>
-<li class="chapter" data-level="7.9" data-path="7-hypo.html"><a href="7-hypo.html#whats-to-come-4"><i class="fa fa-check"></i><b>7.9</b> What’s to come?</a></li>
+<li class="chapter" data-level="7.9" data-path="7-hypo.html"><a href="7-hypo.html#conclusion-3"><i class="fa fa-check"></i><b>7.9</b> Conclusion</a><ul>
+<li class="chapter" data-level="7.9.1" data-path="7-hypo.html"><a href="7-hypo.html#script-of-r-code-3"><i class="fa fa-check"></i><b>7.9.1</b> Script of R code</a></li>
+<li class="chapter" data-level="7.9.2" data-path="7-hypo.html"><a href="7-hypo.html#whats-to-come-4"><i class="fa fa-check"></i><b>7.9.2</b> What’s to come?</a></li>
+</ul></li>
 </ul></li>
 <li class="chapter" data-level="8" data-path="8-ci.html"><a href="8-ci.html"><i class="fa fa-check"></i><b>8</b> Confidence Intervals</a><ul>
-<li class="chapter" data-level="" data-path="8-ci.html"><a href="8-ci.html#needed-packages-4"><i class="fa fa-check"></i>Needed packages</a></li>
-<li class="chapter" data-level="8.1" data-path="8-ci.html"><a href="8-ci.html#bootstrapping"><i class="fa fa-check"></i><b>8.1</b> Bootstrapping</a></li>
-<li class="chapter" data-level="8.2" data-path="8-ci.html"><a href="8-ci.html#mean_rating"><i class="fa fa-check"></i><b>8.2</b> mean_rating</a><ul>
-<li class="chapter" data-level="8.2.1" data-path="8-ci.html"><a href="8-ci.html#review-of-bootstrapping"><i class="fa fa-check"></i><b>8.2.1</b> Review of Bootstrapping</a></li>
-</ul></li>
-<li class="chapter" data-level="8.3" data-path="8-ci.html"><a href="8-ci.html#relation-to-hypothesis-testing"><i class="fa fa-check"></i><b>8.3</b> Relation to hypothesis testing</a></li>
-<li class="chapter" data-level="8.4" data-path="8-ci.html"><a href="8-ci.html#effect-size"><i class="fa fa-check"></i><b>8.4</b> Effect size</a></li>
-<li class="chapter" data-level="8.5" data-path="8-ci.html"><a href="8-ci.html#script-of-r-code-3"><i class="fa fa-check"></i><b>8.5</b> Script of R code</a></li>
-<li class="chapter" data-level="8.6" data-path="8-ci.html"><a href="8-ci.html#whats-to-come-5"><i class="fa fa-check"></i><b>8.6</b> What’s to come?</a></li>
-</ul></li>
-<li class="chapter" data-level="9" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html"><i class="fa fa-check"></i><b>9</b> Regression via <code id="regress">broom</code></a><ul>
-<li class="chapter" data-level="" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#needed-packages-5"><i class="fa fa-check"></i>Needed packages</a></li>
-<li class="chapter" data-level="9.1" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#example-alaskan-airlines-delays"><i class="fa fa-check"></i><b>9.1</b> EXAMPLE: Alaskan Airlines delays</a></li>
-<li class="chapter" data-level="9.2" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#correlation"><i class="fa fa-check"></i><b>9.2</b> Correlation</a><ul>
-<li class="chapter" data-level="9.2.1" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#correlation-does-not-imply-causation"><i class="fa fa-check"></i><b>9.2.1</b> Correlation does not imply causation</a></li>
-</ul></li>
-<li class="chapter" data-level="9.3" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#linear-regression"><i class="fa fa-check"></i><b>9.3</b> Linear regression</a><ul>
-<li class="chapter" data-level="9.3.1" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#understanding-linear-regression-basics"><i class="fa fa-check"></i><b>9.3.1</b> Understanding linear regression basics</a></li>
-<li class="chapter" data-level="9.3.2" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#the-equation-of-the-line"><i class="fa fa-check"></i><b>9.3.2</b> The equation of the line</a></li>
-<li class="chapter" data-level="9.3.3" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#interpretting-the-slope"><i class="fa fa-check"></i><b>9.3.3</b> Interpretting the slope</a></li>
-<li class="chapter" data-level="9.3.4" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#predicting-values"><i class="fa fa-check"></i><b>9.3.4</b> Predicting values</a></li>
-</ul></li>
-<li class="chapter" data-level="9.4" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#inference-for-regression"><i class="fa fa-check"></i><b>9.4</b> Inference for regression</a><ul>
-<li class="chapter" data-level="9.4.1" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#data-2"><i class="fa fa-check"></i><b>9.4.1</b> Data</a></li>
-<li class="chapter" data-level="9.4.2" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#test-statistic-delta-2"><i class="fa fa-check"></i><b>9.4.2</b> Test Statistic <span class="math inline">\(\delta\)</span></a></li>
-<li class="chapter" data-level="9.4.3" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#observed-effect-delta-2"><i class="fa fa-check"></i><b>9.4.3</b> Observed effect <span class="math inline">\(\delta^*\)</span></a></li>
-<li class="chapter" data-level="9.4.4" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#model-of-h_0-2"><i class="fa fa-check"></i><b>9.4.4</b> Model of <span class="math inline">\(H_0\)</span></a></li>
-<li class="chapter" data-level="9.4.5" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#simulated-data-2"><i class="fa fa-check"></i><b>9.4.5</b> Simulated Data</a></li>
-<li class="chapter" data-level="9.4.6" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#distribution-of-delta-under-h_0-2"><i class="fa fa-check"></i><b>9.4.6</b> Distribution of <span class="math inline">\(\delta\)</span> under <span class="math inline">\(H_0\)</span></a></li>
-<li class="chapter" data-level="9.4.7" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#the-p-value-2"><i class="fa fa-check"></i><b>9.4.7</b> The p-value</a></li>
-</ul></li>
-<li class="chapter" data-level="9.5" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#resid"><i class="fa fa-check"></i><b>9.5</b> Residual analysis</a></li>
-<li class="chapter" data-level="9.6" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#conditions-for-regression"><i class="fa fa-check"></i><b>9.6</b> Conditions for regression</a></li>
-<li class="chapter" data-level="9.7" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#script-of-r-code-4"><i class="fa fa-check"></i><b>9.7</b> Script of R code</a></li>
-<li class="chapter" data-level="9.8" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#whats-to-come-6"><i class="fa fa-check"></i><b>9.8</b> What’s to come?</a></li>
+<li class="chapter" data-level="" data-path="8-ci.html"><a href="8-ci.html#needed-packages-5"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="8.1" data-path="8-ci.html"><a href="8-ci.html#bootstrapping"><i class="fa fa-check"></i><b>8.1</b> Bootstrapping</a><ul>
+<li class="chapter" data-level="8.1.1" data-path="8-ci.html"><a href="8-ci.html#review-of-bootstrapping"><i class="fa fa-check"></i><b>8.1.1</b> Review of Bootstrapping</a></li>
+</ul></li>
+<li class="chapter" data-level="8.2" data-path="8-ci.html"><a href="8-ci.html#relation-to-hypothesis-testing"><i class="fa fa-check"></i><b>8.2</b> Relation to hypothesis testing</a></li>
+<li class="chapter" data-level="8.3" data-path="8-ci.html"><a href="8-ci.html#effect-size"><i class="fa fa-check"></i><b>8.3</b> Effect size</a></li>
+<li class="chapter" data-level="8.4" data-path="8-ci.html"><a href="8-ci.html#conclusion-4"><i class="fa fa-check"></i><b>8.4</b> Conclusion</a><ul>
+<li class="chapter" data-level="8.4.1" data-path="8-ci.html"><a href="8-ci.html#script-of-r-code-4"><i class="fa fa-check"></i><b>8.4.1</b> Script of R code</a></li>
+<li class="chapter" data-level="8.4.2" data-path="8-ci.html"><a href="8-ci.html#whats-to-come-5"><i class="fa fa-check"></i><b>8.4.2</b> What’s to come?</a></li>
+</ul></li>
+</ul></li>
+<li class="chapter" data-level="9" data-path="9-regress.html"><a href="9-regress.html"><i class="fa fa-check"></i><b>9</b> Regression via broom</a><ul>
+<li class="chapter" data-level="" data-path="9-regress.html"><a href="9-regress.html#needed-packages-6"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="9.1" data-path="9-regress.html"><a href="9-regress.html#example-alaskan-airlines-delays"><i class="fa fa-check"></i><b>9.1</b> EXAMPLE: Alaskan Airlines delays</a></li>
+<li class="chapter" data-level="9.2" data-path="9-regress.html"><a href="9-regress.html#correlation"><i class="fa fa-check"></i><b>9.2</b> Correlation</a><ul>
+<li class="chapter" data-level="9.2.1" data-path="9-regress.html"><a href="9-regress.html#correlation-does-not-imply-causation"><i class="fa fa-check"></i><b>9.2.1</b> Correlation does not imply causation</a></li>
+</ul></li>
+<li class="chapter" data-level="9.3" data-path="9-regress.html"><a href="9-regress.html#linear-regression"><i class="fa fa-check"></i><b>9.3</b> Linear regression</a><ul>
+<li class="chapter" data-level="9.3.1" data-path="9-regress.html"><a href="9-regress.html#understanding-linear-regression-basics"><i class="fa fa-check"></i><b>9.3.1</b> Understanding linear regression basics</a></li>
+<li class="chapter" data-level="9.3.2" data-path="9-regress.html"><a href="9-regress.html#the-equation-of-the-line"><i class="fa fa-check"></i><b>9.3.2</b> The equation of the line</a></li>
+<li class="chapter" data-level="9.3.3" data-path="9-regress.html"><a href="9-regress.html#interpreting-the-slope"><i class="fa fa-check"></i><b>9.3.3</b> Interpreting the slope</a></li>
+<li class="chapter" data-level="9.3.4" data-path="9-regress.html"><a href="9-regress.html#predicting-values"><i class="fa fa-check"></i><b>9.3.4</b> Predicting values</a></li>
+</ul></li>
+<li class="chapter" data-level="9.4" data-path="9-regress.html"><a href="9-regress.html#inference-for-regression"><i class="fa fa-check"></i><b>9.4</b> Inference for regression</a><ul>
+<li class="chapter" data-level="9.4.1" data-path="9-regress.html"><a href="9-regress.html#data-2"><i class="fa fa-check"></i><b>9.4.1</b> Data</a></li>
+<li class="chapter" data-level="9.4.2" data-path="9-regress.html"><a href="9-regress.html#test-statistic-delta-2"><i class="fa fa-check"></i><b>9.4.2</b> Test Statistic <span class="math inline">\(\delta\)</span></a></li>
+<li class="chapter" data-level="9.4.3" data-path="9-regress.html"><a href="9-regress.html#observed-effect-delta-2"><i class="fa fa-check"></i><b>9.4.3</b> Observed effect <span class="math inline">\(\delta^*\)</span></a></li>
+<li class="chapter" data-level="9.4.4" data-path="9-regress.html"><a href="9-regress.html#model-of-h_0-2"><i class="fa fa-check"></i><b>9.4.4</b> Model of <span class="math inline">\(H_0\)</span></a></li>
+<li class="chapter" data-level="9.4.5" data-path="9-regress.html"><a href="9-regress.html#simulated-data-2"><i class="fa fa-check"></i><b>9.4.5</b> Simulated Data</a></li>
+<li class="chapter" data-level="9.4.6" data-path="9-regress.html"><a href="9-regress.html#distribution-of-delta-under-h_0-2"><i class="fa fa-check"></i><b>9.4.6</b> Distribution of <span class="math inline">\(\delta\)</span> under <span class="math inline">\(H_0\)</span></a></li>
+<li class="chapter" data-level="9.4.7" data-path="9-regress.html"><a href="9-regress.html#the-p-value-2"><i class="fa fa-check"></i><b>9.4.7</b> The p-value</a></li>
+</ul></li>
+<li class="chapter" data-level="9.5" data-path="9-regress.html"><a href="9-regress.html#resid"><i class="fa fa-check"></i><b>9.5</b> Residual analysis</a></li>
+<li class="chapter" data-level="9.6" data-path="9-regress.html"><a href="9-regress.html#conditions-for-regression"><i class="fa fa-check"></i><b>9.6</b> Conditions for regression</a></li>
+<li class="chapter" data-level="9.7" data-path="9-regress.html"><a href="9-regress.html#conclusion-5"><i class="fa fa-check"></i><b>9.7</b> Conclusion</a><ul>
+<li class="chapter" data-level="9.7.1" data-path="9-regress.html"><a href="9-regress.html#script-of-r-code-5"><i class="fa fa-check"></i><b>9.7.1</b> Script of R code</a></li>
+<li class="chapter" data-level="9.7.2" data-path="9-regress.html"><a href="9-regress.html#whats-to-come-6"><i class="fa fa-check"></i><b>9.7.2</b> What’s to come?</a></li>
+</ul></li>
 </ul></li>
 <li class="part"><span><b>III Conclusion</b></span></li>
 <li class="chapter" data-level="10" data-path="10-effective-data-storytelling.html"><a href="10-effective-data-storytelling.html"><i class="fa fa-check"></i><b>10</b> Effective Data Storytelling</a><ul>
@@ -304,7 +323,8 @@
 </ul></li>
 </ul></li>
 <li class="chapter" data-level="B" data-path="B-appendixB.html"><a href="B-appendixB.html"><i class="fa fa-check"></i><b>B</b> Inference Examples</a><ul>
-<li class="chapter" data-level="B.1" data-path="B-appendixB.html"><a href="B-appendixB.html#needed-packages-6"><i class="fa fa-check"></i><b>B.1</b> Needed packages</a></li>
+<li class="chapter" data-level="" data-path="B-appendixB.html"><a href="B-appendixB.html#needed-packages-7"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="B.1" data-path="B-appendixB.html"><a href="B-appendixB.html#inference-mind-map"><i class="fa fa-check"></i><b>B.1</b> Inference Mind Map</a></li>
 <li class="chapter" data-level="B.2" data-path="B-appendixB.html"><a href="B-appendixB.html#one-mean"><i class="fa fa-check"></i><b>B.2</b> One Mean</a><ul>
 <li class="chapter" data-level="B.2.1" data-path="B-appendixB.html"><a href="B-appendixB.html#problem-statement"><i class="fa fa-check"></i><b>B.2.1</b> Problem Statement</a></li>
 <li class="chapter" data-level="B.2.2" data-path="B-appendixB.html"><a href="B-appendixB.html#competing-hypotheses"><i class="fa fa-check"></i><b>B.2.2</b> Competing Hypotheses</a></li>
@@ -352,6 +372,7 @@
 </ul></li>
 </ul></li>
 <li class="chapter" data-level="C" data-path="C-appendixC.html"><a href="C-appendixC.html"><i class="fa fa-check"></i><b>C</b> Reach for the Starts</a><ul>
+<li class="chapter" data-level="" data-path="C-appendixC.html"><a href="C-appendixC.html#needed-packages-8"><i class="fa fa-check"></i>Needed packages</a></li>
 <li class="chapter" data-level="C.1" data-path="C-appendixC.html"><a href="C-appendixC.html#sorted-barplots"><i class="fa fa-check"></i><b>C.1</b> Sorted barplots</a></li>
 <li class="chapter" data-level="C.2" data-path="C-appendixC.html"><a href="C-appendixC.html#interactive-graphics"><i class="fa fa-check"></i><b>C.2</b> Interactive graphics</a><ul>
 <li class="chapter" data-level="C.2.1" data-path="C-appendixC.html"><a href="C-appendixC.html#interactive-line-graphs"><i class="fa fa-check"></i><b>C.2.1</b> Interactive line-graphs</a></li>
@@ -375,33 +396,18 @@ <h1>
           <div class="page-inner">
 
             <section class="normal" id="section-">
-<div id="regression-via-broom" class="section level1">
-<h1><span class="header-section-number">9</span> Regression via <code id="regress">broom</code></h1>
+<div id="regress" class="section level1">
+<h1><span class="header-section-number">9</span> Regression via broom</h1>
 <p>One of the most commonly used statistical procedures is <em>regression</em>. Regression, in its simplest form, focuses on trying to predict values of one numerical variable based on the values of another numerical variable using a straight line fit to data. We saw in Chapters <a href="7-hypo.html#hypo">7</a> and <a href="8-ci.html#ci">8</a> an example of analyses using a categorical predictor (movie genre–action or romance) and a numerical response (movie rating). In this chapter, we will focus on going back to the <code>flights</code> data frame in the <code>nycflights13</code> package to look at the relationship between departure delay and arrival delay. We will also discuss the concept of <em>correlation</em> and how it is frequently incorrectly implied to also lead to <em>causation</em>. This chapter also introduces the <code>broom</code> package, which is a useful tool in summarizing the results of model fits in tidy format. You will see examples of the <code>tidy</code>, <code>glance</code>, and <code>augment</code> functions with linear regression.</p>
 <!--
-## Simple linear regression
-
-+ Implement `tidy`, `augment`, and `glance` in `broom` package to get results
-   
-- Y = Arrival delay, X = departure delay for Alaskan flights 
-- Skipped computing b0 and b1
-- Contrasting correlation coeff vs slope
-- Interpreting b1
-- Build standard error by shuffling y based on x under the null
 - Bootstrap points (Hesterberg)
 - Interpreting table of results
     -  P-value: H0: beta1 = 0 vs !=0
     - CI: Range of plausible
 R-squared
-broom:tidy() to show regression output tables
-Other broom functions (augment and glance)
-Residual analysis?
-  Showed a histogram of residuals to convey that
-    Centered at 0
-    Have no systematic pattern
 -->
-<div id="needed-packages-5" class="section level2 unnumbered">
-<h2>Needed packages</h2>
+<div id="needed-packages-6" class="section level3 unnumbered">
+<h3>Needed packages</h3>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">library</span>(mosaic)
 <span class="kw">library</span>(dplyr)
 <span class="kw">library</span>(ggplot2)
@@ -423,8 +429,7 @@ <h2><span class="header-section-number">9.1</span> EXAMPLE: Alaskan Airlines del
 <span class="st">  </span><span class="kw">filter</span>(!<span class="kw">is.na</span>(dep_delay) &amp;<span class="st"> </span>!<span class="kw">is.na</span>(arr_delay)) %&gt;%<span class="st"> </span>
 <span class="st">  </span><span class="kw">resample</span>(<span class="dt">size =</span> <span class="dv">50</span>, <span class="dt">replace =</span> <span class="ot">FALSE</span>)
 
-<span class="kw">ggplot</span>(<span class="dt">data =</span> alaska_flights, 
-       <span class="dt">mapping =</span> <span class="kw">aes</span>(<span class="dt">x =</span> dep_delay, <span class="dt">y =</span> arr_delay)) +<span class="st"> </span>
+<span class="kw">ggplot</span>(<span class="dt">data =</span> alaska_flights, <span class="dt">mapping =</span> <span class="kw">aes</span>(<span class="dt">x =</span> dep_delay, <span class="dt">y =</span> arr_delay)) +<span class="st"> </span>
 <span class="st">   </span><span class="kw">geom_point</span>()</code></pre></div>
 <div class="figure" style="text-align: center"><span id="fig:regplot1"></span>
 <img src="ismaykim_files/figure-html/regplot1-1.png" alt="Departure and Arrival Flight Delays for a sample of 50 Alaskan flights from NYC" width="\textwidth" />
@@ -438,7 +443,7 @@ <h2><span class="header-section-number">9.1</span> EXAMPLE: Alaskan Airlines del
 <strong><em>Learning check</em></strong>
 </p>
 </div>
-<p><strong>(LC9.1)</strong> Does there appear to be a linear relationship with arrival delay and departure delay? In other words, could you fit a line to the data and have explain how <code>arr_delay</code> increases as <code>dep_delay</code> increases?</p>
+<p><strong>(LC9.1)</strong> Does there appear to be a linear relationship between arrival delay and departure delay? In other words, could you fit a line to the data and have it explain well how <code>arr_delay</code> increases as <code>dep_delay</code> increases?</p>
 <p><strong>(LC9.2)</strong> Is there only one possible line that fits the data “well”? How could you decide on which one is best if there are multiple options?</p>
 <hr />
 </div>
@@ -476,10 +481,10 @@ <h2><span class="header-section-number">9.2</span> Correlation</h2>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">alaska_flights %&gt;%<span class="st"> </span>
 <span class="st">  </span><span class="kw">summarize</span>(<span class="dt">correl =</span> <span class="kw">cor</span>(dep_delay, arr_delay))</code></pre></div>
 <pre><code>## # A tibble: 1 × 1
-##      correl
-##       &lt;dbl&gt;
-## 1 0.7907993</code></pre>
-<p>The sample correlation coefficient is denoted by <span class="math inline">\(r\)</span>. In this case, <span class="math inline">\(r = 0.7907993\)</span>.</p>
+##   correl
+##    &lt;dbl&gt;
+## 1 0.7908</code></pre>
+<p>The sample correlation coefficient is denoted by <span class="math inline">\(r\)</span>. In this case, <span class="math inline">\(r = 0.7908\)</span>.</p>
 <hr />
 <div class="learncheck">
 <p>
@@ -500,7 +505,7 @@ <h2><span class="header-section-number">9.2</span> Correlation</h2>
 <div id="correlation-does-not-imply-causation" class="section level3">
 <h3><span class="header-section-number">9.2.1</span> Correlation does not imply causation</h3>
 <p>Just because arrival delays are related to departure delays in a somewhat linear fashion, we can’t say with certaintly that arrival delays are caused <strong>entirely</strong> by departure delays. Certainly it appears that as one increases, the other tends to increase, but that might not always be the case.</p>
-<p>Causation is a tricky problem and frequently takes carefully designed experiments. These experiments remove confounding variables and only focus on the behavior of one variable in the presence of the levels of the other variable.</p>
+<p>Causation is a tricky problem and frequently takes carefully designed experiments. These experiments remove confounding variables and only focus on the behavior of one variable in the presence of the levels of the other variable(s).</p>
 <p>Be careful as you read studies to make sure that the writers aren’t falling into this fallacy of correlation implying causation. If you spot one, you may want to send them a link to <a href="http://www.tylervigen.com/spurious-correlations">Spurious Correlations</a>.</p>
 <hr />
 <div class="learncheck">
@@ -514,8 +519,8 @@ <h3><span class="header-section-number">9.2.1</span> Correlation does not imply
 </div>
 <div id="linear-regression" class="section level2">
 <h2><span class="header-section-number">9.3</span> Linear regression</h2>
-<p>So we see above that there is a strong positive association between these delay variables. Let’s say that we are waiting for our flight to leave New York City on Alaskan and we are told that our flight is going to be delayed 25 minutes. What could we predict for our arrival delay based on the plot in Figure <a href="9-regression-via-broom.html#fig:regplot1">9.1</a>?</p>
-<p>It may be hard to pick a particular value here, especially after just going on confidence intervals in Chapter <a href="8-ci.html#ci">8</a>. One way to do this would be to fit a line that fits the data best and then use the predicted <code>arr_delay</code> value from that line for <code>dep_delay = 25</code> as our prediction. But what is meant by “fits the data best”?</p>
+<p>So we see above that there is a strong positive association between these delay variables. Let’s say that we are waiting for our flight to leave New York City on Alaskan and we are told that our flight is going to be delayed 25 minutes. What could we predict for our arrival delay based on the plot in Figure <a href="9-regress.html#fig:regplot1">9.1</a>?</p>
+<p>It may be hard to pick a particular value here, especially after just going over confidence intervals in Chapter <a href="8-ci.html#ci">8</a>. One way to do this would be to fit a line that fits the data best and then use the predicted <code>arr_delay</code> value from that line for <code>dep_delay = 25</code> as our prediction. But what is meant by “fits the data best”?</p>
 <p>The least squares/best fitting/linear regression line has been fit to the data below.</p>
 <div class="figure" style="text-align: center"><span id="fig:with-reg"></span>
 <img src="ismaykim_files/figure-html/with-reg-1.png" alt="Regression line fit on delays" width="\textwidth" />
@@ -523,17 +528,17 @@ <h2><span class="header-section-number">9.3</span> Linear regression</h2>
 Figure 9.3: Regression line fit on delays
 </p>
 </div>
-<p>Here <code>lm</code> corresponds to “linear model” and we’ll see it’s use again in a bit when we find the values that define this line.</p>
+<p>Here <code>lm</code> corresponds to “linear model” and we’ll see its use again in a bit when we find the values that define this line.</p>
 <div id="understanding-linear-regression-basics" class="section level3">
 <h3><span class="header-section-number">9.3.1</span> Understanding linear regression basics</h3>
 <p>Let’s choose an arbitrary point on the graph and label it the color blue.</p>
-<p><img src="ismaykim_files/figure-html/unnamed-chunk-127-1.png" width="\textwidth" style="display: block; margin: auto;" /></p>
+<p><img src="ismaykim_files/figure-html/unnamed-chunk-130-1.png" width="\textwidth" style="display: block; margin: auto;" /></p>
 <p>Now consider this point’s <em>deviation</em> from the regression line.</p>
-<p><img src="ismaykim_files/figure-html/unnamed-chunk-128-1.png" width="\textwidth" style="display: block; margin: auto;" /></p>
+<p><img src="ismaykim_files/figure-html/unnamed-chunk-131-1.png" width="\textwidth" style="display: block; margin: auto;" /></p>
 <p>Do this for another point.</p>
-<p><img src="ismaykim_files/figure-html/unnamed-chunk-129-1.png" width="\textwidth" style="display: block; margin: auto;" /></p>
+<p><img src="ismaykim_files/figure-html/unnamed-chunk-132-1.png" width="\textwidth" style="display: block; margin: auto;" /></p>
 <p>And for another point.</p>
-<p><img src="ismaykim_files/figure-html/unnamed-chunk-130-1.png" width="\textwidth" style="display: block; margin: auto;" /></p>
+<p><img src="ismaykim_files/figure-html/unnamed-chunk-133-1.png" width="\textwidth" style="display: block; margin: auto;" /></p>
 <p>We could repeat this process for each of the points in our sample. The pattern that emerges here is that the regression line minimizes the sum of the squared arrow lengths (i.e., the least squares) for all of the points.</p>
 <p>As you look at these points you might think that a different line could fit the data better based on this criteria. That isn’t the case though and it can be shown via calculus (omitted here) that this line minimizes the sum of the squared residuals for these 50 points.</p>
 </div>
@@ -555,28 +560,28 @@ <h3><span class="header-section-number">9.3.2</span> The equation of the line</h
 <tbody>
 <tr class="odd">
 <td align="left">(Intercept)</td>
-<td align="right">-14.155017</td>
-<td align="right">2.8094813</td>
-<td align="right">-5.038302</td>
-<td align="right">0.0000071</td>
+<td align="right">-14.155</td>
+<td align="right">2.809</td>
+<td align="right">-5.038</td>
+<td align="right">0</td>
 </tr>
 <tr class="even">
 <td align="left">dep_delay</td>
-<td align="right">1.217666</td>
-<td align="right">0.1360336</td>
-<td align="right">8.951212</td>
-<td align="right">0.0000000</td>
+<td align="right">1.218</td>
+<td align="right">0.136</td>
+<td align="right">8.951</td>
+<td align="right">0</td>
 </tr>
 </tbody>
 </table>
-<p>In general, the equation of the line of best fit for a sample is <span class="math display">\[\hat{y} = b_0 + b_1 x\]</span>. Thus, our equation is <span class="math inline">\(\hat{y} = -14.1550165 + 1.2176658 \, x\)</span>. It is usually preferred to actually write the names of the variables instead of <span class="math inline">\(x\)</span> and <span class="math inline">\(y\)</span>: <span class="math display">\[\widehat{arr\_delay} = -14.1550165 + 1.2176658 \, dep\_delay\]</span>.</p>
+<p>In general, the equation of the line of best fit for a sample is <span class="math display">\[\hat{y} = b_0 + b_1 x.\]</span> Thus, our equation is <span class="math inline">\(\hat{y} = -14.155 + 1.2177 \, x.\)</span> It is usually preferred to actually write the names of the variables instead of <span class="math inline">\(x\)</span> and <span class="math inline">\(y\)</span>: <span class="math display">\[\widehat{arr\_delay} = -14.155 + 1.2177 \, dep\_delay.\]</span></p>
 <p>We can also extract the coefficients by using the <code>coef</code> function:</p>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">coef</span>(delay_fit)</code></pre></div>
 <pre><code>## (Intercept)   dep_delay 
-##  -14.155016    1.217666</code></pre>
+##     -14.155       1.218</code></pre>
 </div>
-<div id="interpretting-the-slope" class="section level3">
-<h3><span class="header-section-number">9.3.3</span> Interpretting the slope</h3>
+<div id="interpreting-the-slope" class="section level3">
+<h3><span class="header-section-number">9.3.3</span> Interpreting the slope</h3>
 <p>After you have determined your line of best fit, it is good practice to interpret the results to see if they make sense. Slope is defined as rise over run or the change in <span class="math inline">\(y\)</span> for every one unit increase in <span class="math inline">\(x\)</span>. For our specific example, we can say that for every one <strong>minute</strong> increase in the departure delay of Alaskan Airlines flights from NYC, we can expect the corresponding arrival delay to be 1.22 minutes more.</p>
 <p>This estimate does make some practical sense. It would be strange if arrival delays went down as departure delays increased. We also expect that the longer a flight is delayed on departure, the more likely the longer a flight is delayed on arrival. Remember that we are also using data here to make a guess as to how the population of all Alaskan flights might behave with regards to departure delays and arrival delays, so just as with other sampling procedures there is also variability in the sample estimates for the regression line.</p>
 </div>
@@ -584,21 +589,21 @@ <h3><span class="header-section-number">9.3.3</span> Interpretting the slope</h3
 <h3><span class="header-section-number">9.3.4</span> Predicting values</h3>
 <p>Getting back to our hypothetical flight that has been delayed 25 minutes, we can use the <code>augment</code> function in the <code>broom</code> package to get the fitted arrival delay value:</p>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">delay_fit %&gt;%<span class="st"> </span><span class="kw">augment</span>(<span class="dt">newdata =</span> <span class="kw">data_frame</span>(<span class="dt">dep_delay =</span> <span class="dv">25</span>))</code></pre></div>
-<pre><code>##   dep_delay  .fitted  .se.fit
-## 1        25 16.28663 3.967287</code></pre>
-<p>Note the use of the <code>data_frame</code> function here, which must be used since <code>newdata</code> is expected a data frame as its argument. We must also specify that we are plugging in 25 for the value of <code>dep_delay</code> here. We can see that the line predicted an arrival delay of 16.29 minutes based on our 25 minute departure delay. This also does make some sense since flights that aren’t delayed greatly from the beginning to tend to make up time in the air to compensate.</p>
+<pre><code>##   dep_delay .fitted .se.fit
+## 1        25   16.29   3.967</code></pre>
+<p>Note the use of the <code>data_frame</code> function here, which must be used since <code>newdata</code> is expecting a data frame as its argument. We must also specify that we are plugging in 25 for the value of <code>dep_delay</code> here. We can see that the line predicted an arrival delay of 16.29 minutes based on our 25 minute departure delay. This also does make some sense since flights that aren’t delayed greatly from the beginning tend to make up time in the air to compensate.</p>
 <p><strong>Important note</strong>: The correlation coefficient and the slope of the regression line are not the same thing. They will always share the same sign (positive correlation coefficients correspond to positive slope coefficients and the same holds true for negative values), but you can’t make any more conclusions about them than that.</p>
 <p>For example, say we have 3 groups of points:</p>
-<p><img src="ismaykim_files/figure-html/unnamed-chunk-133-1.png" width="\textwidth" style="display: block; margin: auto;" /></p>
+<p><img src="ismaykim_files/figure-html/unnamed-chunk-136-1.png" width="\textwidth" style="display: block; margin: auto;" /></p>
 <p>Their regression lines have different slopes, but <span class="math inline">\(r = 1\)</span> for all 3. In other words, all three groups of points have a perfect (positive) linear relationship.</p>
-<p><img src="ismaykim_files/figure-html/unnamed-chunk-134-1.png" width="\textwidth" style="display: block; margin: auto;" /></p>
+<p><img src="ismaykim_files/figure-html/unnamed-chunk-137-1.png" width="\textwidth" style="display: block; margin: auto;" /></p>
 </div>
 </div>
 <div id="inference-for-regression" class="section level2">
 <h2><span class="header-section-number">9.4</span> Inference for regression</h2>
-<p>The population least squares line is defined by the formula <span class="math inline">\(y = \beta_0 + \beta_1 x + \epsilon\)</span>. Here <span class="math inline">\(\epsilon\)</span> corresponds to the error term. It corresponds to the part of the response variable <span class="math inline">\(y\)</span> that remains unexplained after considering the predictor variable <span class="math inline">\(x\)</span>. Often it is standard practice to assume that this error term follows a normal distribution. We will focus on checking whether that assumption is valid in Section <a href="9-regression-via-broom.html#resid">9.5</a>.</p>
+<p>The population least squares line is defined by the formula <span class="math inline">\(y = \beta_0 + \beta_1 x + \epsilon\)</span>. Here <span class="math inline">\(\epsilon\)</span> represents the error term. It corresponds to the part of the response variable <span class="math inline">\(y\)</span> that remains unexplained after considering the predictor variable <span class="math inline">\(x\)</span>. Often it is standard practice to assume that this error term follows a normal distribution. We will focus on checking whether that assumption is valid in Section <a href="9-regress.html#resid">9.5</a>.</p>
 <p>In the population least squares line <span class="math inline">\(y = \beta_0 + \beta_1 x + \epsilon\)</span>, we can see that if <span class="math inline">\(\beta_1 = 0\)</span> there is no relationship between <span class="math inline">\(x\)</span> and <span class="math inline">\(y\)</span>. If <span class="math inline">\(\beta_1 = 0\)</span>, <span class="math inline">\(y = \beta_0 + \epsilon\)</span>. Therefore, <span class="math inline">\(y\)</span> does not depend on <span class="math inline">\(x\)</span> at all in the equation. A hypothesis test is frequently conducted to check whether a relationship exists between two numerical variables <span class="math inline">\(x\)</span> and <span class="math inline">\(y\)</span>.</p>
-<p>We can also use the concept of shuffling to determine standard error and conduct a hypothesis test for a population slope. Let’s go back to our example on Alaskan flights that represent a sample of all Alaskan flights departing NYC in 2013. Let’s test to see if we have evidence that a <em>positive</em> relationship exists between the departure delay and arrival delay for Alaskan flights. We will set up this hypothesis testing process as we have each before via the “There is Only One Test” diagram in Figure <a href="7-hypo.html#fig:htdowney">7.1</a>.</p>
+<p>We can also use the concept of shuffling to determine the standard error of our null distribution and conduct a hypothesis test for a population slope. Let’s go back to our example on Alaskan flights that represent a sample of all Alaskan flights departing NYC in 2013. Let’s test to see if we have evidence that a <em>positive</em> relationship exists between the departure delay and arrival delay for Alaskan flights. We will set up this hypothesis testing process as we have before via the “There is Only One Test” diagram in Figure <a href="7-hypo.html#fig:htdowney">7.1</a>.</p>
 <div id="data-2" class="section level3">
 <h3><span class="header-section-number">9.4.1</span> Data</h3>
 <p>Our data is stored in <code>alaska_flights</code> and we are focused on the 50 measurements of <code>dep_delay</code> and <code>arr_delay</code> there.</p>
@@ -610,17 +615,16 @@ <h3><span class="header-section-number">9.4.2</span> Test Statistic <span class=
 <div id="observed-effect-delta-2" class="section level3">
 <h3><span class="header-section-number">9.4.3</span> Observed effect <span class="math inline">\(\delta^*\)</span></h3>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">(b1_obs &lt;-<span class="st"> </span><span class="kw">tidy</span>(delay_fit)$estimate[<span class="dv">2</span>])</code></pre></div>
-<pre><code>## [1] 1.217666</code></pre>
-<p>The calculated slope value from our observed sample is <span class="math inline">\(b_1 = 1.2176658\)</span>.</p>
+<pre><code>## [1] 1.218</code></pre>
+<p>The calculated slope value from our observed sample is <span class="math inline">\(b_1 = 1.2177\)</span>.</p>
 </div>
 <div id="model-of-h_0-2" class="section level3">
 <h3><span class="header-section-number">9.4.4</span> Model of <span class="math inline">\(H_0\)</span></h3>
-<p>We are looking to see if a positive relationship exists so <span class="math inline">\(H_A: \beta_1 &gt; 0\)</span>. Our null hypothesis is always in terms of equality so we have <span class="math inline">\(\beta_1 = 0\)</span>.</p>
+<p>We are looking to see if a positive relationship exists so <span class="math inline">\(H_a: \beta_1 &gt; 0\)</span>. Our null hypothesis is always in terms of equality so we have <span class="math inline">\(H_0: \beta_1 = 0\)</span>.</p>
 </div>
 <div id="simulated-data-2" class="section level3">
 <h3><span class="header-section-number">9.4.5</span> Simulated Data</h3>
-<p>Now to simulate the null hypothesis being true and recreating how our sample was created, we need to think about what it means for <span class="math inline">\(\beta_1\)</span> to be zero.</p>
-<p>If <span class="math inline">\(\beta_1 = 0\)</span>, we said above that there is no relationship between the departure delay and arrival delay. If there is no relationship, then any one of the arrival delay values could have just as likely occurred with any of the other departure delay values instead of the one that it actually did fall with. We, therefore, have another example of shuffling in our simulating data.</p>
+<p>Now to simulate the null hypothesis being true and recreate how our sample was created, we need to think about what it means for <span class="math inline">\(\beta_1\)</span> to be zero. If <span class="math inline">\(\beta_1 = 0\)</span>, we said above that there is no relationship between the departure delay and arrival delay. If there is no relationship, then any one of the arrival delay values could have just as likely occurred with any of the other departure delay values instead of the one that it actually did fall with. We, therefore, have another example of shuffling in our simulation of data.</p>
 <p><strong>Tactile simulation</strong></p>
 <p>We could use a deck of 100 note cards to create a tactile simulation of this shuffling process. We would write the 50 different values of departure delays on each of the 50 cards, one per card. We would then do the same thing for the 50 arrival delays putting them on one per card.</p>
 <p>Next, we would lay out each of the 50 departure delay cards and we would shuffle the arrival delay deck. Then, after shuffling the deck well, we would disperse the cards one per each one of the departure delay cards. We would then enter these new values in for arrival delay and compute a sample slope based on this shuffling. We could repeat this process many times, keeping track of our sample slope after each shuffle.</p>
@@ -636,20 +640,20 @@ <h3><span class="header-section-number">9.4.6</span> Distribution of <span class
 <p>We see that the names of our columns are <code>Intercept</code> and <code>dep_delay</code>. We want to look at <code>dep_delay</code> since that corresponds to the slope coefficients.</p>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">ggplot</span>(<span class="dt">data =</span> rand_distn, <span class="dt">mapping =</span> <span class="kw">aes</span>(<span class="dt">x =</span> dep_delay)) +
 <span class="st">  </span><span class="kw">geom_histogram</span>(<span class="dt">color =</span> <span class="st">&quot;white&quot;</span>, <span class="dt">bins =</span> <span class="dv">20</span>)</code></pre></div>
-<p><img src="ismaykim_files/figure-html/unnamed-chunk-136-1.png" width="\textwidth" style="display: block; margin: auto;" /></p>
+<p><img src="ismaykim_files/figure-html/unnamed-chunk-139-1.png" width="\textwidth" style="display: block; margin: auto;" /></p>
 </div>
 <div id="the-p-value-2" class="section level3">
 <h3><span class="header-section-number">9.4.7</span> The p-value</h3>
-<p>Recall that we want to see where our observed sample slope <span class="math inline">\(\delta^* = 1.2176658\)</span> falls on this distribution and then count all of the values to the right of it corresponding to <span class="math inline">\(H_A: \beta_0 &gt; 0\)</span>. To get a sense for where our values falls, we can shade all values at least as big as <span class="math inline">\(\delta^*\)</span>.</p>
+<p>Recall that we want to see where our observed sample slope <span class="math inline">\(\delta^* = 1.2177\)</span> falls on this distribution and then count all of the values to the right of it corresponding to <span class="math inline">\(H_a: \beta_1 &gt; 0\)</span>. To get a sense for where our value falls, we can shade all values at least as big as <span class="math inline">\(\delta^*\)</span>.</p>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">ggplot</span>(<span class="dt">data =</span> rand_distn, <span class="kw">aes</span>(<span class="dt">x =</span> dep_delay, <span class="dt">fill =</span> (dep_delay &gt;=<span class="st"> </span>b1_obs))) +
 <span class="st">  </span><span class="kw">geom_histogram</span>(<span class="dt">color =</span> <span class="st">&quot;white&quot;</span>, <span class="dt">bins =</span> <span class="dv">20</span>)</code></pre></div>
-<div class="figure" style="text-align: center"><span id="fig:unnamed-chunk-137"></span>
-<img src="ismaykim_files/figure-html/unnamed-chunk-137-1.png" alt="Shaded histogram to show p-value" width="\textwidth" />
+<div class="figure" style="text-align: center"><span id="fig:unnamed-chunk-140"></span>
+<img src="ismaykim_files/figure-html/unnamed-chunk-140-1.png" alt="Shaded histogram to show p-value" width="\textwidth" />
 <p class="caption">
 Figure 9.4: Shaded histogram to show p-value
 </p>
 </div>
-<p>Since 1.2176658 falls far to the right of this plot, we can say that we have a <span class="math inline">\(p\)</span>-value of 0. We, thus, have evidence to reject the null hypothesis in support of there being a positive association between the departure delay and arrival delay of all Alaskan flights from NYC in 2013.</p>
+<p>Since 1.2177 falls far to the right of this plot, we can say that we have a <span class="math inline">\(p\)</span>-value of 0. We, thus, have evidence to reject the null hypothesis in support of there being a positive association between the departure delay and arrival delay of all Alaskan flights from NYC in 2013.</p>
 <hr />
 <div class="learncheck">
 <p>
@@ -657,15 +661,15 @@ <h3><span class="header-section-number">9.4.7</span> The p-value</h3>
 </p>
 </div>
 <p><strong>(LC9.7)</strong> Repeat the inference above but this time for the correlation coefficient instead of the slope.</p>
-<p><strong>(LC9.8)</strong> Use bootstrapping with points to determine a range of possible values for the population slope comparing departure delays to arrival delays for Alaskan flights in 2013 from NYC.</p>
+<p><strong>(LC9.8)</strong> Use bootstrapping (of points) to determine a range of possible values for the population slope comparing departure delays to arrival delays for Alaskan flights in 2013 from NYC.</p>
 <hr />
 </div>
 </div>
 <div id="resid" class="section level2">
 <h2><span class="header-section-number">9.5</span> Residual analysis</h2>
 <p>The following diagram will help you to keep track of what is meant by a residual.</p>
-<p><img src="ismaykim_files/figure-html/unnamed-chunk-138-1.png" width="\textwidth" style="display: block; margin: auto;" /></p>
-<p>Here, <span class="math inline">\(y_i\)</span> is an observed value of the <code>arr_delay</code> variable. <span class="math inline">\(i\)</span> ranges from 1 to 50. <span class="math inline">\(\hat{y}_i\)</span> is the fitted value–the <code>arr_delay</code> value that is being pointed to on the red line. The residual is <span class="math display">\[\hat{\epsilon}_i = y_i - \hat{y}_i\]</span>. <strong>Note the order here!</strong> You start at the non-pointy end of the arrow (<span class="math inline">\(y_i\)</span>) and then subtract away what comes at the point (<span class="math inline">\(\hat{y_i}\)</span>).</p>
+<p><img src="ismaykim_files/figure-html/unnamed-chunk-141-1.png" width="\textwidth" style="display: block; margin: auto;" /></p>
+<p>Here, <span class="math inline">\(y_i\)</span> is an observed value of the <code>arr_delay</code> variable. <span class="math inline">\(i\)</span> ranges from 1 to 50. For this example, <span class="math inline">\(y_i\)</span> is the vertical component of the blue dot. <span class="math inline">\(\hat{y}_i\)</span> is the fitted value–the <code>arr_delay</code> value that is being pointed to on the red line. The residual is <span class="math display">\[\hat{\epsilon}_i = y_i - \hat{y}_i.\]</span> <strong>Note the order here!</strong> You start at the non-pointy end of the arrow (<span class="math inline">\(y_i\)</span>) and then subtract away what comes at the point (<span class="math inline">\(\hat{y}_i\)</span>).</p>
 </div>
 <div id="conditions-for-regression" class="section level2">
 <h2><span class="header-section-number">9.6</span> Conditions for regression</h2>
@@ -693,15 +697,19 @@ <h2><span class="header-section-number">9.6</span> Conditions for regression</h2
 <li><p>The second condition is invalidated if there is a trigonometric pattern of up and down throughout the residual plot. That is not the case here.</p></li>
 <li><p>We look at the <em>quantile-quantile plot</em> (Q-Q plot for sure) for the third condition. We are looking to see if the residuals fall on a straight line with what we would expect if they were normally distributed. We see some curvature here as well. We should begin to wonder if regression was valid here with both condition 1 and condition 3 in question.</p></li>
 </ol>
-<p>We have reason to doubt whether a linear regression is valid here. Unfortunately, all too frequently regressions are run without checking these assumptions carefully. While small deviations in the assumptions can be OK, larger violations can completely invalidate the results and make any inferences improbable and questionable.</p>
+<p>We have reason to doubt whether a linear regression is valid here. Unfortunately, all too frequently regressions are run without checking these assumptions carefully. While small deviations from the assumptions can be OK, larger violations can completely invalidate the results and make any inferences improbable and questionable.</p>
 </div>
-<div id="script-of-r-code-4" class="section level2">
-<h2><span class="header-section-number">9.7</span> Script of R code</h2>
+<div id="conclusion-5" class="section level2">
+<h2><span class="header-section-number">9.7</span> Conclusion</h2>
+<div id="script-of-r-code-5" class="section level3">
+<h3><span class="header-section-number">9.7.1</span> Script of R code</h3>
 <p>An R script file of all R code used in this chapter is available <a href="http://ismayc.github.io/moderndiver-book/09-regress.R">here</a>.</p>
 </div>
-<div id="whats-to-come-6" class="section level2">
-<h2><span class="header-section-number">9.8</span> What’s to come?</h2>
+<div id="whats-to-come-6" class="section level3">
+<h3><span class="header-section-number">9.7.2</span> What’s to come?</h3>
+<p>In the last chapter of the textbook, we’ll summarize the purpose of this book as well as present an excellent example of what goes into making an effective story via data.</p>
 
+</div>
 </div>
 </div>
 
diff --git a/docs/A-appendixA.html b/docs/A-appendixA.html
index 1280c39ee..58b85eaf0 100644
--- a/docs/A-appendixA.html
+++ b/docs/A-appendixA.html
@@ -26,7 +26,7 @@
 <meta name="author" content="Chester Ismay and Albert Y. Kim">
 
 
-<meta name="date" content="2017-01-07">
+<meta name="date" content="2017-01-10">
 
   <meta name="viewport" content="width=device-width, initial-scale=1">
   <meta name="apple-mobile-web-app-capable" content="yes">
@@ -119,9 +119,9 @@
 
 <ul class="summary">
 <li class="chapter" data-level="1" data-path="index.html"><a href="index.html"><i class="fa fa-check"></i><b>1</b> Preamble</a><ul>
-<li class="chapter" data-level="1.1" data-path="index.html"><a href="index.html#principles-of-this-book"><i class="fa fa-check"></i><b>1.1</b> Principles of this Book</a></li>
+<li class="chapter" data-level="1.1" data-path="index.html"><a href="index.html#principles-of-this-book---for-instructors"><i class="fa fa-check"></i><b>1.1</b> Principles of this Book - For Instructors</a></li>
 <li class="chapter" data-level="1.2" data-path="index.html"><a href="index.html#contribute"><i class="fa fa-check"></i><b>1.2</b> Contribute</a></li>
-<li class="chapter" data-level="1.3" data-path="index.html"><a href="index.html#getting-started"><i class="fa fa-check"></i><b>1.3</b> Getting Started</a></li>
+<li class="chapter" data-level="1.3" data-path="index.html"><a href="index.html#getting-started---for-students"><i class="fa fa-check"></i><b>1.3</b> Getting Started - For Students</a></li>
 <li class="chapter" data-level="" data-path="index.html"><a href="index.html#colophon"><i class="fa fa-check"></i>Colophon</a></li>
 </ul></li>
 <li class="chapter" data-level="2" data-path="2-intro.html"><a href="2-intro.html"><i class="fa fa-check"></i><b>2</b> Introduction</a><ul>
@@ -133,87 +133,100 @@
 </ul></li>
 <li class="part"><span><b>I Data Exploration</b></span></li>
 <li class="chapter" data-level="3" data-path="3-tidy.html"><a href="3-tidy.html"><i class="fa fa-check"></i><b>3</b> Tidy Data</a><ul>
+<li class="chapter" data-level="" data-path="3-tidy.html"><a href="3-tidy.html#needed-packages"><i class="fa fa-check"></i>Needed packages</a></li>
 <li class="chapter" data-level="3.1" data-path="3-tidy.html"><a href="3-tidy.html#what-is-tidy-data"><i class="fa fa-check"></i><b>3.1</b> What is tidy data?</a></li>
-<li class="chapter" data-level="3.2" data-path="3-tidy.html"><a href="3-tidy.html#the-nycflights13-datasets"><i class="fa fa-check"></i><b>3.2</b> The <code>nycflights13</code> datasets</a></li>
-<li class="chapter" data-level="3.3" data-path="3-tidy.html"><a href="3-tidy.html#how-is-flights-tidy"><i class="fa fa-check"></i><b>3.3</b> How is <code>flights</code> tidy?</a></li>
+<li class="chapter" data-level="3.2" data-path="3-tidy.html"><a href="3-tidy.html#datasets-in-the-nycflights13-package"><i class="fa fa-check"></i><b>3.2</b> Datasets in the <code>nycflights13</code> package</a></li>
+<li class="chapter" data-level="3.3" data-path="3-tidy.html"><a href="3-tidy.html#how-is-flights-tidy"><i class="fa fa-check"></i><b>3.3</b> How is <code>flights</code> tidy?</a><ul>
+<li class="chapter" data-level="3.3.1" data-path="3-tidy.html"><a href="3-tidy.html#identification-variables"><i class="fa fa-check"></i><b>3.3.1</b> Identification variables</a></li>
+</ul></li>
 <li class="chapter" data-level="3.4" data-path="3-tidy.html"><a href="3-tidy.html#normal-forms-of-data"><i class="fa fa-check"></i><b>3.4</b> Normal forms of data</a></li>
 <li class="chapter" data-level="3.5" data-path="3-tidy.html"><a href="3-tidy.html#whats-to-come"><i class="fa fa-check"></i><b>3.5</b> What’s to come?</a></li>
 </ul></li>
-<li class="chapter" data-level="4" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html"><i class="fa fa-check"></i><b>4</b> Data Visualization via <code id="viz">ggplot2</code></a><ul>
-<li class="chapter" data-level="" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#needed-packages"><i class="fa fa-check"></i>Needed packages</a></li>
-<li class="chapter" data-level="4.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#grammarofgraphics"><i class="fa fa-check"></i><b>4.1</b> The Grammar of Graphics</a><ul>
-<li class="chapter" data-level="4.1.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#components-of-grammar"><i class="fa fa-check"></i><b>4.1.1</b> Components of Grammar</a></li>
-<li class="chapter" data-level="4.1.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#napoleans-march-on-moscow"><i class="fa fa-check"></i><b>4.1.2</b> Napolean’s March on Moscow</a></li>
-<li class="chapter" data-level="4.1.3" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#other-components-of-the-grammar"><i class="fa fa-check"></i><b>4.1.3</b> Other Components of the Grammar</a></li>
-<li class="chapter" data-level="4.1.4" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#the-ggplot2-package"><i class="fa fa-check"></i><b>4.1.4</b> The <code>ggplot2</code> Package</a></li>
-</ul></li>
-<li class="chapter" data-level="4.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#five-named-graphs---the-5ng"><i class="fa fa-check"></i><b>4.2</b> Five Named Graphs - The 5NG</a></li>
-<li class="chapter" data-level="4.3" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#scatterplots"><i class="fa fa-check"></i><b>4.3</b> 5NG#1: Scatter-plots</a><ul>
-<li class="chapter" data-level="4.3.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#scatter-plots-via-geom_point"><i class="fa fa-check"></i><b>4.3.1</b> Scatter-plots via <code id="geompoint">geom_point</code></a></li>
-<li class="chapter" data-level="4.3.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#over-plotting"><i class="fa fa-check"></i><b>4.3.2</b> Over-Plotting</a></li>
-<li class="chapter" data-level="4.3.3" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#summary"><i class="fa fa-check"></i><b>4.3.3</b> Summary</a></li>
-</ul></li>
-<li class="chapter" data-level="4.4" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#linegraphs"><i class="fa fa-check"></i><b>4.4</b> 5NG#2: Line-graphs</a><ul>
-<li class="chapter" data-level="4.4.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#line-graphs-via-geom_line"><i class="fa fa-check"></i><b>4.4.1</b> Line-graphs via <code id="geomline">geom_line</code></a></li>
-<li class="chapter" data-level="4.4.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#summary-1"><i class="fa fa-check"></i><b>4.4.2</b> Summary</a></li>
-</ul></li>
-<li class="chapter" data-level="4.5" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#histograms"><i class="fa fa-check"></i><b>4.5</b> 5NG#3: Histograms</a><ul>
-<li class="chapter" data-level="4.5.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#histograms-via-geom_histogram"><i class="fa fa-check"></i><b>4.5.1</b> Histograms via <code id="geomhistogram">geom_histogram</code></a></li>
-<li class="chapter" data-level="4.5.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#adjustbins"><i class="fa fa-check"></i><b>4.5.2</b> Adjusting the Bins</a></li>
-<li class="chapter" data-level="4.5.3" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#summary-2"><i class="fa fa-check"></i><b>4.5.3</b> Summary</a></li>
-</ul></li>
-<li class="chapter" data-level="4.6" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#facets"><i class="fa fa-check"></i><b>4.6</b> Facets</a></li>
-<li class="chapter" data-level="4.7" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#ng4-boxplots"><i class="fa fa-check"></i><b>4.7</b> 5NG#4: Boxplots</a><ul>
-<li class="chapter" data-level="4.7.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#boxplots-via-geom_boxplot"><i class="fa fa-check"></i><b>4.7.1</b> Boxplots via <code id="geomboxplot">geom_boxplot</code></a></li>
-<li class="chapter" data-level="4.7.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#summary-3"><i class="fa fa-check"></i><b>4.7.2</b> Summary</a></li>
-</ul></li>
-<li class="chapter" data-level="4.8" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#geombar"><i class="fa fa-check"></i><b>4.8</b> 5NG#5: Barplots</a><ul>
-<li class="chapter" data-level="4.8.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#barplots-via-geom_bar"><i class="fa fa-check"></i><b>4.8.1</b> Barplots via <code>geom_bar</code></a></li>
-<li class="chapter" data-level="4.8.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#must-avoid-pie-charts"><i class="fa fa-check"></i><b>4.8.2</b> Must avoid pie charts!</a></li>
-<li class="chapter" data-level="4.8.3" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#using-barplots-to-compare-two-variables"><i class="fa fa-check"></i><b>4.8.3</b> Using barplots to compare two variables</a></li>
-<li class="chapter" data-level="4.8.4" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#summary-4"><i class="fa fa-check"></i><b>4.8.4</b> Summary</a></li>
-</ul></li>
-<li class="chapter" data-level="4.9" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#conclusion"><i class="fa fa-check"></i><b>4.9</b> Conclusion</a><ul>
-<li class="chapter" data-level="4.9.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#whats-to-come-1"><i class="fa fa-check"></i><b>4.9.1</b> What’s to come?</a></li>
-<li class="chapter" data-level="4.9.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#script-of-r-code"><i class="fa fa-check"></i><b>4.9.2</b> Script of R code</a></li>
-</ul></li>
-</ul></li>
-<li class="chapter" data-level="5" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html"><i class="fa fa-check"></i><b>5</b> Data Manipulation via <code id="manip">dplyr</code></a><ul>
-<li class="chapter" data-level="" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#needed-packages-1"><i class="fa fa-check"></i>Needed packages</a></li>
-<li class="chapter" data-level="5.1" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#the-pipe"><i class="fa fa-check"></i><b>5.1</b> The pipe <code>%&gt;%</code></a></li>
-<li class="chapter" data-level="5.2" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#four-main-verbs---the-4mv"><i class="fa fa-check"></i><b>5.2</b> Four Main Verbs - The 4MV</a><ul>
-<li class="chapter" data-level="5.2.1" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#filter-observations-using-filter"><i class="fa fa-check"></i><b>5.2.1</b> Filter observations using <code id="filter">filter</code></a></li>
-<li class="chapter" data-level="5.2.2" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#summarize-variables-using-summarize"><i class="fa fa-check"></i><b>5.2.2</b> Summarize variables using <code>summarize</code></a></li>
-<li class="chapter" data-level="5.2.3" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#create-new-variableschange-old-variables-using-mutate"><i class="fa fa-check"></i><b>5.2.3</b> Create new variables/change old variables using <code>mutate</code></a></li>
-<li class="chapter" data-level="5.2.4" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#reorder-the-data-frame-using-arrange"><i class="fa fa-check"></i><b>5.2.4</b> Reorder the data frame using <code id="arrange">arrange</code></a></li>
-</ul></li>
-<li class="chapter" data-level="5.3" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#other-verbs"><i class="fa fa-check"></i><b>5.3</b> Other verbs</a><ul>
-<li class="chapter" data-level="5.3.1" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#select-variables-using-select"><i class="fa fa-check"></i><b>5.3.1</b> Select variables using <code id="select">select</code></a></li>
-<li class="chapter" data-level="5.3.2" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#rename-variables-using-rename"><i class="fa fa-check"></i><b>5.3.2</b> Rename variables using <code id="rename">rename</code></a></li>
-<li class="chapter" data-level="5.3.3" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#find-the-top-number-of-values-using-top_n"><i class="fa fa-check"></i><b>5.3.3</b> Find the top number of values using <code>top_n</code></a></li>
-</ul></li>
-<li class="chapter" data-level="5.4" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#joiningmerging-data-frames"><i class="fa fa-check"></i><b>5.4</b> Joining/merging data frames</a></li>
-<li class="chapter" data-level="5.5" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#script-of-r-code-1"><i class="fa fa-check"></i><b>5.5</b> Script of R code</a></li>
-<li class="chapter" data-level="5.6" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#whats-to-come-2"><i class="fa fa-check"></i><b>5.6</b> What’s to come?</a></li>
+<li class="chapter" data-level="4" data-path="4-viz.html"><a href="4-viz.html"><i class="fa fa-check"></i><b>4</b> Data Visualization via ggplot2</a><ul>
+<li class="chapter" data-level="" data-path="4-viz.html"><a href="4-viz.html#needed-packages-1"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="4.1" data-path="4-viz.html"><a href="4-viz.html#grammarofgraphics"><i class="fa fa-check"></i><b>4.1</b> The Grammar of Graphics</a><ul>
+<li class="chapter" data-level="4.1.1" data-path="4-viz.html"><a href="4-viz.html#components-of-grammar"><i class="fa fa-check"></i><b>4.1.1</b> Components of Grammar</a></li>
+<li class="chapter" data-level="4.1.2" data-path="4-viz.html"><a href="4-viz.html#napoleans-march-on-moscow"><i class="fa fa-check"></i><b>4.1.2</b> Napoleon’s March on Moscow</a></li>
+<li class="chapter" data-level="4.1.3" data-path="4-viz.html"><a href="4-viz.html#other-components-of-the-grammar"><i class="fa fa-check"></i><b>4.1.3</b> Other Components of the Grammar</a></li>
+<li class="chapter" data-level="4.1.4" data-path="4-viz.html"><a href="4-viz.html#the-ggplot2-package"><i class="fa fa-check"></i><b>4.1.4</b> The ggplot2 Package</a></li>
+</ul></li>
+<li class="chapter" data-level="4.2" data-path="4-viz.html"><a href="4-viz.html#five-named-graphs---the-5ng"><i class="fa fa-check"></i><b>4.2</b> Five Named Graphs - The 5NG</a></li>
+<li class="chapter" data-level="4.3" data-path="4-viz.html"><a href="4-viz.html#scatterplots"><i class="fa fa-check"></i><b>4.3</b> 5NG#1: Scatter-plots</a><ul>
+<li class="chapter" data-level="4.3.1" data-path="4-viz.html"><a href="4-viz.html#geompoint"><i class="fa fa-check"></i><b>4.3.1</b> Scatter-plots via geom_point</a></li>
+<li class="chapter" data-level="4.3.2" data-path="4-viz.html"><a href="4-viz.html#over-plotting"><i class="fa fa-check"></i><b>4.3.2</b> Over-Plotting</a></li>
+<li class="chapter" data-level="4.3.3" data-path="4-viz.html"><a href="4-viz.html#summary"><i class="fa fa-check"></i><b>4.3.3</b> Summary</a></li>
+</ul></li>
+<li class="chapter" data-level="4.4" data-path="4-viz.html"><a href="4-viz.html#linegraphs"><i class="fa fa-check"></i><b>4.4</b> 5NG#2: Line-graphs</a><ul>
+<li class="chapter" data-level="4.4.1" data-path="4-viz.html"><a href="4-viz.html#geomline"><i class="fa fa-check"></i><b>4.4.1</b> Line-graphs via geom_line</a></li>
+<li class="chapter" data-level="4.4.2" data-path="4-viz.html"><a href="4-viz.html#summary-1"><i class="fa fa-check"></i><b>4.4.2</b> Summary</a></li>
+</ul></li>
+<li class="chapter" data-level="4.5" data-path="4-viz.html"><a href="4-viz.html#histograms"><i class="fa fa-check"></i><b>4.5</b> 5NG#3: Histograms</a><ul>
+<li class="chapter" data-level="4.5.1" data-path="4-viz.html"><a href="4-viz.html#geomhistogram"><i class="fa fa-check"></i><b>4.5.1</b> Histograms via geom_histogram</a></li>
+<li class="chapter" data-level="4.5.2" data-path="4-viz.html"><a href="4-viz.html#adjustbins"><i class="fa fa-check"></i><b>4.5.2</b> Adjusting the Bins</a></li>
+<li class="chapter" data-level="4.5.3" data-path="4-viz.html"><a href="4-viz.html#summary-2"><i class="fa fa-check"></i><b>4.5.3</b> Summary</a></li>
+</ul></li>
+<li class="chapter" data-level="4.6" data-path="4-viz.html"><a href="4-viz.html#facets"><i class="fa fa-check"></i><b>4.6</b> Facets</a></li>
+<li class="chapter" data-level="4.7" data-path="4-viz.html"><a href="4-viz.html#ng4-boxplots"><i class="fa fa-check"></i><b>4.7</b> 5NG#4: Boxplots</a><ul>
+<li class="chapter" data-level="4.7.1" data-path="4-viz.html"><a href="4-viz.html#geomboxplot"><i class="fa fa-check"></i><b>4.7.1</b> Boxplots via geom_boxplot</a></li>
+<li class="chapter" data-level="4.7.2" data-path="4-viz.html"><a href="4-viz.html#summary-3"><i class="fa fa-check"></i><b>4.7.2</b> Summary</a></li>
+</ul></li>
+<li class="chapter" data-level="4.8" data-path="4-viz.html"><a href="4-viz.html#geombar"><i class="fa fa-check"></i><b>4.8</b> 5NG#5: Barplots</a><ul>
+<li class="chapter" data-level="4.8.1" data-path="4-viz.html"><a href="4-viz.html#barplots-via-geom_bar"><i class="fa fa-check"></i><b>4.8.1</b> Barplots via geom_bar</a></li>
+<li class="chapter" data-level="4.8.2" data-path="4-viz.html"><a href="4-viz.html#must-avoid-pie-charts"><i class="fa fa-check"></i><b>4.8.2</b> Must avoid pie charts!</a></li>
+<li class="chapter" data-level="4.8.3" data-path="4-viz.html"><a href="4-viz.html#using-barplots-to-compare-two-variables"><i class="fa fa-check"></i><b>4.8.3</b> Using barplots to compare two variables</a></li>
+<li class="chapter" data-level="4.8.4" data-path="4-viz.html"><a href="4-viz.html#summary-4"><i class="fa fa-check"></i><b>4.8.4</b> Summary</a></li>
+</ul></li>
+<li class="chapter" data-level="4.9" data-path="4-viz.html"><a href="4-viz.html#conclusion"><i class="fa fa-check"></i><b>4.9</b> Conclusion</a><ul>
+<li class="chapter" data-level="4.9.1" data-path="4-viz.html"><a href="4-viz.html#resources"><i class="fa fa-check"></i><b>4.9.1</b> Resources</a></li>
+<li class="chapter" data-level="4.9.2" data-path="4-viz.html"><a href="4-viz.html#script-of-r-code"><i class="fa fa-check"></i><b>4.9.2</b> Script of R code</a></li>
+<li class="chapter" data-level="4.9.3" data-path="4-viz.html"><a href="4-viz.html#whats-to-come-1"><i class="fa fa-check"></i><b>4.9.3</b> What’s to come?</a></li>
+</ul></li>
+</ul></li>
+<li class="chapter" data-level="5" data-path="5-manip.html"><a href="5-manip.html"><i class="fa fa-check"></i><b>5</b> Data Manipulation via dplyr</a><ul>
+<li class="chapter" data-level="" data-path="5-manip.html"><a href="5-manip.html#needed-packages-2"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="5.1" data-path="5-manip.html"><a href="5-manip.html#the-pipe"><i class="fa fa-check"></i><b>5.1</b> The pipe <code>%&gt;%</code></a></li>
+<li class="chapter" data-level="5.2" data-path="5-manip.html"><a href="5-manip.html#five-main-verbs---the-5mv"><i class="fa fa-check"></i><b>5.2</b> Five Main Verbs - The 5MV</a><ul>
+<li class="chapter" data-level="5.2.1" data-path="5-manip.html"><a href="5-manip.html#filter"><i class="fa fa-check"></i><b>5.2.1</b> 5MV#1: Filter observations using filter</a></li>
+<li class="chapter" data-level="5.2.2" data-path="5-manip.html"><a href="5-manip.html#mv2-summarize-variables-using-summarize"><i class="fa fa-check"></i><b>5.2.2</b> 5MV#2: Summarize variables using summarize</a></li>
+<li class="chapter" data-level="5.2.3" data-path="5-manip.html"><a href="5-manip.html#mv3-group-rows-using-group_by"><i class="fa fa-check"></i><b>5.2.3</b> 5MV#3: Group rows using group_by</a></li>
+<li class="chapter" data-level="5.2.4" data-path="5-manip.html"><a href="5-manip.html#mv4-create-new-variableschange-old-variables-using-mutate"><i class="fa fa-check"></i><b>5.2.4</b> 5MV#4: Create new variables/change old variables using mutate</a></li>
+<li class="chapter" data-level="5.2.5" data-path="5-manip.html"><a href="5-manip.html#arrange"><i class="fa fa-check"></i><b>5.2.5</b> 5MV#5: Reorder the data frame using arrange</a></li>
+</ul></li>
+<li class="chapter" data-level="5.3" data-path="5-manip.html"><a href="5-manip.html#joining-data-frames"><i class="fa fa-check"></i><b>5.3</b> Joining data frames</a><ul>
+<li class="chapter" data-level="5.3.1" data-path="5-manip.html"><a href="5-manip.html#joining-by-key-variables"><i class="fa fa-check"></i><b>5.3.1</b> Joining by Key Variables</a></li>
+<li class="chapter" data-level="5.3.2" data-path="5-manip.html"><a href="5-manip.html#joining-by-key-variables-with-different-names"><i class="fa fa-check"></i><b>5.3.2</b> Joining by Key Variables with Different Names</a></li>
+</ul></li>
+<li class="chapter" data-level="5.4" data-path="5-manip.html"><a href="5-manip.html#optional-other-verbs"><i class="fa fa-check"></i><b>5.4</b> Optional: Other verbs</a><ul>
+<li class="chapter" data-level="5.4.1" data-path="5-manip.html"><a href="5-manip.html#select"><i class="fa fa-check"></i><b>5.4.1</b> Select variables using select</a></li>
+<li class="chapter" data-level="5.4.2" data-path="5-manip.html"><a href="5-manip.html#rename"><i class="fa fa-check"></i><b>5.4.2</b> Rename variables using rename</a></li>
+<li class="chapter" data-level="5.4.3" data-path="5-manip.html"><a href="5-manip.html#find-the-top-number-of-values-using-top_n"><i class="fa fa-check"></i><b>5.4.3</b> Find the top number of values using top_n</a></li>
+</ul></li>
+<li class="chapter" data-level="5.5" data-path="5-manip.html"><a href="5-manip.html#conclusion-1"><i class="fa fa-check"></i><b>5.5</b> Conclusion</a><ul>
+<li class="chapter" data-level="5.5.1" data-path="5-manip.html"><a href="5-manip.html#resources-1"><i class="fa fa-check"></i><b>5.5.1</b> Resources</a></li>
+<li class="chapter" data-level="5.5.2" data-path="5-manip.html"><a href="5-manip.html#script-of-r-code-1"><i class="fa fa-check"></i><b>5.5.2</b> Script of R code</a></li>
+<li class="chapter" data-level="5.5.3" data-path="5-manip.html"><a href="5-manip.html#whats-to-come-2"><i class="fa fa-check"></i><b>5.5.3</b> What’s to come?</a></li>
+</ul></li>
 </ul></li>
 <li class="part"><span><b>II Inference</b></span></li>
-<li class="chapter" data-level="6" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html"><i class="fa fa-check"></i><b>6</b> Simulating Randomness via <code id="sim">mosaic</code></a><ul>
-<li class="chapter" data-level="" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#needed-packages-2"><i class="fa fa-check"></i>Needed packages</a></li>
-<li class="chapter" data-level="6.1" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#random-sampling"><i class="fa fa-check"></i><b>6.1</b> Random sampling</a><ul>
-<li class="chapter" data-level="6.1.1" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#tasting-soup"><i class="fa fa-check"></i><b>6.1.1</b> Tasting soup</a></li>
-<li class="chapter" data-level="6.1.2" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#common-terms"><i class="fa fa-check"></i><b>6.1.2</b> Common terms</a></li>
+<li class="chapter" data-level="6" data-path="6-sim.html"><a href="6-sim.html"><i class="fa fa-check"></i><b>6</b> Simulating Randomness via mosaic</a><ul>
+<li class="chapter" data-level="" data-path="6-sim.html"><a href="6-sim.html#needed-packages-3"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="6.1" data-path="6-sim.html"><a href="6-sim.html#random-sampling"><i class="fa fa-check"></i><b>6.1</b> Random sampling</a><ul>
+<li class="chapter" data-level="6.1.1" data-path="6-sim.html"><a href="6-sim.html#tasting-soup"><i class="fa fa-check"></i><b>6.1.1</b> Tasting soup</a></li>
+<li class="chapter" data-level="6.1.2" data-path="6-sim.html"><a href="6-sim.html#common-terms"><i class="fa fa-check"></i><b>6.1.2</b> Common terms</a></li>
+</ul></li>
+<li class="chapter" data-level="6.2" data-path="6-sim.html"><a href="6-sim.html#visualizing-sampling"><i class="fa fa-check"></i><b>6.2</b> Visualizing sampling</a><ul>
+<li class="chapter" data-level="6.2.1" data-path="6-sim.html"><a href="6-sim.html#sampling-distribution"><i class="fa fa-check"></i><b>6.2.1</b> Sampling distribution</a></li>
+<li class="chapter" data-level="6.2.2" data-path="6-sim.html"><a href="6-sim.html#repeated-sampling-via-do"><i class="fa fa-check"></i><b>6.2.2</b> Repeated sampling via <code>do</code></a></li>
 </ul></li>
-<li class="chapter" data-level="6.2" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#visualizing-sampling"><i class="fa fa-check"></i><b>6.2</b> Visualizing sampling</a><ul>
-<li class="chapter" data-level="6.2.1" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#sampling-distribution"><i class="fa fa-check"></i><b>6.2.1</b> Sampling distribution</a></li>
-<li class="chapter" data-level="6.2.2" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#repeated-sampling-via-do"><i class="fa fa-check"></i><b>6.2.2</b> Repeated sampling via <code>do</code></a></li>
+<li class="chapter" data-level="6.3" data-path="6-sim.html"><a href="6-sim.html#simulation"><i class="fa fa-check"></i><b>6.3</b> Simulation</a></li>
+<li class="chapter" data-level="6.4" data-path="6-sim.html"><a href="6-sim.html#review-of-mosaic-simulation-functions"><i class="fa fa-check"></i><b>6.4</b> Review of <code>mosaic</code> simulation functions</a></li>
+<li class="chapter" data-level="6.5" data-path="6-sim.html"><a href="6-sim.html#conclusion-2"><i class="fa fa-check"></i><b>6.5</b> Conclusion</a><ul>
+<li class="chapter" data-level="6.5.1" data-path="6-sim.html"><a href="6-sim.html#script-of-r-code-2"><i class="fa fa-check"></i><b>6.5.1</b> Script of R code</a></li>
+<li class="chapter" data-level="6.5.2" data-path="6-sim.html"><a href="6-sim.html#whats-to-come-3"><i class="fa fa-check"></i><b>6.5.2</b> What’s to come?</a></li>
 </ul></li>
-<li class="chapter" data-level="6.3" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#simulation"><i class="fa fa-check"></i><b>6.3</b> Simulation</a></li>
-<li class="chapter" data-level="6.4" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#review-of-mosaic-simulation-functions"><i class="fa fa-check"></i><b>6.4</b> Review of <code>mosaic</code> simulation functions</a></li>
-<li class="chapter" data-level="6.5" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#script-of-r-code-2"><i class="fa fa-check"></i><b>6.5</b> Script of R code</a></li>
-<li class="chapter" data-level="6.6" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#whats-to-come-3"><i class="fa fa-check"></i><b>6.6</b> What’s to come?</a></li>
 </ul></li>
 <li class="chapter" data-level="7" data-path="7-hypo.html"><a href="7-hypo.html"><i class="fa fa-check"></i><b>7</b> Hypothesis Testing</a><ul>
-<li class="chapter" data-level="" data-path="7-hypo.html"><a href="7-hypo.html#needed-packages-3"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="" data-path="7-hypo.html"><a href="7-hypo.html#needed-packages-4"><i class="fa fa-check"></i>Needed packages</a></li>
 <li class="chapter" data-level="7.1" data-path="7-hypo.html"><a href="7-hypo.html#when-inference-is-not-needed"><i class="fa fa-check"></i><b>7.1</b> When Inference Is Not Needed</a></li>
 <li class="chapter" data-level="7.2" data-path="7-hypo.html"><a href="7-hypo.html#basics-of-hypothesis-testing"><i class="fa fa-check"></i><b>7.2</b> Basics of Hypothesis Testing</a></li>
 <li class="chapter" data-level="7.3" data-path="7-hypo.html"><a href="7-hypo.html#trial"><i class="fa fa-check"></i><b>7.3</b> Criminal trial analogy</a><ul>
@@ -249,44 +262,50 @@
 <li class="chapter" data-level="7.8.1" data-path="7-hypo.html"><a href="7-hypo.html#example-t-test-for-two-independent-samples"><i class="fa fa-check"></i><b>7.8.1</b> EXAMPLE: <span class="math inline">\(t\)</span>-test for two independent samples</a></li>
 <li class="chapter" data-level="7.8.2" data-path="7-hypo.html"><a href="7-hypo.html#conditions-for-t-test"><i class="fa fa-check"></i><b>7.8.2</b> Conditions for t-test</a></li>
 </ul></li>
-<li class="chapter" data-level="7.9" data-path="7-hypo.html"><a href="7-hypo.html#whats-to-come-4"><i class="fa fa-check"></i><b>7.9</b> What’s to come?</a></li>
+<li class="chapter" data-level="7.9" data-path="7-hypo.html"><a href="7-hypo.html#conclusion-3"><i class="fa fa-check"></i><b>7.9</b> Conclusion</a><ul>
+<li class="chapter" data-level="7.9.1" data-path="7-hypo.html"><a href="7-hypo.html#script-of-r-code-3"><i class="fa fa-check"></i><b>7.9.1</b> Script of R code</a></li>
+<li class="chapter" data-level="7.9.2" data-path="7-hypo.html"><a href="7-hypo.html#whats-to-come-4"><i class="fa fa-check"></i><b>7.9.2</b> What’s to come?</a></li>
+</ul></li>
 </ul></li>
 <li class="chapter" data-level="8" data-path="8-ci.html"><a href="8-ci.html"><i class="fa fa-check"></i><b>8</b> Confidence Intervals</a><ul>
-<li class="chapter" data-level="" data-path="8-ci.html"><a href="8-ci.html#needed-packages-4"><i class="fa fa-check"></i>Needed packages</a></li>
-<li class="chapter" data-level="8.1" data-path="8-ci.html"><a href="8-ci.html#bootstrapping"><i class="fa fa-check"></i><b>8.1</b> Bootstrapping</a></li>
-<li class="chapter" data-level="8.2" data-path="8-ci.html"><a href="8-ci.html#mean_rating"><i class="fa fa-check"></i><b>8.2</b> mean_rating</a><ul>
-<li class="chapter" data-level="8.2.1" data-path="8-ci.html"><a href="8-ci.html#review-of-bootstrapping"><i class="fa fa-check"></i><b>8.2.1</b> Review of Bootstrapping</a></li>
-</ul></li>
-<li class="chapter" data-level="8.3" data-path="8-ci.html"><a href="8-ci.html#relation-to-hypothesis-testing"><i class="fa fa-check"></i><b>8.3</b> Relation to hypothesis testing</a></li>
-<li class="chapter" data-level="8.4" data-path="8-ci.html"><a href="8-ci.html#effect-size"><i class="fa fa-check"></i><b>8.4</b> Effect size</a></li>
-<li class="chapter" data-level="8.5" data-path="8-ci.html"><a href="8-ci.html#script-of-r-code-3"><i class="fa fa-check"></i><b>8.5</b> Script of R code</a></li>
-<li class="chapter" data-level="8.6" data-path="8-ci.html"><a href="8-ci.html#whats-to-come-5"><i class="fa fa-check"></i><b>8.6</b> What’s to come?</a></li>
-</ul></li>
-<li class="chapter" data-level="9" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html"><i class="fa fa-check"></i><b>9</b> Regression via <code id="regress">broom</code></a><ul>
-<li class="chapter" data-level="" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#needed-packages-5"><i class="fa fa-check"></i>Needed packages</a></li>
-<li class="chapter" data-level="9.1" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#example-alaskan-airlines-delays"><i class="fa fa-check"></i><b>9.1</b> EXAMPLE: Alaskan Airlines delays</a></li>
-<li class="chapter" data-level="9.2" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#correlation"><i class="fa fa-check"></i><b>9.2</b> Correlation</a><ul>
-<li class="chapter" data-level="9.2.1" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#correlation-does-not-imply-causation"><i class="fa fa-check"></i><b>9.2.1</b> Correlation does not imply causation</a></li>
-</ul></li>
-<li class="chapter" data-level="9.3" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#linear-regression"><i class="fa fa-check"></i><b>9.3</b> Linear regression</a><ul>
-<li class="chapter" data-level="9.3.1" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#understanding-linear-regression-basics"><i class="fa fa-check"></i><b>9.3.1</b> Understanding linear regression basics</a></li>
-<li class="chapter" data-level="9.3.2" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#the-equation-of-the-line"><i class="fa fa-check"></i><b>9.3.2</b> The equation of the line</a></li>
-<li class="chapter" data-level="9.3.3" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#interpretting-the-slope"><i class="fa fa-check"></i><b>9.3.3</b> Interpretting the slope</a></li>
-<li class="chapter" data-level="9.3.4" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#predicting-values"><i class="fa fa-check"></i><b>9.3.4</b> Predicting values</a></li>
-</ul></li>
-<li class="chapter" data-level="9.4" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#inference-for-regression"><i class="fa fa-check"></i><b>9.4</b> Inference for regression</a><ul>
-<li class="chapter" data-level="9.4.1" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#data-2"><i class="fa fa-check"></i><b>9.4.1</b> Data</a></li>
-<li class="chapter" data-level="9.4.2" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#test-statistic-delta-2"><i class="fa fa-check"></i><b>9.4.2</b> Test Statistic <span class="math inline">\(\delta\)</span></a></li>
-<li class="chapter" data-level="9.4.3" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#observed-effect-delta-2"><i class="fa fa-check"></i><b>9.4.3</b> Observed effect <span class="math inline">\(\delta^*\)</span></a></li>
-<li class="chapter" data-level="9.4.4" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#model-of-h_0-2"><i class="fa fa-check"></i><b>9.4.4</b> Model of <span class="math inline">\(H_0\)</span></a></li>
-<li class="chapter" data-level="9.4.5" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#simulated-data-2"><i class="fa fa-check"></i><b>9.4.5</b> Simulated Data</a></li>
-<li class="chapter" data-level="9.4.6" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#distribution-of-delta-under-h_0-2"><i class="fa fa-check"></i><b>9.4.6</b> Distribution of <span class="math inline">\(\delta\)</span> under <span class="math inline">\(H_0\)</span></a></li>
-<li class="chapter" data-level="9.4.7" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#the-p-value-2"><i class="fa fa-check"></i><b>9.4.7</b> The p-value</a></li>
-</ul></li>
-<li class="chapter" data-level="9.5" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#resid"><i class="fa fa-check"></i><b>9.5</b> Residual analysis</a></li>
-<li class="chapter" data-level="9.6" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#conditions-for-regression"><i class="fa fa-check"></i><b>9.6</b> Conditions for regression</a></li>
-<li class="chapter" data-level="9.7" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#script-of-r-code-4"><i class="fa fa-check"></i><b>9.7</b> Script of R code</a></li>
-<li class="chapter" data-level="9.8" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#whats-to-come-6"><i class="fa fa-check"></i><b>9.8</b> What’s to come?</a></li>
+<li class="chapter" data-level="" data-path="8-ci.html"><a href="8-ci.html#needed-packages-5"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="8.1" data-path="8-ci.html"><a href="8-ci.html#bootstrapping"><i class="fa fa-check"></i><b>8.1</b> Bootstrapping</a><ul>
+<li class="chapter" data-level="8.1.1" data-path="8-ci.html"><a href="8-ci.html#review-of-bootstrapping"><i class="fa fa-check"></i><b>8.1.1</b> Review of Bootstrapping</a></li>
+</ul></li>
+<li class="chapter" data-level="8.2" data-path="8-ci.html"><a href="8-ci.html#relation-to-hypothesis-testing"><i class="fa fa-check"></i><b>8.2</b> Relation to hypothesis testing</a></li>
+<li class="chapter" data-level="8.3" data-path="8-ci.html"><a href="8-ci.html#effect-size"><i class="fa fa-check"></i><b>8.3</b> Effect size</a></li>
+<li class="chapter" data-level="8.4" data-path="8-ci.html"><a href="8-ci.html#conclusion-4"><i class="fa fa-check"></i><b>8.4</b> Conclusion</a><ul>
+<li class="chapter" data-level="8.4.1" data-path="8-ci.html"><a href="8-ci.html#script-of-r-code-4"><i class="fa fa-check"></i><b>8.4.1</b> Script of R code</a></li>
+<li class="chapter" data-level="8.4.2" data-path="8-ci.html"><a href="8-ci.html#whats-to-come-5"><i class="fa fa-check"></i><b>8.4.2</b> What’s to come?</a></li>
+</ul></li>
+</ul></li>
+<li class="chapter" data-level="9" data-path="9-regress.html"><a href="9-regress.html"><i class="fa fa-check"></i><b>9</b> Regression via broom</a><ul>
+<li class="chapter" data-level="" data-path="9-regress.html"><a href="9-regress.html#needed-packages-6"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="9.1" data-path="9-regress.html"><a href="9-regress.html#example-alaskan-airlines-delays"><i class="fa fa-check"></i><b>9.1</b> EXAMPLE: Alaskan Airlines delays</a></li>
+<li class="chapter" data-level="9.2" data-path="9-regress.html"><a href="9-regress.html#correlation"><i class="fa fa-check"></i><b>9.2</b> Correlation</a><ul>
+<li class="chapter" data-level="9.2.1" data-path="9-regress.html"><a href="9-regress.html#correlation-does-not-imply-causation"><i class="fa fa-check"></i><b>9.2.1</b> Correlation does not imply causation</a></li>
+</ul></li>
+<li class="chapter" data-level="9.3" data-path="9-regress.html"><a href="9-regress.html#linear-regression"><i class="fa fa-check"></i><b>9.3</b> Linear regression</a><ul>
+<li class="chapter" data-level="9.3.1" data-path="9-regress.html"><a href="9-regress.html#understanding-linear-regression-basics"><i class="fa fa-check"></i><b>9.3.1</b> Understanding linear regression basics</a></li>
+<li class="chapter" data-level="9.3.2" data-path="9-regress.html"><a href="9-regress.html#the-equation-of-the-line"><i class="fa fa-check"></i><b>9.3.2</b> The equation of the line</a></li>
+<li class="chapter" data-level="9.3.3" data-path="9-regress.html"><a href="9-regress.html#interpreting-the-slope"><i class="fa fa-check"></i><b>9.3.3</b> Interpreting the slope</a></li>
+<li class="chapter" data-level="9.3.4" data-path="9-regress.html"><a href="9-regress.html#predicting-values"><i class="fa fa-check"></i><b>9.3.4</b> Predicting values</a></li>
+</ul></li>
+<li class="chapter" data-level="9.4" data-path="9-regress.html"><a href="9-regress.html#inference-for-regression"><i class="fa fa-check"></i><b>9.4</b> Inference for regression</a><ul>
+<li class="chapter" data-level="9.4.1" data-path="9-regress.html"><a href="9-regress.html#data-2"><i class="fa fa-check"></i><b>9.4.1</b> Data</a></li>
+<li class="chapter" data-level="9.4.2" data-path="9-regress.html"><a href="9-regress.html#test-statistic-delta-2"><i class="fa fa-check"></i><b>9.4.2</b> Test Statistic <span class="math inline">\(\delta\)</span></a></li>
+<li class="chapter" data-level="9.4.3" data-path="9-regress.html"><a href="9-regress.html#observed-effect-delta-2"><i class="fa fa-check"></i><b>9.4.3</b> Observed effect <span class="math inline">\(\delta^*\)</span></a></li>
+<li class="chapter" data-level="9.4.4" data-path="9-regress.html"><a href="9-regress.html#model-of-h_0-2"><i class="fa fa-check"></i><b>9.4.4</b> Model of <span class="math inline">\(H_0\)</span></a></li>
+<li class="chapter" data-level="9.4.5" data-path="9-regress.html"><a href="9-regress.html#simulated-data-2"><i class="fa fa-check"></i><b>9.4.5</b> Simulated Data</a></li>
+<li class="chapter" data-level="9.4.6" data-path="9-regress.html"><a href="9-regress.html#distribution-of-delta-under-h_0-2"><i class="fa fa-check"></i><b>9.4.6</b> Distribution of <span class="math inline">\(\delta\)</span> under <span class="math inline">\(H_0\)</span></a></li>
+<li class="chapter" data-level="9.4.7" data-path="9-regress.html"><a href="9-regress.html#the-p-value-2"><i class="fa fa-check"></i><b>9.4.7</b> The p-value</a></li>
+</ul></li>
+<li class="chapter" data-level="9.5" data-path="9-regress.html"><a href="9-regress.html#resid"><i class="fa fa-check"></i><b>9.5</b> Residual analysis</a></li>
+<li class="chapter" data-level="9.6" data-path="9-regress.html"><a href="9-regress.html#conditions-for-regression"><i class="fa fa-check"></i><b>9.6</b> Conditions for regression</a></li>
+<li class="chapter" data-level="9.7" data-path="9-regress.html"><a href="9-regress.html#conclusion-5"><i class="fa fa-check"></i><b>9.7</b> Conclusion</a><ul>
+<li class="chapter" data-level="9.7.1" data-path="9-regress.html"><a href="9-regress.html#script-of-r-code-5"><i class="fa fa-check"></i><b>9.7.1</b> Script of R code</a></li>
+<li class="chapter" data-level="9.7.2" data-path="9-regress.html"><a href="9-regress.html#whats-to-come-6"><i class="fa fa-check"></i><b>9.7.2</b> What’s to come?</a></li>
+</ul></li>
 </ul></li>
 <li class="part"><span><b>III Conclusion</b></span></li>
 <li class="chapter" data-level="10" data-path="10-effective-data-storytelling.html"><a href="10-effective-data-storytelling.html"><i class="fa fa-check"></i><b>10</b> Effective Data Storytelling</a><ul>
@@ -304,7 +323,8 @@
 </ul></li>
 </ul></li>
 <li class="chapter" data-level="B" data-path="B-appendixB.html"><a href="B-appendixB.html"><i class="fa fa-check"></i><b>B</b> Inference Examples</a><ul>
-<li class="chapter" data-level="B.1" data-path="B-appendixB.html"><a href="B-appendixB.html#needed-packages-6"><i class="fa fa-check"></i><b>B.1</b> Needed packages</a></li>
+<li class="chapter" data-level="" data-path="B-appendixB.html"><a href="B-appendixB.html#needed-packages-7"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="B.1" data-path="B-appendixB.html"><a href="B-appendixB.html#inference-mind-map"><i class="fa fa-check"></i><b>B.1</b> Inference Mind Map</a></li>
 <li class="chapter" data-level="B.2" data-path="B-appendixB.html"><a href="B-appendixB.html#one-mean"><i class="fa fa-check"></i><b>B.2</b> One Mean</a><ul>
 <li class="chapter" data-level="B.2.1" data-path="B-appendixB.html"><a href="B-appendixB.html#problem-statement"><i class="fa fa-check"></i><b>B.2.1</b> Problem Statement</a></li>
 <li class="chapter" data-level="B.2.2" data-path="B-appendixB.html"><a href="B-appendixB.html#competing-hypotheses"><i class="fa fa-check"></i><b>B.2.2</b> Competing Hypotheses</a></li>
@@ -352,6 +372,7 @@
 </ul></li>
 </ul></li>
 <li class="chapter" data-level="C" data-path="C-appendixC.html"><a href="C-appendixC.html"><i class="fa fa-check"></i><b>C</b> Reach for the Starts</a><ul>
+<li class="chapter" data-level="" data-path="C-appendixC.html"><a href="C-appendixC.html#needed-packages-8"><i class="fa fa-check"></i>Needed packages</a></li>
 <li class="chapter" data-level="C.1" data-path="C-appendixC.html"><a href="C-appendixC.html#sorted-barplots"><i class="fa fa-check"></i><b>C.1</b> Sorted barplots</a></li>
 <li class="chapter" data-level="C.2" data-path="C-appendixC.html"><a href="C-appendixC.html#interactive-graphics"><i class="fa fa-check"></i><b>C.2</b> Interactive graphics</a><ul>
 <li class="chapter" data-level="C.2.1" data-path="C-appendixC.html"><a href="C-appendixC.html#interactive-line-graphs"><i class="fa fa-check"></i><b>C.2.1</b> Interactive line-graphs</a></li>
diff --git a/docs/B-appendixB.html b/docs/B-appendixB.html
index b5caaa479..f96c30831 100644
--- a/docs/B-appendixB.html
+++ b/docs/B-appendixB.html
@@ -26,7 +26,7 @@
 <meta name="author" content="Chester Ismay and Albert Y. Kim">
 
 
-<meta name="date" content="2017-01-07">
+<meta name="date" content="2017-01-10">
 
   <meta name="viewport" content="width=device-width, initial-scale=1">
   <meta name="apple-mobile-web-app-capable" content="yes">
@@ -119,9 +119,9 @@
 
 <ul class="summary">
 <li class="chapter" data-level="1" data-path="index.html"><a href="index.html"><i class="fa fa-check"></i><b>1</b> Preamble</a><ul>
-<li class="chapter" data-level="1.1" data-path="index.html"><a href="index.html#principles-of-this-book"><i class="fa fa-check"></i><b>1.1</b> Principles of this Book</a></li>
+<li class="chapter" data-level="1.1" data-path="index.html"><a href="index.html#principles-of-this-book---for-instructors"><i class="fa fa-check"></i><b>1.1</b> Principles of this Book - For Instructors</a></li>
 <li class="chapter" data-level="1.2" data-path="index.html"><a href="index.html#contribute"><i class="fa fa-check"></i><b>1.2</b> Contribute</a></li>
-<li class="chapter" data-level="1.3" data-path="index.html"><a href="index.html#getting-started"><i class="fa fa-check"></i><b>1.3</b> Getting Started</a></li>
+<li class="chapter" data-level="1.3" data-path="index.html"><a href="index.html#getting-started---for-students"><i class="fa fa-check"></i><b>1.3</b> Getting Started - For Students</a></li>
 <li class="chapter" data-level="" data-path="index.html"><a href="index.html#colophon"><i class="fa fa-check"></i>Colophon</a></li>
 </ul></li>
 <li class="chapter" data-level="2" data-path="2-intro.html"><a href="2-intro.html"><i class="fa fa-check"></i><b>2</b> Introduction</a><ul>
@@ -133,87 +133,100 @@
 </ul></li>
 <li class="part"><span><b>I Data Exploration</b></span></li>
 <li class="chapter" data-level="3" data-path="3-tidy.html"><a href="3-tidy.html"><i class="fa fa-check"></i><b>3</b> Tidy Data</a><ul>
+<li class="chapter" data-level="" data-path="3-tidy.html"><a href="3-tidy.html#needed-packages"><i class="fa fa-check"></i>Needed packages</a></li>
 <li class="chapter" data-level="3.1" data-path="3-tidy.html"><a href="3-tidy.html#what-is-tidy-data"><i class="fa fa-check"></i><b>3.1</b> What is tidy data?</a></li>
-<li class="chapter" data-level="3.2" data-path="3-tidy.html"><a href="3-tidy.html#the-nycflights13-datasets"><i class="fa fa-check"></i><b>3.2</b> The <code>nycflights13</code> datasets</a></li>
-<li class="chapter" data-level="3.3" data-path="3-tidy.html"><a href="3-tidy.html#how-is-flights-tidy"><i class="fa fa-check"></i><b>3.3</b> How is <code>flights</code> tidy?</a></li>
+<li class="chapter" data-level="3.2" data-path="3-tidy.html"><a href="3-tidy.html#datasets-in-the-nycflights13-package"><i class="fa fa-check"></i><b>3.2</b> Datasets in the <code>nycflights13</code> package</a></li>
+<li class="chapter" data-level="3.3" data-path="3-tidy.html"><a href="3-tidy.html#how-is-flights-tidy"><i class="fa fa-check"></i><b>3.3</b> How is <code>flights</code> tidy?</a><ul>
+<li class="chapter" data-level="3.3.1" data-path="3-tidy.html"><a href="3-tidy.html#identification-variables"><i class="fa fa-check"></i><b>3.3.1</b> Identification variables</a></li>
+</ul></li>
 <li class="chapter" data-level="3.4" data-path="3-tidy.html"><a href="3-tidy.html#normal-forms-of-data"><i class="fa fa-check"></i><b>3.4</b> Normal forms of data</a></li>
 <li class="chapter" data-level="3.5" data-path="3-tidy.html"><a href="3-tidy.html#whats-to-come"><i class="fa fa-check"></i><b>3.5</b> What’s to come?</a></li>
 </ul></li>
-<li class="chapter" data-level="4" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html"><i class="fa fa-check"></i><b>4</b> Data Visualization via <code id="viz">ggplot2</code></a><ul>
-<li class="chapter" data-level="" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#needed-packages"><i class="fa fa-check"></i>Needed packages</a></li>
-<li class="chapter" data-level="4.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#grammarofgraphics"><i class="fa fa-check"></i><b>4.1</b> The Grammar of Graphics</a><ul>
-<li class="chapter" data-level="4.1.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#components-of-grammar"><i class="fa fa-check"></i><b>4.1.1</b> Components of Grammar</a></li>
-<li class="chapter" data-level="4.1.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#napoleans-march-on-moscow"><i class="fa fa-check"></i><b>4.1.2</b> Napolean’s March on Moscow</a></li>
-<li class="chapter" data-level="4.1.3" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#other-components-of-the-grammar"><i class="fa fa-check"></i><b>4.1.3</b> Other Components of the Grammar</a></li>
-<li class="chapter" data-level="4.1.4" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#the-ggplot2-package"><i class="fa fa-check"></i><b>4.1.4</b> The <code>ggplot2</code> Package</a></li>
+<li class="chapter" data-level="4" data-path="4-viz.html"><a href="4-viz.html"><i class="fa fa-check"></i><b>4</b> Data Visualization via ggplot2</a><ul>
+<li class="chapter" data-level="" data-path="4-viz.html"><a href="4-viz.html#needed-packages-1"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="4.1" data-path="4-viz.html"><a href="4-viz.html#grammarofgraphics"><i class="fa fa-check"></i><b>4.1</b> The Grammar of Graphics</a><ul>
+<li class="chapter" data-level="4.1.1" data-path="4-viz.html"><a href="4-viz.html#components-of-grammar"><i class="fa fa-check"></i><b>4.1.1</b> Components of Grammar</a></li>
+<li class="chapter" data-level="4.1.2" data-path="4-viz.html"><a href="4-viz.html#napoleans-march-on-moscow"><i class="fa fa-check"></i><b>4.1.2</b> Napolean’s March on Moscow</a></li>
+<li class="chapter" data-level="4.1.3" data-path="4-viz.html"><a href="4-viz.html#other-components-of-the-grammar"><i class="fa fa-check"></i><b>4.1.3</b> Other Components of the Grammar</a></li>
+<li class="chapter" data-level="4.1.4" data-path="4-viz.html"><a href="4-viz.html#the-ggplot2-package"><i class="fa fa-check"></i><b>4.1.4</b> The ggplot2 Package</a></li>
+</ul></li>
+<li class="chapter" data-level="4.2" data-path="4-viz.html"><a href="4-viz.html#five-named-graphs---the-5ng"><i class="fa fa-check"></i><b>4.2</b> Five Named Graphs - The 5NG</a></li>
+<li class="chapter" data-level="4.3" data-path="4-viz.html"><a href="4-viz.html#scatterplots"><i class="fa fa-check"></i><b>4.3</b> 5NG#1: Scatter-plots</a><ul>
+<li class="chapter" data-level="4.3.1" data-path="4-viz.html"><a href="4-viz.html#geompoint"><i class="fa fa-check"></i><b>4.3.1</b> Scatter-plots via geom_point</a></li>
+<li class="chapter" data-level="4.3.2" data-path="4-viz.html"><a href="4-viz.html#over-plotting"><i class="fa fa-check"></i><b>4.3.2</b> Over-Plotting</a></li>
+<li class="chapter" data-level="4.3.3" data-path="4-viz.html"><a href="4-viz.html#summary"><i class="fa fa-check"></i><b>4.3.3</b> Summary</a></li>
 </ul></li>
-<li class="chapter" data-level="4.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#five-named-graphs---the-5ng"><i class="fa fa-check"></i><b>4.2</b> Five Named Graphs - The 5NG</a></li>
-<li class="chapter" data-level="4.3" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#scatterplots"><i class="fa fa-check"></i><b>4.3</b> 5NG#1: Scatter-plots</a><ul>
-<li class="chapter" data-level="4.3.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#scatter-plots-via-geom_point"><i class="fa fa-check"></i><b>4.3.1</b> Scatter-plots via <code id="geompoint">geom_point</code></a></li>
-<li class="chapter" data-level="4.3.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#over-plotting"><i class="fa fa-check"></i><b>4.3.2</b> Over-Plotting</a></li>
-<li class="chapter" data-level="4.3.3" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#summary"><i class="fa fa-check"></i><b>4.3.3</b> Summary</a></li>
+<li class="chapter" data-level="4.4" data-path="4-viz.html"><a href="4-viz.html#linegraphs"><i class="fa fa-check"></i><b>4.4</b> 5NG#2: Line-graphs</a><ul>
+<li class="chapter" data-level="4.4.1" data-path="4-viz.html"><a href="4-viz.html#geomline"><i class="fa fa-check"></i><b>4.4.1</b> Line-graphs via geom_line</a></li>
+<li class="chapter" data-level="4.4.2" data-path="4-viz.html"><a href="4-viz.html#summary-1"><i class="fa fa-check"></i><b>4.4.2</b> Summary</a></li>
 </ul></li>
-<li class="chapter" data-level="4.4" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#linegraphs"><i class="fa fa-check"></i><b>4.4</b> 5NG#2: Line-graphs</a><ul>
-<li class="chapter" data-level="4.4.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#line-graphs-via-geom_line"><i class="fa fa-check"></i><b>4.4.1</b> Line-graphs via <code id="geomline">geom_line</code></a></li>
-<li class="chapter" data-level="4.4.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#summary-1"><i class="fa fa-check"></i><b>4.4.2</b> Summary</a></li>
+<li class="chapter" data-level="4.5" data-path="4-viz.html"><a href="4-viz.html#histograms"><i class="fa fa-check"></i><b>4.5</b> 5NG#3: Histograms</a><ul>
+<li class="chapter" data-level="4.5.1" data-path="4-viz.html"><a href="4-viz.html#geomhistogram"><i class="fa fa-check"></i><b>4.5.1</b> Histograms via geom_histogram</a></li>
+<li class="chapter" data-level="4.5.2" data-path="4-viz.html"><a href="4-viz.html#adjustbins"><i class="fa fa-check"></i><b>4.5.2</b> Adjusting the Bins</a></li>
+<li class="chapter" data-level="4.5.3" data-path="4-viz.html"><a href="4-viz.html#summary-2"><i class="fa fa-check"></i><b>4.5.3</b> Summary</a></li>
 </ul></li>
-<li class="chapter" data-level="4.5" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#histograms"><i class="fa fa-check"></i><b>4.5</b> 5NG#3: Histograms</a><ul>
-<li class="chapter" data-level="4.5.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#histograms-via-geom_histogram"><i class="fa fa-check"></i><b>4.5.1</b> Histograms via <code id="geomhistogram">geom_histogram</code></a></li>
-<li class="chapter" data-level="4.5.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#adjustbins"><i class="fa fa-check"></i><b>4.5.2</b> Adjusting the Bins</a></li>
-<li class="chapter" data-level="4.5.3" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#summary-2"><i class="fa fa-check"></i><b>4.5.3</b> Summary</a></li>
+<li class="chapter" data-level="4.6" data-path="4-viz.html"><a href="4-viz.html#facets"><i class="fa fa-check"></i><b>4.6</b> Facets</a></li>
+<li class="chapter" data-level="4.7" data-path="4-viz.html"><a href="4-viz.html#ng4-boxplots"><i class="fa fa-check"></i><b>4.7</b> 5NG#4: Boxplots</a><ul>
+<li class="chapter" data-level="4.7.1" data-path="4-viz.html"><a href="4-viz.html#geomboxplot"><i class="fa fa-check"></i><b>4.7.1</b> Boxplots via geom_boxplot</a></li>
+<li class="chapter" data-level="4.7.2" data-path="4-viz.html"><a href="4-viz.html#summary-3"><i class="fa fa-check"></i><b>4.7.2</b> Summary</a></li>
 </ul></li>
-<li class="chapter" data-level="4.6" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#facets"><i class="fa fa-check"></i><b>4.6</b> Facets</a></li>
-<li class="chapter" data-level="4.7" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#ng4-boxplots"><i class="fa fa-check"></i><b>4.7</b> 5NG#4: Boxplots</a><ul>
-<li class="chapter" data-level="4.7.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#boxplots-via-geom_boxplot"><i class="fa fa-check"></i><b>4.7.1</b> Boxplots via <code id="geomboxplot">geom_boxplot</code></a></li>
-<li class="chapter" data-level="4.7.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#summary-3"><i class="fa fa-check"></i><b>4.7.2</b> Summary</a></li>
+<li class="chapter" data-level="4.8" data-path="4-viz.html"><a href="4-viz.html#geombar"><i class="fa fa-check"></i><b>4.8</b> 5NG#5: Barplots</a><ul>
+<li class="chapter" data-level="4.8.1" data-path="4-viz.html"><a href="4-viz.html#barplots-via-geom_bar"><i class="fa fa-check"></i><b>4.8.1</b> Barplots via geom_bar</a></li>
+<li class="chapter" data-level="4.8.2" data-path="4-viz.html"><a href="4-viz.html#must-avoid-pie-charts"><i class="fa fa-check"></i><b>4.8.2</b> Must avoid pie charts!</a></li>
+<li class="chapter" data-level="4.8.3" data-path="4-viz.html"><a href="4-viz.html#using-barplots-to-compare-two-variables"><i class="fa fa-check"></i><b>4.8.3</b> Using barplots to compare two variables</a></li>
+<li class="chapter" data-level="4.8.4" data-path="4-viz.html"><a href="4-viz.html#summary-4"><i class="fa fa-check"></i><b>4.8.4</b> Summary</a></li>
 </ul></li>
-<li class="chapter" data-level="4.8" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#geombar"><i class="fa fa-check"></i><b>4.8</b> 5NG#5: Barplots</a><ul>
-<li class="chapter" data-level="4.8.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#barplots-via-geom_bar"><i class="fa fa-check"></i><b>4.8.1</b> Barplots via <code>geom_bar</code></a></li>
-<li class="chapter" data-level="4.8.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#must-avoid-pie-charts"><i class="fa fa-check"></i><b>4.8.2</b> Must avoid pie charts!</a></li>
-<li class="chapter" data-level="4.8.3" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#using-barplots-to-compare-two-variables"><i class="fa fa-check"></i><b>4.8.3</b> Using barplots to compare two variables</a></li>
-<li class="chapter" data-level="4.8.4" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#summary-4"><i class="fa fa-check"></i><b>4.8.4</b> Summary</a></li>
+<li class="chapter" data-level="4.9" data-path="4-viz.html"><a href="4-viz.html#conclusion"><i class="fa fa-check"></i><b>4.9</b> Conclusion</a><ul>
+<li class="chapter" data-level="4.9.1" data-path="4-viz.html"><a href="4-viz.html#resources"><i class="fa fa-check"></i><b>4.9.1</b> Resources</a></li>
+<li class="chapter" data-level="4.9.2" data-path="4-viz.html"><a href="4-viz.html#script-of-r-code"><i class="fa fa-check"></i><b>4.9.2</b> Script of R code</a></li>
+<li class="chapter" data-level="4.9.3" data-path="4-viz.html"><a href="4-viz.html#whats-to-come-1"><i class="fa fa-check"></i><b>4.9.3</b> What’s to come?</a></li>
 </ul></li>
-<li class="chapter" data-level="4.9" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#conclusion"><i class="fa fa-check"></i><b>4.9</b> Conclusion</a><ul>
-<li class="chapter" data-level="4.9.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#whats-to-come-1"><i class="fa fa-check"></i><b>4.9.1</b> What’s to come?</a></li>
-<li class="chapter" data-level="4.9.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#script-of-r-code"><i class="fa fa-check"></i><b>4.9.2</b> Script of R code</a></li>
 </ul></li>
+<li class="chapter" data-level="5" data-path="5-manip.html"><a href="5-manip.html"><i class="fa fa-check"></i><b>5</b> Data Manipulation via dplyr</a><ul>
+<li class="chapter" data-level="" data-path="5-manip.html"><a href="5-manip.html#needed-packages-2"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="5.1" data-path="5-manip.html"><a href="5-manip.html#the-pipe"><i class="fa fa-check"></i><b>5.1</b> The pipe <code>%&gt;%</code></a></li>
+<li class="chapter" data-level="5.2" data-path="5-manip.html"><a href="5-manip.html#five-main-verbs---the-5mv"><i class="fa fa-check"></i><b>5.2</b> Five Main Verbs - The 5MV</a><ul>
+<li class="chapter" data-level="5.2.1" data-path="5-manip.html"><a href="5-manip.html#filter"><i class="fa fa-check"></i><b>5.2.1</b> 5MV#1: Filter observations using filter</a></li>
+<li class="chapter" data-level="5.2.2" data-path="5-manip.html"><a href="5-manip.html#mv2-summarize-variables-using-summarize"><i class="fa fa-check"></i><b>5.2.2</b> 5MV#2: Summarize variables using summarize</a></li>
+<li class="chapter" data-level="5.2.3" data-path="5-manip.html"><a href="5-manip.html#mv3-group-rows-using-group_by"><i class="fa fa-check"></i><b>5.2.3</b> 5MV#3: Group rows using group_by</a></li>
+<li class="chapter" data-level="5.2.4" data-path="5-manip.html"><a href="5-manip.html#mv4-create-new-variableschange-old-variables-using-mutate"><i class="fa fa-check"></i><b>5.2.4</b> 5MV#4: Create new variables/change old variables using mutate</a></li>
+<li class="chapter" data-level="5.2.5" data-path="5-manip.html"><a href="5-manip.html#arrange"><i class="fa fa-check"></i><b>5.2.5</b> 5MV#5: Reorder the data frame using arrange</a></li>
 </ul></li>
-<li class="chapter" data-level="5" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html"><i class="fa fa-check"></i><b>5</b> Data Manipulation via <code id="manip">dplyr</code></a><ul>
-<li class="chapter" data-level="" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#needed-packages-1"><i class="fa fa-check"></i>Needed packages</a></li>
-<li class="chapter" data-level="5.1" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#the-pipe"><i class="fa fa-check"></i><b>5.1</b> The pipe <code>%&gt;%</code></a></li>
-<li class="chapter" data-level="5.2" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#four-main-verbs---the-4mv"><i class="fa fa-check"></i><b>5.2</b> Four Main Verbs - The 4MV</a><ul>
-<li class="chapter" data-level="5.2.1" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#filter-observations-using-filter"><i class="fa fa-check"></i><b>5.2.1</b> Filter observations using <code id="filter">filter</code></a></li>
-<li class="chapter" data-level="5.2.2" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#summarize-variables-using-summarize"><i class="fa fa-check"></i><b>5.2.2</b> Summarize variables using <code>summarize</code></a></li>
-<li class="chapter" data-level="5.2.3" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#create-new-variableschange-old-variables-using-mutate"><i class="fa fa-check"></i><b>5.2.3</b> Create new variables/change old variables using <code>mutate</code></a></li>
-<li class="chapter" data-level="5.2.4" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#reorder-the-data-frame-using-arrange"><i class="fa fa-check"></i><b>5.2.4</b> Reorder the data frame using <code id="arrange">arrange</code></a></li>
+<li class="chapter" data-level="5.3" data-path="5-manip.html"><a href="5-manip.html#joining-data-frames"><i class="fa fa-check"></i><b>5.3</b> Joining data frames</a><ul>
+<li class="chapter" data-level="5.3.1" data-path="5-manip.html"><a href="5-manip.html#joining-by-key-variables"><i class="fa fa-check"></i><b>5.3.1</b> Joining by Key Variables</a></li>
+<li class="chapter" data-level="5.3.2" data-path="5-manip.html"><a href="5-manip.html#joining-by-key-variables-with-different-names"><i class="fa fa-check"></i><b>5.3.2</b> Joining by Key Variables with Different Names</a></li>
 </ul></li>
-<li class="chapter" data-level="5.3" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#other-verbs"><i class="fa fa-check"></i><b>5.3</b> Other verbs</a><ul>
-<li class="chapter" data-level="5.3.1" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#select-variables-using-select"><i class="fa fa-check"></i><b>5.3.1</b> Select variables using <code id="select">select</code></a></li>
-<li class="chapter" data-level="5.3.2" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#rename-variables-using-rename"><i class="fa fa-check"></i><b>5.3.2</b> Rename variables using <code id="rename">rename</code></a></li>
-<li class="chapter" data-level="5.3.3" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#find-the-top-number-of-values-using-top_n"><i class="fa fa-check"></i><b>5.3.3</b> Find the top number of values using <code>top_n</code></a></li>
+<li class="chapter" data-level="5.4" data-path="5-manip.html"><a href="5-manip.html#optional-other-verbs"><i class="fa fa-check"></i><b>5.4</b> Optional: Other verbs</a><ul>
+<li class="chapter" data-level="5.4.1" data-path="5-manip.html"><a href="5-manip.html#select"><i class="fa fa-check"></i><b>5.4.1</b> Select variables using select</a></li>
+<li class="chapter" data-level="5.4.2" data-path="5-manip.html"><a href="5-manip.html#rename"><i class="fa fa-check"></i><b>5.4.2</b> Rename variables using rename</a></li>
+<li class="chapter" data-level="5.4.3" data-path="5-manip.html"><a href="5-manip.html#find-the-top-number-of-values-using-top_n"><i class="fa fa-check"></i><b>5.4.3</b> Find the top number of values using top_n</a></li>
+</ul></li>
+<li class="chapter" data-level="5.5" data-path="5-manip.html"><a href="5-manip.html#conclusion-1"><i class="fa fa-check"></i><b>5.5</b> Conclusion</a><ul>
+<li class="chapter" data-level="5.5.1" data-path="5-manip.html"><a href="5-manip.html#resources-1"><i class="fa fa-check"></i><b>5.5.1</b> Resources</a></li>
+<li class="chapter" data-level="5.5.2" data-path="5-manip.html"><a href="5-manip.html#script-of-r-code-1"><i class="fa fa-check"></i><b>5.5.2</b> Script of R code</a></li>
+<li class="chapter" data-level="5.5.3" data-path="5-manip.html"><a href="5-manip.html#whats-to-come-2"><i class="fa fa-check"></i><b>5.5.3</b> What’s to come?</a></li>
 </ul></li>
-<li class="chapter" data-level="5.4" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#joiningmerging-data-frames"><i class="fa fa-check"></i><b>5.4</b> Joining/merging data frames</a></li>
-<li class="chapter" data-level="5.5" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#script-of-r-code-1"><i class="fa fa-check"></i><b>5.5</b> Script of R code</a></li>
-<li class="chapter" data-level="5.6" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#whats-to-come-2"><i class="fa fa-check"></i><b>5.6</b> What’s to come?</a></li>
 </ul></li>
 <li class="part"><span><b>II Inference</b></span></li>
-<li class="chapter" data-level="6" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html"><i class="fa fa-check"></i><b>6</b> Simulating Randomness via <code id="sim">mosaic</code></a><ul>
-<li class="chapter" data-level="" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#needed-packages-2"><i class="fa fa-check"></i>Needed packages</a></li>
-<li class="chapter" data-level="6.1" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#random-sampling"><i class="fa fa-check"></i><b>6.1</b> Random sampling</a><ul>
-<li class="chapter" data-level="6.1.1" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#tasting-soup"><i class="fa fa-check"></i><b>6.1.1</b> Tasting soup</a></li>
-<li class="chapter" data-level="6.1.2" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#common-terms"><i class="fa fa-check"></i><b>6.1.2</b> Common terms</a></li>
+<li class="chapter" data-level="6" data-path="6-sim.html"><a href="6-sim.html"><i class="fa fa-check"></i><b>6</b> Simulating Randomness via mosaic</a><ul>
+<li class="chapter" data-level="" data-path="6-sim.html"><a href="6-sim.html#needed-packages-3"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="6.1" data-path="6-sim.html"><a href="6-sim.html#random-sampling"><i class="fa fa-check"></i><b>6.1</b> Random sampling</a><ul>
+<li class="chapter" data-level="6.1.1" data-path="6-sim.html"><a href="6-sim.html#tasting-soup"><i class="fa fa-check"></i><b>6.1.1</b> Tasting soup</a></li>
+<li class="chapter" data-level="6.1.2" data-path="6-sim.html"><a href="6-sim.html#common-terms"><i class="fa fa-check"></i><b>6.1.2</b> Common terms</a></li>
+</ul></li>
+<li class="chapter" data-level="6.2" data-path="6-sim.html"><a href="6-sim.html#visualizing-sampling"><i class="fa fa-check"></i><b>6.2</b> Visualizing sampling</a><ul>
+<li class="chapter" data-level="6.2.1" data-path="6-sim.html"><a href="6-sim.html#sampling-distribution"><i class="fa fa-check"></i><b>6.2.1</b> Sampling distribution</a></li>
+<li class="chapter" data-level="6.2.2" data-path="6-sim.html"><a href="6-sim.html#repeated-sampling-via-do"><i class="fa fa-check"></i><b>6.2.2</b> Repeated sampling via <code>do</code></a></li>
 </ul></li>
-<li class="chapter" data-level="6.2" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#visualizing-sampling"><i class="fa fa-check"></i><b>6.2</b> Visualizing sampling</a><ul>
-<li class="chapter" data-level="6.2.1" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#sampling-distribution"><i class="fa fa-check"></i><b>6.2.1</b> Sampling distribution</a></li>
-<li class="chapter" data-level="6.2.2" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#repeated-sampling-via-do"><i class="fa fa-check"></i><b>6.2.2</b> Repeated sampling via <code>do</code></a></li>
+<li class="chapter" data-level="6.3" data-path="6-sim.html"><a href="6-sim.html#simulation"><i class="fa fa-check"></i><b>6.3</b> Simulation</a></li>
+<li class="chapter" data-level="6.4" data-path="6-sim.html"><a href="6-sim.html#review-of-mosaic-simulation-functions"><i class="fa fa-check"></i><b>6.4</b> Review of <code>mosaic</code> simulation functions</a></li>
+<li class="chapter" data-level="6.5" data-path="6-sim.html"><a href="6-sim.html#conclusion-2"><i class="fa fa-check"></i><b>6.5</b> Conclusion</a><ul>
+<li class="chapter" data-level="6.5.1" data-path="6-sim.html"><a href="6-sim.html#script-of-r-code-2"><i class="fa fa-check"></i><b>6.5.1</b> Script of R code</a></li>
+<li class="chapter" data-level="6.5.2" data-path="6-sim.html"><a href="6-sim.html#whats-to-come-3"><i class="fa fa-check"></i><b>6.5.2</b> What’s to come?</a></li>
 </ul></li>
-<li class="chapter" data-level="6.3" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#simulation"><i class="fa fa-check"></i><b>6.3</b> Simulation</a></li>
-<li class="chapter" data-level="6.4" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#review-of-mosaic-simulation-functions"><i class="fa fa-check"></i><b>6.4</b> Review of <code>mosaic</code> simulation functions</a></li>
-<li class="chapter" data-level="6.5" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#script-of-r-code-2"><i class="fa fa-check"></i><b>6.5</b> Script of R code</a></li>
-<li class="chapter" data-level="6.6" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#whats-to-come-3"><i class="fa fa-check"></i><b>6.6</b> What’s to come?</a></li>
 </ul></li>
 <li class="chapter" data-level="7" data-path="7-hypo.html"><a href="7-hypo.html"><i class="fa fa-check"></i><b>7</b> Hypothesis Testing</a><ul>
-<li class="chapter" data-level="" data-path="7-hypo.html"><a href="7-hypo.html#needed-packages-3"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="" data-path="7-hypo.html"><a href="7-hypo.html#needed-packages-4"><i class="fa fa-check"></i>Needed packages</a></li>
 <li class="chapter" data-level="7.1" data-path="7-hypo.html"><a href="7-hypo.html#when-inference-is-not-needed"><i class="fa fa-check"></i><b>7.1</b> When Inference Is Not Needed</a></li>
 <li class="chapter" data-level="7.2" data-path="7-hypo.html"><a href="7-hypo.html#basics-of-hypothesis-testing"><i class="fa fa-check"></i><b>7.2</b> Basics of Hypothesis Testing</a></li>
 <li class="chapter" data-level="7.3" data-path="7-hypo.html"><a href="7-hypo.html#trial"><i class="fa fa-check"></i><b>7.3</b> Criminal trial analogy</a><ul>
@@ -249,44 +262,50 @@
 <li class="chapter" data-level="7.8.1" data-path="7-hypo.html"><a href="7-hypo.html#example-t-test-for-two-independent-samples"><i class="fa fa-check"></i><b>7.8.1</b> EXAMPLE: <span class="math inline">\(t\)</span>-test for two independent samples</a></li>
 <li class="chapter" data-level="7.8.2" data-path="7-hypo.html"><a href="7-hypo.html#conditions-for-t-test"><i class="fa fa-check"></i><b>7.8.2</b> Conditions for t-test</a></li>
 </ul></li>
-<li class="chapter" data-level="7.9" data-path="7-hypo.html"><a href="7-hypo.html#whats-to-come-4"><i class="fa fa-check"></i><b>7.9</b> What’s to come?</a></li>
+<li class="chapter" data-level="7.9" data-path="7-hypo.html"><a href="7-hypo.html#conclusion-3"><i class="fa fa-check"></i><b>7.9</b> Conclusion</a><ul>
+<li class="chapter" data-level="7.9.1" data-path="7-hypo.html"><a href="7-hypo.html#script-of-r-code-3"><i class="fa fa-check"></i><b>7.9.1</b> Script of R code</a></li>
+<li class="chapter" data-level="7.9.2" data-path="7-hypo.html"><a href="7-hypo.html#whats-to-come-4"><i class="fa fa-check"></i><b>7.9.2</b> What’s to come?</a></li>
+</ul></li>
 </ul></li>
 <li class="chapter" data-level="8" data-path="8-ci.html"><a href="8-ci.html"><i class="fa fa-check"></i><b>8</b> Confidence Intervals</a><ul>
-<li class="chapter" data-level="" data-path="8-ci.html"><a href="8-ci.html#needed-packages-4"><i class="fa fa-check"></i>Needed packages</a></li>
-<li class="chapter" data-level="8.1" data-path="8-ci.html"><a href="8-ci.html#bootstrapping"><i class="fa fa-check"></i><b>8.1</b> Bootstrapping</a></li>
-<li class="chapter" data-level="8.2" data-path="8-ci.html"><a href="8-ci.html#mean_rating"><i class="fa fa-check"></i><b>8.2</b> mean_rating</a><ul>
-<li class="chapter" data-level="8.2.1" data-path="8-ci.html"><a href="8-ci.html#review-of-bootstrapping"><i class="fa fa-check"></i><b>8.2.1</b> Review of Bootstrapping</a></li>
+<li class="chapter" data-level="" data-path="8-ci.html"><a href="8-ci.html#needed-packages-5"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="8.1" data-path="8-ci.html"><a href="8-ci.html#bootstrapping"><i class="fa fa-check"></i><b>8.1</b> Bootstrapping</a><ul>
+<li class="chapter" data-level="8.1.1" data-path="8-ci.html"><a href="8-ci.html#review-of-bootstrapping"><i class="fa fa-check"></i><b>8.1.1</b> Review of Bootstrapping</a></li>
+</ul></li>
+<li class="chapter" data-level="8.2" data-path="8-ci.html"><a href="8-ci.html#relation-to-hypothesis-testing"><i class="fa fa-check"></i><b>8.2</b> Relation to hypothesis testing</a></li>
+<li class="chapter" data-level="8.3" data-path="8-ci.html"><a href="8-ci.html#effect-size"><i class="fa fa-check"></i><b>8.3</b> Effect size</a></li>
+<li class="chapter" data-level="8.4" data-path="8-ci.html"><a href="8-ci.html#conclusion-4"><i class="fa fa-check"></i><b>8.4</b> Conclusion</a><ul>
+<li class="chapter" data-level="8.4.1" data-path="8-ci.html"><a href="8-ci.html#script-of-r-code-4"><i class="fa fa-check"></i><b>8.4.1</b> Script of R code</a></li>
+<li class="chapter" data-level="8.4.2" data-path="8-ci.html"><a href="8-ci.html#whats-to-come-5"><i class="fa fa-check"></i><b>8.4.2</b> What’s to come?</a></li>
+</ul></li>
 </ul></li>
-<li class="chapter" data-level="8.3" data-path="8-ci.html"><a href="8-ci.html#relation-to-hypothesis-testing"><i class="fa fa-check"></i><b>8.3</b> Relation to hypothesis testing</a></li>
-<li class="chapter" data-level="8.4" data-path="8-ci.html"><a href="8-ci.html#effect-size"><i class="fa fa-check"></i><b>8.4</b> Effect size</a></li>
-<li class="chapter" data-level="8.5" data-path="8-ci.html"><a href="8-ci.html#script-of-r-code-3"><i class="fa fa-check"></i><b>8.5</b> Script of R code</a></li>
-<li class="chapter" data-level="8.6" data-path="8-ci.html"><a href="8-ci.html#whats-to-come-5"><i class="fa fa-check"></i><b>8.6</b> What’s to come?</a></li>
+<li class="chapter" data-level="9" data-path="9-regress.html"><a href="9-regress.html"><i class="fa fa-check"></i><b>9</b> Regression via broom</a><ul>
+<li class="chapter" data-level="" data-path="9-regress.html"><a href="9-regress.html#needed-packages-6"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="9.1" data-path="9-regress.html"><a href="9-regress.html#example-alaskan-airlines-delays"><i class="fa fa-check"></i><b>9.1</b> EXAMPLE: Alaskan Airlines delays</a></li>
+<li class="chapter" data-level="9.2" data-path="9-regress.html"><a href="9-regress.html#correlation"><i class="fa fa-check"></i><b>9.2</b> Correlation</a><ul>
+<li class="chapter" data-level="9.2.1" data-path="9-regress.html"><a href="9-regress.html#correlation-does-not-imply-causation"><i class="fa fa-check"></i><b>9.2.1</b> Correlation does not imply causation</a></li>
 </ul></li>
-<li class="chapter" data-level="9" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html"><i class="fa fa-check"></i><b>9</b> Regression via <code id="regress">broom</code></a><ul>
-<li class="chapter" data-level="" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#needed-packages-5"><i class="fa fa-check"></i>Needed packages</a></li>
-<li class="chapter" data-level="9.1" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#example-alaskan-airlines-delays"><i class="fa fa-check"></i><b>9.1</b> EXAMPLE: Alaskan Airlines delays</a></li>
-<li class="chapter" data-level="9.2" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#correlation"><i class="fa fa-check"></i><b>9.2</b> Correlation</a><ul>
-<li class="chapter" data-level="9.2.1" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#correlation-does-not-imply-causation"><i class="fa fa-check"></i><b>9.2.1</b> Correlation does not imply causation</a></li>
+<li class="chapter" data-level="9.3" data-path="9-regress.html"><a href="9-regress.html#linear-regression"><i class="fa fa-check"></i><b>9.3</b> Linear regression</a><ul>
+<li class="chapter" data-level="9.3.1" data-path="9-regress.html"><a href="9-regress.html#understanding-linear-regression-basics"><i class="fa fa-check"></i><b>9.3.1</b> Understanding linear regression basics</a></li>
+<li class="chapter" data-level="9.3.2" data-path="9-regress.html"><a href="9-regress.html#the-equation-of-the-line"><i class="fa fa-check"></i><b>9.3.2</b> The equation of the line</a></li>
+<li class="chapter" data-level="9.3.3" data-path="9-regress.html"><a href="9-regress.html#interpreting-the-slope"><i class="fa fa-check"></i><b>9.3.3</b> Interpreting the slope</a></li>
+<li class="chapter" data-level="9.3.4" data-path="9-regress.html"><a href="9-regress.html#predicting-values"><i class="fa fa-check"></i><b>9.3.4</b> Predicting values</a></li>
 </ul></li>
-<li class="chapter" data-level="9.3" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#linear-regression"><i class="fa fa-check"></i><b>9.3</b> Linear regression</a><ul>
-<li class="chapter" data-level="9.3.1" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#understanding-linear-regression-basics"><i class="fa fa-check"></i><b>9.3.1</b> Understanding linear regression basics</a></li>
-<li class="chapter" data-level="9.3.2" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#the-equation-of-the-line"><i class="fa fa-check"></i><b>9.3.2</b> The equation of the line</a></li>
-<li class="chapter" data-level="9.3.3" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#interpretting-the-slope"><i class="fa fa-check"></i><b>9.3.3</b> Interpretting the slope</a></li>
-<li class="chapter" data-level="9.3.4" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#predicting-values"><i class="fa fa-check"></i><b>9.3.4</b> Predicting values</a></li>
+<li class="chapter" data-level="9.4" data-path="9-regress.html"><a href="9-regress.html#inference-for-regression"><i class="fa fa-check"></i><b>9.4</b> Inference for regression</a><ul>
+<li class="chapter" data-level="9.4.1" data-path="9-regress.html"><a href="9-regress.html#data-2"><i class="fa fa-check"></i><b>9.4.1</b> Data</a></li>
+<li class="chapter" data-level="9.4.2" data-path="9-regress.html"><a href="9-regress.html#test-statistic-delta-2"><i class="fa fa-check"></i><b>9.4.2</b> Test Statistic <span class="math inline">\(\delta\)</span></a></li>
+<li class="chapter" data-level="9.4.3" data-path="9-regress.html"><a href="9-regress.html#observed-effect-delta-2"><i class="fa fa-check"></i><b>9.4.3</b> Observed effect <span class="math inline">\(\delta^*\)</span></a></li>
+<li class="chapter" data-level="9.4.4" data-path="9-regress.html"><a href="9-regress.html#model-of-h_0-2"><i class="fa fa-check"></i><b>9.4.4</b> Model of <span class="math inline">\(H_0\)</span></a></li>
+<li class="chapter" data-level="9.4.5" data-path="9-regress.html"><a href="9-regress.html#simulated-data-2"><i class="fa fa-check"></i><b>9.4.5</b> Simulated Data</a></li>
+<li class="chapter" data-level="9.4.6" data-path="9-regress.html"><a href="9-regress.html#distribution-of-delta-under-h_0-2"><i class="fa fa-check"></i><b>9.4.6</b> Distribution of <span class="math inline">\(\delta\)</span> under <span class="math inline">\(H_0\)</span></a></li>
+<li class="chapter" data-level="9.4.7" data-path="9-regress.html"><a href="9-regress.html#the-p-value-2"><i class="fa fa-check"></i><b>9.4.7</b> The p-value</a></li>
 </ul></li>
-<li class="chapter" data-level="9.4" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#inference-for-regression"><i class="fa fa-check"></i><b>9.4</b> Inference for regression</a><ul>
-<li class="chapter" data-level="9.4.1" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#data-2"><i class="fa fa-check"></i><b>9.4.1</b> Data</a></li>
-<li class="chapter" data-level="9.4.2" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#test-statistic-delta-2"><i class="fa fa-check"></i><b>9.4.2</b> Test Statistic <span class="math inline">\(\delta\)</span></a></li>
-<li class="chapter" data-level="9.4.3" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#observed-effect-delta-2"><i class="fa fa-check"></i><b>9.4.3</b> Observed effect <span class="math inline">\(\delta^*\)</span></a></li>
-<li class="chapter" data-level="9.4.4" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#model-of-h_0-2"><i class="fa fa-check"></i><b>9.4.4</b> Model of <span class="math inline">\(H_0\)</span></a></li>
-<li class="chapter" data-level="9.4.5" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#simulated-data-2"><i class="fa fa-check"></i><b>9.4.5</b> Simulated Data</a></li>
-<li class="chapter" data-level="9.4.6" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#distribution-of-delta-under-h_0-2"><i class="fa fa-check"></i><b>9.4.6</b> Distribution of <span class="math inline">\(\delta\)</span> under <span class="math inline">\(H_0\)</span></a></li>
-<li class="chapter" data-level="9.4.7" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#the-p-value-2"><i class="fa fa-check"></i><b>9.4.7</b> The p-value</a></li>
+<li class="chapter" data-level="9.5" data-path="9-regress.html"><a href="9-regress.html#resid"><i class="fa fa-check"></i><b>9.5</b> Residual analysis</a></li>
+<li class="chapter" data-level="9.6" data-path="9-regress.html"><a href="9-regress.html#conditions-for-regression"><i class="fa fa-check"></i><b>9.6</b> Conditions for regression</a></li>
+<li class="chapter" data-level="9.7" data-path="9-regress.html"><a href="9-regress.html#conclusion-5"><i class="fa fa-check"></i><b>9.7</b> Conclusion</a><ul>
+<li class="chapter" data-level="9.7.1" data-path="9-regress.html"><a href="9-regress.html#script-of-r-code-5"><i class="fa fa-check"></i><b>9.7.1</b> Script of R code</a></li>
+<li class="chapter" data-level="9.7.2" data-path="9-regress.html"><a href="9-regress.html#whats-to-come-6"><i class="fa fa-check"></i><b>9.7.2</b> What’s to come?</a></li>
 </ul></li>
-<li class="chapter" data-level="9.5" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#resid"><i class="fa fa-check"></i><b>9.5</b> Residual analysis</a></li>
-<li class="chapter" data-level="9.6" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#conditions-for-regression"><i class="fa fa-check"></i><b>9.6</b> Conditions for regression</a></li>
-<li class="chapter" data-level="9.7" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#script-of-r-code-4"><i class="fa fa-check"></i><b>9.7</b> Script of R code</a></li>
-<li class="chapter" data-level="9.8" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#whats-to-come-6"><i class="fa fa-check"></i><b>9.8</b> What’s to come?</a></li>
 </ul></li>
 <li class="part"><span><b>III Conclusion</b></span></li>
 <li class="chapter" data-level="10" data-path="10-effective-data-storytelling.html"><a href="10-effective-data-storytelling.html"><i class="fa fa-check"></i><b>10</b> Effective Data Storytelling</a><ul>
@@ -304,7 +323,8 @@
 </ul></li>
 </ul></li>
 <li class="chapter" data-level="B" data-path="B-appendixB.html"><a href="B-appendixB.html"><i class="fa fa-check"></i><b>B</b> Inference Examples</a><ul>
-<li class="chapter" data-level="B.1" data-path="B-appendixB.html"><a href="B-appendixB.html#needed-packages-6"><i class="fa fa-check"></i><b>B.1</b> Needed packages</a></li>
+<li class="chapter" data-level="" data-path="B-appendixB.html"><a href="B-appendixB.html#needed-packages-7"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="B.1" data-path="B-appendixB.html"><a href="B-appendixB.html#inference-mind-map"><i class="fa fa-check"></i><b>B.1</b> Inference Mind Map</a></li>
 <li class="chapter" data-level="B.2" data-path="B-appendixB.html"><a href="B-appendixB.html#one-mean"><i class="fa fa-check"></i><b>B.2</b> One Mean</a><ul>
 <li class="chapter" data-level="B.2.1" data-path="B-appendixB.html"><a href="B-appendixB.html#problem-statement"><i class="fa fa-check"></i><b>B.2.1</b> Problem Statement</a></li>
 <li class="chapter" data-level="B.2.2" data-path="B-appendixB.html"><a href="B-appendixB.html#competing-hypotheses"><i class="fa fa-check"></i><b>B.2.2</b> Competing Hypotheses</a></li>
@@ -352,6 +372,7 @@
 </ul></li>
 </ul></li>
 <li class="chapter" data-level="C" data-path="C-appendixC.html"><a href="C-appendixC.html"><i class="fa fa-check"></i><b>C</b> Reach for the Stars</a><ul>
+<li class="chapter" data-level="" data-path="C-appendixC.html"><a href="C-appendixC.html#needed-packages-8"><i class="fa fa-check"></i>Needed packages</a></li>
 <li class="chapter" data-level="C.1" data-path="C-appendixC.html"><a href="C-appendixC.html#sorted-barplots"><i class="fa fa-check"></i><b>C.1</b> Sorted barplots</a></li>
 <li class="chapter" data-level="C.2" data-path="C-appendixC.html"><a href="C-appendixC.html#interactive-graphics"><i class="fa fa-check"></i><b>C.2</b> Interactive graphics</a><ul>
 <li class="chapter" data-level="C.2.1" data-path="C-appendixC.html"><a href="C-appendixC.html#interactive-line-graphs"><i class="fa fa-check"></i><b>C.2.1</b> Interactive line-graphs</a></li>
@@ -378,8 +399,8 @@ <h1>
 <div id="appendixB" class="section level1">
 <h1><span class="header-section-number">B</span> Inference Examples</h1>
 <p>This appendix is designed to provide you with examples of the five basic hypothesis tests and their corresponding confidence intervals. Traditional theory-based methods as well as computational-based methods are presented.</p>
-<div id="needed-packages-6" class="section level2">
-<h2><span class="header-section-number">B.1</span> Needed packages</h2>
+<div id="needed-packages-7" class="section level2 unnumbered">
+<h2>Needed packages</h2>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">library</span>(dplyr)
 <span class="kw">library</span>(ggplot2)
 <span class="kw">library</span>(mosaic)
@@ -388,6 +409,16 @@ <h2><span class="header-section-number">B.1</span> Needed packages</h2>
 <hr />
 <hr />
 </div>
+<div id="inference-mind-map" class="section level2">
+<h2><span class="header-section-number">B.1</span> Inference Mind Map</h2>
+<p>To help you better navigate and choose the appropriate analysis, we’ve created a mind map on <a href="http://coggle.it" class="uri">http://coggle.it</a> available <a href="https://coggle.it/diagram/Vxlydu1akQFeqo6-">here</a> and below.</p>
+<div class="figure" style="text-align: center"><span id="fig:infer-map"></span>
+<img src="images/coggle.png" alt="Mind map for Inference" width="200%" />
+<p class="caption">
+Figure B.1: Mind map for Inference
+</p>
+</div>
+</div>
 <div id="one-mean" class="section level2">
 <h2><span class="header-section-number">B.2</span> One Mean</h2>
 <div id="problem-statement" class="section level3">
@@ -421,10 +452,6 @@ <h3><span class="header-section-number">B.2.3</span> Exploring the sample data</
 <span class="co">#  destfile = &quot;data/ageAtMar.csv&quot;,</span>
 <span class="co">#  method = &quot;curl&quot;)</span>
 ageAtMar &lt;-<span class="st"> </span><span class="kw">read_csv</span>(<span class="st">&quot;data/ageAtMar.csv&quot;</span>)</code></pre></div>
-<pre><code>## Parsed with column specification:
-## cols(
-##   age = col_integer()
-## )</code></pre>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">age_summ &lt;-<span class="st"> </span>ageAtMar %&gt;%
 <span class="st">  </span><span class="kw">summarize</span>(<span class="dt">sample_size =</span> <span class="kw">n</span>(),
     <span class="dt">mean =</span> <span class="kw">mean</span>(age),
@@ -451,8 +478,8 @@ <h3><span class="header-section-number">B.2.3</span> Exploring the sample data</
 <tbody>
 <tr class="odd">
 <td align="right">5534</td>
-<td align="right">23.44019</td>
-<td align="right">4.721365</td>
+<td align="right">23.44</td>
+<td align="right">4.721</td>
 <td align="right">10</td>
 <td align="right">20</td>
 <td align="right">23</td>
@@ -467,7 +494,7 @@ <h3><span class="header-section-number">B.2.3</span> Exploring the sample data</
 <p><img src="ismaykim_files/figure-html/hist1b-1.png" width="\textwidth" style="display: block; margin: auto;" /></p>
 <div id="guess-about-statistical-significance" class="section level4">
 <h4><span class="header-section-number">B.2.3.1</span> Guess about statistical significance</h4>
-<p>We are looking to see if the observed sample mean of 23.4401879 is statistically greater than <span class="math inline">\(\mu_0 = 23\)</span>. They seem to be quite close, but we have a large sample size here. Let’s guess that the large sample size will lead us to reject this practically small difference.</p>
+<p>We are looking to see if the observed sample mean of 23.4402 is statistically greater than <span class="math inline">\(\mu_0 = 23\)</span>. They seem to be quite close, but we have a large sample size here. Let’s guess that the large sample size will lead us to reject this practically small difference.</p>
 <hr />
 </div>
 </div>
@@ -475,13 +502,13 @@ <h4><span class="header-section-number">B.2.3.1</span> Guess about statistical s
 <h3><span class="header-section-number">B.2.4</span> Non-traditional methods</h3>
 <div id="bootstrapping-for-hypothesis-test" class="section level4">
 <h4><span class="header-section-number">B.2.4.1</span> Bootstrapping for Hypothesis Test</h4>
-<p>In order to look to see if the observed sample mean of 23.4401879 is statistically greater than <span class="math inline">\(\mu_0 = 23\)</span>, we need to account for the sample size. We also need to determine a process that replicates how the original sample of size 5534 was selected.</p>
+<p>In order to see if the observed sample mean of 23.4402 is statistically greater than <span class="math inline">\(\mu_0 = 23\)</span>, we need to account for the sample size. We also need to determine a process that replicates how the original sample of size 5534 was selected.</p>
 <p>We can use the idea of <em>bootstrapping</em> to simulate the population from which the sample came and then generate samples from that simulated population to account for sampling variability. Recall how bootstrapping would apply in this context:</p>
 <ol style="list-style-type: decimal">
 <li>Sample with replacement from our original sample of 5534 women and repeat this process 10,000 times,</li>
 <li>calculate the mean for each of the 10,000 bootstrap samples created in Step 1,</li>
 <li>combine all of these bootstrap statistics calculated in Step 2 into a <code>boot_distn</code> object, and</li>
-<li>shift the center of this distribution over to the null value of 23. (This is needed since it will be centered at 23.4401879 via the process of bootstrapping.)</li>
+<li>shift the center of this distribution over to the null value of 23. (This is needed since it will be centered at 23.4402 via the process of bootstrapping.)</li>
 </ol>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">set.seed</span>(<span class="dv">2016</span>)
 mu0 &lt;-<span class="st"> </span><span class="dv">23</span>
@@ -493,12 +520,12 @@ <h4><span class="header-section-number">B.2.4.1</span> Bootstrapping for Hypothe
 null_distn %&gt;%<span class="st"> </span><span class="kw">ggplot</span>(<span class="kw">aes</span>(<span class="dt">x =</span> mean_age)) +
 <span class="st">  </span><span class="kw">geom_histogram</span>(<span class="dt">bins =</span> <span class="dv">30</span>, <span class="dt">color =</span> <span class="st">&quot;white&quot;</span>)</code></pre></div>
 <p><img src="ismaykim_files/figure-html/sim1-1.png" width="\textwidth" style="display: block; margin: auto;" /></p>
-<p>We can next use this distribution to observe our <span class="math inline">\(p\)</span>-value. Recall this is a right-tailed test so we will be looking for values that are greater than or equal to 23.4401879 for our <span class="math inline">\(p\)</span>-value.</p>
+<p>We can next use this distribution to observe our <span class="math inline">\(p\)</span>-value. Recall this is a right-tailed test so we will be looking for values that are greater than or equal to 23.4402 for our <span class="math inline">\(p\)</span>-value.</p>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">obs_mean &lt;-<span class="st"> </span>age_summ$mean
 null_distn %&gt;%<span class="st"> </span><span class="kw">ggplot</span>(<span class="kw">aes</span>(<span class="dt">x =</span> mean_age)) +
 <span class="st">  </span><span class="kw">geom_histogram</span>(<span class="dt">bins =</span> <span class="dv">30</span>, <span class="dt">color =</span> <span class="st">&quot;white&quot;</span>) +
 <span class="st">  </span><span class="kw">geom_vline</span>(<span class="dt">color =</span> <span class="st">&quot;red&quot;</span>, <span class="dt">xintercept =</span> obs_mean)</code></pre></div>
-<p><img src="ismaykim_files/figure-html/unnamed-chunk-142-1.png" width="\textwidth" style="display: block; margin: auto;" /></p>
+<p><img src="ismaykim_files/figure-html/unnamed-chunk-146-1.png" width="\textwidth" style="display: block; margin: auto;" /></p>
 <div id="calculate-p-value" class="section level5">
 <h5><span class="header-section-number">B.2.4.1.1</span> Calculate <span class="math inline">\(p\)</span>-value</h5>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">pvalue &lt;-<span class="st"> </span>null_distn %&gt;%
@@ -511,17 +538,17 @@ <h5><span class="header-section-number">B.2.4.1.1</span> Calculate <span class="
 </div>
 <div id="bootstrapping-for-confidence-interval" class="section level4">
 <h4><span class="header-section-number">B.2.4.2</span> Bootstrapping for Confidence Interval</h4>
-<p>We can also create a confidence interval for the unknown population parameter <span class="math inline">\(\mu\)</span> using our sample data using <em>bootstrapping</em>. Note that we don’t need to shift this distribution since we want the center of our confidence interval to be our point estimate <span class="math inline">\(\bar{x}_{obs} = 23.4401879\)</span>.</p>
+<p>We can also create a confidence interval for the unknown population parameter <span class="math inline">\(\mu\)</span> from our sample data using <em>bootstrapping</em>. Note that we don’t need to shift this distribution since we want the center of our confidence interval to be our point estimate <span class="math inline">\(\bar{x}_{obs} = 23.4402\)</span>.</p>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">boot_distn &lt;-<span class="st"> </span><span class="kw">do</span>(<span class="dv">10000</span>) *<span class="st"> </span>
 <span class="st">  </span><span class="kw">resample</span>(ageAtMar, <span class="dt">replace =</span> <span class="ot">TRUE</span>) %&gt;%<span class="st"> </span>
 <span class="st">  </span><span class="kw">summarize</span>(<span class="dt">mean_age =</span> <span class="kw">mean</span>(age))</code></pre></div>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">boot_distn %&gt;%<span class="st"> </span><span class="kw">ggplot</span>(<span class="kw">aes</span>(<span class="dt">x =</span> mean_age)) +
 <span class="st">  </span><span class="kw">geom_histogram</span>(<span class="dt">bins =</span> <span class="dv">30</span>, <span class="dt">color =</span> <span class="st">&quot;white&quot;</span>)</code></pre></div>
-<p><img src="ismaykim_files/figure-html/unnamed-chunk-144-1.png" width="\textwidth" style="display: block; margin: auto;" /></p>
+<p><img src="ismaykim_files/figure-html/unnamed-chunk-148-1.png" width="\textwidth" style="display: block; margin: auto;" /></p>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">boot_distn %&gt;%<span class="st"> </span><span class="kw">summarize</span>(<span class="dt">lower =</span> <span class="kw">quantile</span>(mean_age, <span class="dt">probs =</span> <span class="fl">0.025</span>),
     <span class="dt">upper =</span> <span class="kw">quantile</span>(mean_age, <span class="dt">probs =</span> <span class="fl">0.975</span>))</code></pre></div>
-<pre><code>##      lower    upper
-## 1 23.31821 23.56361</code></pre>
+<pre><code>##   lower upper
+## 1 23.32 23.56</code></pre>
 <p>We see that 23 is not contained in this confidence interval as a plausible value of <span class="math inline">\(\mu\)</span> (the unknown population mean) and the entire interval is larger than 23. This matches with our hypothesis test results of rejecting the null hypothesis in favor of the alternative (<span class="math inline">\(\mu &gt; 23\)</span>).</p>
 <p><strong>Interpretation</strong>: We are 95% confident the true mean age of first marriage for all US women from 2006 to 2010 is between 23.32 and 23.56.</p>
 <hr />
@@ -541,11 +568,12 @@ <h4><span class="header-section-number">B.2.5.1</span> Check conditions</h4>
 <p>The Q-Q plot below also shows some skew.</p>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">ggplot</span>(<span class="dt">data =</span> ageAtMar, <span class="dt">mapping =</span> <span class="kw">aes</span>(<span class="dt">sample =</span> age)) +
 <span class="st">  </span><span class="kw">stat_qq</span>()</code></pre></div>
+<p><img src="ismaykim_files/figure-html/qqplotmean-1.png" width="\textwidth" style="display: block; margin: auto;" /></p>
 <p>The sample size here is quite large though (<span class="math inline">\(n = 5534\)</span>) so both conditions are met.</p>
 </div>
 <div id="test-statistic" class="section level4">
 <h4><span class="header-section-number">B.2.5.2</span> Test statistic</h4>
-<p>The test statistic is a random variable based on the sample data. Here, we want to look at a way to estimate the population mean <span class="math inline">\(\mu\)</span>. A good guess is the sample mean <span class="math inline">\(\bar{X}\)</span>. Recall that this sample mean is actually a random variable that will vary as different samples are (theoretically, would be) collected. We are looking to see how likely is it for us to have observed a sample mean of <span class="math inline">\(\bar{x}_{obs} = 23.4401879\)</span> or larger assuming that the population mean is 23 (assuming the null hypothesis is true). If the conditions are met and assuming <span class="math inline">\(H_0\)</span> is true, we can “standardize” this original test statistic of <span class="math inline">\(\bar{X}\)</span> into a <span class="math inline">\(T\)</span> statistic that follows a <span class="math inline">\(t\)</span> distribution with degrees of freedom equal to <span class="math inline">\(df = n - 1\)</span>:</p>
+<p>The test statistic is a random variable based on the sample data. Here, we want to look at a way to estimate the population mean <span class="math inline">\(\mu\)</span>. A good guess is the sample mean <span class="math inline">\(\bar{X}\)</span>. Recall that this sample mean is actually a random variable that will vary as different samples are (theoretically, would be) collected. We are looking to see how likely it is for us to have observed a sample mean of <span class="math inline">\(\bar{x}_{obs} = 23.4402\)</span> or larger assuming that the population mean is 23 (assuming the null hypothesis is true). If the conditions are met and assuming <span class="math inline">\(H_0\)</span> is true, we can “standardize” this original test statistic of <span class="math inline">\(\bar{X}\)</span> into a <span class="math inline">\(T\)</span> statistic that follows a <span class="math inline">\(t\)</span> distribution with degrees of freedom equal to <span class="math inline">\(df = n - 1\)</span>:</p>
 <p><span class="math display">\[ T =\dfrac{ \bar{X} - \mu_0}{ S / \sqrt{n} } \sim t (df = n - 1) \]</span></p>
 <p>where <span class="math inline">\(S\)</span> represents the standard deviation of the sample and <span class="math inline">\(n\)</span> is the sample size.</p>
 <div id="observed-test-statistic" class="section level5">
@@ -558,13 +586,13 @@ <h5><span class="header-section-number">B.2.5.2.1</span> Observed test statistic
 ##  One Sample t-test
 ## 
 ## data:  ageAtMar$age
-## t = 6.9357, df = 5533, p-value = 0.000000000002252
+## t = 6.9, df = 5500, p-value = 0.000000000002
 ## alternative hypothesis: true mean is greater than 23
 ## 95 percent confidence interval:
-##  23.33578      Inf
+##  23.34   Inf
 ## sample estimates:
 ## mean of x 
-##  23.44019</code></pre>
+##     23.44</code></pre>
 <p>We see here that the <span class="math inline">\(t_{obs}\)</span> value is around 6.94. Recall that for large sample sizes the <span class="math inline">\(t\)</span> distribution is essentially the standard normal distribution, which is why this statistic can also be treated as a <code>Z</code> statistic.</p>
 </div>
 </div>
@@ -572,10 +600,10 @@ <h5><span class="header-section-number">B.2.5.2.1</span> Observed test statistic
 <h4><span class="header-section-number">B.2.5.3</span> Compute <span class="math inline">\(p\)</span>-value</h4>
 <p>The <span class="math inline">\(p\)</span>-value—the probability of observing a <span class="math inline">\(t_{obs}\)</span> value of 6.94 or more in our null distribution of a <span class="math inline">\(t\)</span> with 5533 degrees of freedom—is essentially 0. This can also be calculated in R directly:</p>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">pt</span>(<span class="fl">6.936</span>, <span class="dt">df =</span> <span class="kw">nrow</span>(ageAtMar) -<span class="st"> </span><span class="dv">1</span>, <span class="dt">lower.tail =</span> <span class="ot">FALSE</span>)</code></pre></div>
-<pre><code>## [1] 0.000000000002247382</code></pre>
+<pre><code>## [1] 0.000000000002247</code></pre>
 <p>We can also use the <span class="math inline">\(N(0, 1)\)</span> distribution here:</p>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">pnorm</span>(<span class="fl">6.936</span>, <span class="dt">lower.tail =</span> <span class="ot">FALSE</span>)</code></pre></div>
-<pre><code>## [1] 0.000000000002016788</code></pre>
+<pre><code>## [1] 0.000000000002017</code></pre>
 </div>
 <div id="state-conclusion" class="section level4">
 <h4><span class="header-section-number">B.2.5.4</span> State conclusion</h4>
@@ -587,7 +615,7 @@ <h4><span class="header-section-number">B.2.5.5</span> Confidence interval</h4>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">t.test</span>(<span class="dt">x =</span> ageAtMar$age, 
        <span class="dt">alternative =</span> <span class="st">&quot;two.sided&quot;</span>,
        <span class="dt">mu =</span> <span class="dv">23</span>)$conf</code></pre></div>
-<pre><code>## [1] 23.31577 23.56461
+<pre><code>## [1] 23.32 23.56
 ## attr(,&quot;conf.level&quot;)
 ## [1] 0.95</code></pre>
 <hr />
@@ -665,7 +693,7 @@ <h4><span class="header-section-number">B.3.4.1</span> Simulation for Hypothesis
 <span class="st">  </span><span class="kw">geom_histogram</span>(<span class="dt">bins =</span> <span class="dv">30</span>, <span class="dt">color =</span> <span class="st">&quot;white&quot;</span>) +
 <span class="st">  </span><span class="kw">geom_vline</span>(<span class="dt">color =</span> <span class="st">&quot;red&quot;</span>, <span class="dt">xintercept =</span> <span class="fl">0.8</span> +<span class="st"> </span>dist) +
 <span class="st">  </span><span class="kw">geom_vline</span>(<span class="dt">color =</span> <span class="st">&quot;red&quot;</span>, <span class="dt">xintercept =</span> p_hat)</code></pre></div>
-<p><img src="ismaykim_files/figure-html/unnamed-chunk-146-1.png" width="\textwidth" style="display: block; margin: auto;" /></p>
+<p><img src="ismaykim_files/figure-html/unnamed-chunk-150-1.png" width="\textwidth" style="display: block; margin: auto;" /></p>
 <div id="calculate-p-value-1" class="section level5">
 <h5><span class="header-section-number">B.3.4.1.1</span> Calculate <span class="math inline">\(p\)</span>-value</h5>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">pvalue &lt;-<span class="st"> </span>null_distn %&gt;%
@@ -692,7 +720,7 @@ <h4><span class="header-section-number">B.3.4.2</span> Bootstrapping for Confide
 <p>Just as we use the <code>mean</code> function for calculating the mean over a numerical variable, we can also use it to compute the proportion of successes for a categorical variable where we specify what we are calling a “success” after the <code>==</code>. (Think about the formula for calculating a mean and how R handles logical statements such as <code>satisfy == &quot;satisfied&quot;</code> for why this must be true.)</p>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">boot_distn %&gt;%<span class="st"> </span><span class="kw">ggplot</span>(<span class="kw">aes</span>(<span class="dt">x =</span> success_rate)) +
 <span class="st">  </span><span class="kw">geom_histogram</span>(<span class="dt">bins =</span> <span class="dv">30</span>, <span class="dt">color =</span> <span class="st">&quot;white&quot;</span>)</code></pre></div>
-<p><img src="ismaykim_files/figure-html/unnamed-chunk-148-1.png" width="\textwidth" style="display: block; margin: auto;" /></p>
+<p><img src="ismaykim_files/figure-html/unnamed-chunk-152-1.png" width="\textwidth" style="display: block; margin: auto;" /></p>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">boot_distn %&gt;%<span class="st"> </span><span class="kw">summarize</span>(<span class="dt">lower =</span> <span class="kw">quantile</span>(success_rate, <span class="dt">probs =</span> <span class="fl">0.025</span>),
     <span class="dt">upper =</span> <span class="kw">quantile</span>(success_rate, <span class="dt">probs =</span> <span class="fl">0.975</span>))</code></pre></div>
 <pre><code>##   lower upper
@@ -733,7 +761,7 @@ <h5><span class="header-section-number">B.3.5.2.1</span> Observed test statistic
 <div id="compute-p-value-1" class="section level4">
 <h4><span class="header-section-number">B.3.5.3</span> Compute <span class="math inline">\(p\)</span>-value</h4>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="dv">2</span> *<span class="st"> </span><span class="kw">pnorm</span>(z_obs)</code></pre></div>
-<pre><code>## [1] 0.08011831</code></pre>
+<pre><code>## [1] 0.08012</code></pre>
 <p>The <span class="math inline">\(p\)</span>-value—the probability of observing a <span class="math inline">\(z_{obs}\)</span> value of -1.75 or more extreme (in both directions) in our null distribution—is around 8%.</p>
 <p>Note that we could also do this test directly using the <code>prop.test</code> function.</p>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">stats::<span class="kw">prop.test</span>(<span class="dt">x =</span> <span class="kw">table</span>(elec$satisfy),
@@ -745,10 +773,10 @@ <h4><span class="header-section-number">B.3.5.3</span> Compute <span class="math
 ##  1-sample proportions test without continuity correction
 ## 
 ## data:  table(elec$satisfy), null probability 0.8
-## X-squared = 3.0625, df = 1, p-value = 0.08012
+## X-squared = 3.1, df = 1, p-value = 0.08
 ## alternative hypothesis: true p is not equal to 0.8
 ## 95 percent confidence interval:
-##  0.6356788 0.8073042
+##  0.6357 0.8073
 ## sample estimates:
 ##    p 
 ## 0.73</code></pre>
@@ -820,8 +848,8 @@ <h3><span class="header-section-number">B.4.3</span> Exploring the sample data</
 <pre><code>## # A tibble: 2 × 3
 ##   college_grad prop_no_opinion sample_size
 ##          &lt;chr&gt;           &lt;dbl&gt;       &lt;int&gt;
-## 1           no       0.3367609         389
-## 2          yes       0.2374429         438</code></pre>
+## 1           no          0.3368         389
+## 2          yes          0.2374         438</code></pre>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">offshore %&gt;%<span class="st"> </span><span class="kw">ggplot</span>(<span class="kw">aes</span>(<span class="dt">x =</span> college_grad, <span class="dt">fill =</span> response)) +
 <span class="st">  </span><span class="kw">geom_bar</span>(<span class="dt">position =</span> <span class="st">&quot;fill&quot;</span>) +
 <span class="st">  </span><span class="kw">coord_flip</span>()</code></pre></div>
@@ -845,7 +873,7 @@ <h4><span class="header-section-number">B.4.4.1</span> Collecting summary info</
 </div>
 <div id="randomization-for-hypothesis-test" class="section level4">
 <h4><span class="header-section-number">B.4.4.2</span> Randomization for Hypothesis Test</h4>
-<p>In order to look to see if the observed sample proportion of no opinion for college graduates of 0.3367609 is statistically different than that for graduates of 0.2374429, we need to account for the sample sizes. Note that this is the same as looking to see if <span class="math inline">\(\hat{p}_{grad} - \hat{p}_{nograd}\)</span> is statistically different than 0. We also need to determine a process that replicates how the original group sizes of 389 and 438 were selected.</p>
+<p>In order to look to see if the observed sample proportion of no opinion for non-college graduates of 0.3368 is statistically different than that for college graduates of 0.2374, we need to account for the sample sizes. Note that this is the same as looking to see if <span class="math inline">\(\hat{p}_{grad} - \hat{p}_{nograd}\)</span> is statistically different than 0. We also need to determine a process that replicates how the original group sizes of 389 and 438 were selected.</p>
 <p>We can use the idea of <em>randomization testing</em> (also known as <em>permutation testing</em>) to simulate the population from which the sample came (with two groups of different sizes) and then generate samples using <em>shuffling</em> from that simulated population to account for sampling variability.</p>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">set.seed</span>(<span class="dv">2016</span>)
 many_shuffles &lt;-<span class="st"> </span><span class="kw">do</span>(<span class="dv">10000</span>) *<span class="st"> </span>
@@ -860,12 +888,12 @@ <h4><span class="header-section-number">B.4.4.2</span> Randomization for Hypothe
 null_distn %&gt;%<span class="st"> </span><span class="kw">ggplot</span>(<span class="kw">aes</span>(<span class="dt">x =</span> diffprop)) +
 <span class="st">  </span><span class="kw">geom_histogram</span>(<span class="dt">bins =</span> <span class="dv">25</span>, <span class="dt">color =</span> <span class="st">&quot;white&quot;</span>)</code></pre></div>
 <p><img src="ismaykim_files/figure-html/sim3-1.png" width="\textwidth" style="display: block; margin: auto;" /></p>
-<p>We can next use this distribution to observe our <span class="math inline">\(p\)</span>-value. Recall this is a two-tailed test so we will be looking for values that are greater than or equal to -0.099318 or less than or equal to 0.099318 for our <span class="math inline">\(p\)</span>-value.</p>
+<p>We can next use this distribution to observe our <span class="math inline">\(p\)</span>-value. Recall this is a two-tailed test so we will be looking for values that are less than or equal to -0.0993 or greater than or equal to 0.0993 for our <span class="math inline">\(p\)</span>-value.</p>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">null_distn %&gt;%<span class="st"> </span><span class="kw">ggplot</span>(<span class="kw">aes</span>(<span class="dt">x =</span> diffprop)) +
 <span class="st">  </span><span class="kw">geom_histogram</span>(<span class="dt">bins =</span> <span class="dv">20</span>, <span class="dt">color =</span> <span class="st">&quot;white&quot;</span>) +
 <span class="st">  </span><span class="kw">geom_vline</span>(<span class="dt">color =</span> <span class="st">&quot;red&quot;</span>, <span class="dt">xintercept =</span> obs_diff) +
 <span class="st">  </span><span class="kw">geom_vline</span>(<span class="dt">color =</span> <span class="st">&quot;red&quot;</span>, <span class="dt">xintercept =</span> -obs_diff)</code></pre></div>
-<p><img src="ismaykim_files/figure-html/unnamed-chunk-151-1.png" width="\textwidth" style="display: block; margin: auto;" /></p>
+<p><img src="ismaykim_files/figure-html/unnamed-chunk-155-1.png" width="\textwidth" style="display: block; margin: auto;" /></p>
 <div id="calculate-p-value-2" class="section level5">
 <h5><span class="header-section-number">B.4.4.2.1</span> Calculate <span class="math inline">\(p\)</span>-value</h5>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">pvalue &lt;-<span class="st"> </span>null_distn %&gt;%
@@ -890,13 +918,13 @@ <h4><span class="header-section-number">B.4.4.3</span> Bootstrapping for Confide
 <span class="st">  </span><span class="kw">summarize</span>(<span class="dt">diffprop =</span> <span class="kw">diff</span>(prop_no_opinion))</code></pre></div>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">boot_distn %&gt;%<span class="st"> </span><span class="kw">ggplot</span>(<span class="kw">aes</span>(<span class="dt">x =</span> diffprop)) +
 <span class="st">  </span><span class="kw">geom_histogram</span>(<span class="dt">bins =</span> <span class="dv">30</span>, <span class="dt">color =</span> <span class="st">&quot;white&quot;</span>)</code></pre></div>
-<p><img src="ismaykim_files/figure-html/unnamed-chunk-154-1.png" width="\textwidth" style="display: block; margin: auto;" /></p>
+<p><img src="ismaykim_files/figure-html/unnamed-chunk-158-1.png" width="\textwidth" style="display: block; margin: auto;" /></p>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">(ci_boot &lt;-<span class="st"> </span>boot_distn %&gt;%<span class="st"> </span><span class="kw">summarize</span>(<span class="dt">lower =</span> <span class="kw">quantile</span>(diffprop, <span class="dt">probs =</span> <span class="fl">0.025</span>),
     <span class="dt">upper =</span> <span class="kw">quantile</span>(diffprop, <span class="dt">probs =</span> <span class="fl">0.975</span>)))</code></pre></div>
 <pre><code>## # A tibble: 1 × 2
-##        lower       upper
-##        &lt;dbl&gt;       &lt;dbl&gt;
-## 1 -0.1595767 -0.03791979</code></pre>
+##     lower    upper
+##     &lt;dbl&gt;    &lt;dbl&gt;
+## 1 -0.1596 -0.03792</code></pre>
 <p>We see that 0 is not contained in this confidence interval as a plausible value of <span class="math inline">\(\pi_{college} - \pi_{no\_college}\)</span> (the unknown population parameter). This matches with our hypothesis test results of rejecting the null hypothesis. Since zero is not a plausible value of the population parameter, we have evidence that the proportion of college graduates in California with no opinion on drilling is different than that of non-college graduates.</p>
 <p><strong>Interpretation</strong>: We are 95% confident the true proportion of college graduates with no opinion on offshore drilling in California is between 0.04 and 0.16 smaller than for non-college graduates.</p>
 <p><strong>Note</strong>: You could also use the null distribution based on randomization with a shift to have its center at <span class="math inline">\(\hat{p}_{college} - \hat{p}_{no\_college} = -0.1\)</span> instead of at 0 and calculate its percentiles. The confidence interval produced via this method should be comparable to the one done using bootstrapping above.</p>
@@ -922,7 +950,7 @@ <h3><span class="header-section-number">B.4.6</span> Check conditions</h3>
 </div>
 <div id="test-statistic-2" class="section level3">
 <h3><span class="header-section-number">B.4.7</span> Test statistic</h3>
-<p>The test statistic is a random variable based on the sample data. Here, we are interested in seeing if our observed difference in sample proportions corresponding to no opinion on drilling (<span class="math inline">\(\hat{p}_{college, obs} - \hat{p}_{no\_college, obs}\)</span> = 0.0326481) is statistically different than 0. Assuming that conditions are met and the null hypothesis is true, we can use the standard normal distribution to standardize the difference in sample proportions (<span class="math inline">\(\hat{P}_{college} - \hat{P}_{no\_college}\)</span>) using the standard error of <span class="math inline">\(\hat{P}_{college} - \hat{P}_{no\_college}\)</span> and the pooled estimate:</p>
+<p>The test statistic is a random variable based on the sample data. Here, we are interested in seeing if our observed difference in sample proportions corresponding to no opinion on drilling (<span class="math inline">\(\hat{p}_{college, obs} - \hat{p}_{no\_college, obs}\)</span> = 0.0326) is statistically different than 0. Assuming that conditions are met and the null hypothesis is true, we can use the standard normal distribution to standardize the difference in sample proportions (<span class="math inline">\(\hat{P}_{college} - \hat{P}_{no\_college}\)</span>) using the standard error of <span class="math inline">\(\hat{P}_{college} - \hat{P}_{no\_college}\)</span> and the pooled estimate:</p>
 <p><span class="math display">\[ Z =\dfrac{ (\hat{P}_1 - \hat{P}_2) - 0}{\sqrt{\dfrac{\hat{P}(1 - \hat{P})}{n_1} + \dfrac{\hat{P}(1 - \hat{P})}{n_2} }} \sim N(0, 1) \]</span> where <span class="math inline">\(\hat{P} = \dfrac{\text{total number of successes} }{ \text{total number of cases}}.\)</span></p>
 <div id="observed-test-statistic-2" class="section level4">
 <h4><span class="header-section-number">B.4.7.1</span> Observed test statistic</h4>
@@ -936,17 +964,17 @@ <h4><span class="header-section-number">B.4.7.1</span> Observed test statistic</
 ##  correction
 ## 
 ## data:  table(offshore$college_grad, offshore$response)
-## X-squared = 9.9907, df = 1, p-value = 0.001573
+## X-squared = 10, df = 1, p-value = 0.002
 ## alternative hypothesis: two.sided
 ## 95 percent confidence interval:
-##  0.03772522 0.16091078
+##  0.03773 0.16091
 ## sample estimates:
-##    prop 1    prop 2 
-## 0.3367609 0.2374429</code></pre>
+## prop 1 prop 2 
+## 0.3368 0.2374</code></pre>
 <p><code>prop.test</code> does a <span class="math inline">\(\chi^2\)</span> test here but this matches up exactly with what we would expect from the test statistic above since <span class="math inline">\(Z^2 = \chi^2\)</span> so <span class="math inline">\(\sqrt{9.99} = 3.16 = z_{obs}\)</span>: The <span class="math inline">\(p\)</span>-values are the same because we are focusing on a two-tailed test. The observed difference in sample proportions is 3.16 standard deviations larger than 0.</p>
 <p>The <span class="math inline">\(p\)</span>-value—the probability of observing a <span class="math inline">\(Z\)</span> value of 3.16 or more extreme in our null distribution—is 0.0016. This can also be calculated in R directly:</p>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="dv">2</span> *<span class="st"> </span><span class="kw">pnorm</span>(<span class="fl">3.16</span>, <span class="dt">lower.tail =</span> <span class="ot">FALSE</span>)</code></pre></div>
-<pre><code>## [1] 0.001577691</code></pre>
+<pre><code>## [1] 0.001578</code></pre>
 <p>The 95% confidence interval is also stated above in the <code>prop.test</code> results.</p>
 </div>
 </div>
@@ -1026,8 +1054,8 @@ <h3><span class="header-section-number">B.5.3</span> Exploring the sample data</
 <tr class="odd">
 <td align="left">Cleveland_ OH</td>
 <td align="right">212</td>
-<td align="right">27467.07</td>
-<td align="right">27680.68</td>
+<td align="right">27467</td>
+<td align="right">27681</td>
 <td align="right">0</td>
 <td align="right">8475</td>
 <td align="right">21000</td>
@@ -1037,8 +1065,8 @@ <h3><span class="header-section-number">B.5.3</span> Exploring the sample data</
 <tr class="even">
 <td align="left">Sacramento_ CA</td>
 <td align="right">175</td>
-<td align="right">32427.54</td>
-<td align="right">35773.63</td>
+<td align="right">32428</td>
+<td align="right">35774</td>
 <td align="right">0</td>
 <td align="right">8050</td>
 <td align="right">20000</td>
@@ -1071,7 +1099,7 @@ <h4><span class="header-section-number">B.5.4.1</span> Collecting summary info</
 </div>
 <div id="randomization-for-hypothesis-test-1" class="section level4">
 <h4><span class="header-section-number">B.5.4.2</span> Randomization for Hypothesis Test</h4>
-<p>In order to look to see if the observed sample mean for Sacramento of 27467.0660377 is statistically different than that for Cleveland of 32427.5428571, we need to account for the sample sizes. Note that this is the same as looking to see if <span class="math inline">\(\bar{x}_{sac} - \bar{x}_{cle}\)</span> is statistically different than 0. We also need to determine a process that replicates how the original group sizes of 212 and 175 were selected.</p>
+<p>In order to look to see if the observed sample mean for Sacramento of 32427.5429 is statistically different than that for Cleveland of 27467.066, we need to account for the sample sizes. Note that this is the same as looking to see if <span class="math inline">\(\bar{x}_{sac} - \bar{x}_{cle}\)</span> is statistically different than 0. We also need to determine a process that replicates how the original group sizes of 212 and 175 were selected.</p>
 <p>We can use the idea of <em>randomization testing</em> (also known as <em>permutation testing</em>) to simulate the population from which the sample came (with two groups of different sizes) and then generate samples using <em>shuffling</em> from that simulated population to account for sampling variability.</p>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">set.seed</span>(<span class="dv">2016</span>)
 many_shuffles &lt;-<span class="st"> </span><span class="kw">do</span>(<span class="dv">10000</span>) *<span class="st"> </span>
@@ -1086,12 +1114,12 @@ <h4><span class="header-section-number">B.5.4.2</span> Randomization for Hypothe
 null_distn %&gt;%<span class="st"> </span><span class="kw">ggplot</span>(<span class="kw">aes</span>(<span class="dt">x =</span> diffmean)) +
 <span class="st">  </span><span class="kw">geom_histogram</span>(<span class="dt">bins =</span> <span class="dv">30</span>, <span class="dt">color =</span> <span class="st">&quot;white&quot;</span>)</code></pre></div>
 <p><img src="ismaykim_files/figure-html/sim4-1.png" width="\textwidth" style="display: block; margin: auto;" /></p>
-<p>We can next use this distribution to observe our <span class="math inline">\(p\)</span>-value. Recall this is a two-tailed test so we will be looking for values that are greater than or equal to 4960.4768194 or less than or equal to -4960.4768194 for our <span class="math inline">\(p\)</span>-value.</p>
+<p>We can next use this distribution to observe our <span class="math inline">\(p\)</span>-value. Recall this is a two-tailed test so we will be looking for values that are greater than or equal to 4960.4768 or less than or equal to -4960.4768 for our <span class="math inline">\(p\)</span>-value.</p>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">null_distn %&gt;%<span class="st"> </span><span class="kw">ggplot</span>(<span class="kw">aes</span>(<span class="dt">x =</span> diffmean)) +
 <span class="st">  </span><span class="kw">geom_histogram</span>(<span class="dt">bins =</span> <span class="dv">30</span>, <span class="dt">color =</span> <span class="st">&quot;white&quot;</span>) +
 <span class="st">  </span><span class="kw">geom_vline</span>(<span class="dt">color =</span> <span class="st">&quot;red&quot;</span>, <span class="dt">xintercept =</span> obs_diff) +
 <span class="st">  </span><span class="kw">geom_vline</span>(<span class="dt">color =</span> <span class="st">&quot;red&quot;</span>, <span class="dt">xintercept =</span> -obs_diff)</code></pre></div>
-<p><img src="ismaykim_files/figure-html/unnamed-chunk-156-1.png" width="\textwidth" style="display: block; margin: auto;" /></p>
+<p><img src="ismaykim_files/figure-html/unnamed-chunk-160-1.png" width="\textwidth" style="display: block; margin: auto;" /></p>
 <div id="calculate-p-value-3" class="section level5">
 <h5><span class="header-section-number">B.5.4.2.1</span> Calculate <span class="math inline">\(p\)</span>-value</h5>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">pvalue &lt;-<span class="st"> </span>null_distn %&gt;%
@@ -1116,13 +1144,13 @@ <h4><span class="header-section-number">B.5.4.3</span> Bootstrapping for Confide
 <span class="st">  </span><span class="kw">summarize</span>(<span class="dt">diffmean =</span> <span class="kw">diff</span>(mean_inc))</code></pre></div>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">boot_distn %&gt;%<span class="st"> </span><span class="kw">ggplot</span>(<span class="kw">aes</span>(<span class="dt">x =</span> diffmean)) +
 <span class="st">  </span><span class="kw">geom_histogram</span>(<span class="dt">bins =</span> <span class="dv">30</span>, <span class="dt">color =</span> <span class="st">&quot;white&quot;</span>)</code></pre></div>
-<p><img src="ismaykim_files/figure-html/unnamed-chunk-159-1.png" width="\textwidth" style="display: block; margin: auto;" /></p>
+<p><img src="ismaykim_files/figure-html/unnamed-chunk-163-1.png" width="\textwidth" style="display: block; margin: auto;" /></p>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">(ci_boot &lt;-<span class="st"> </span>boot_distn %&gt;%<span class="st"> </span><span class="kw">summarize</span>(<span class="dt">lower =</span> <span class="kw">quantile</span>(diffmean, <span class="dt">probs =</span> <span class="fl">0.025</span>),
     <span class="dt">upper =</span> <span class="kw">quantile</span>(diffmean, <span class="dt">probs =</span> <span class="fl">0.975</span>)))</code></pre></div>
 <pre><code>## # A tibble: 1 × 2
-##      lower    upper
-##      &lt;dbl&gt;    &lt;dbl&gt;
-## 1 -1512.59 11458.85</code></pre>
+##   lower upper
+##   &lt;dbl&gt; &lt;dbl&gt;
+## 1 -1513 11459</code></pre>
 <p>We see that 0 is contained in this confidence interval as a plausible value of <span class="math inline">\(\mu_{sac} - \mu_{cle}\)</span> (the unknown population parameter). This matches with our hypothesis test results of failing to reject the null hypothesis. Since zero is a plausible value of the population parameter, we do not have evidence that Sacramento incomes are different than Cleveland incomes.</p>
 <p><strong>Interpretation</strong>: We are 95% confident the true mean yearly income for those living in Sacramento is between 1512.59 dollars smaller and 11458.85 dollars higher than for Cleveland.</p>
 <p><strong>Note</strong>: You could also use the null distribution based on randomization with a shift to have its center at <span class="math inline">\(\bar{x}_{sac} - \bar{x}_{cle} = \$4960.48\)</span> instead of at 0 and calculate its percentiles. The confidence interval produced via this method should be comparable to the one done using bootstrapping above.</p>
@@ -1150,7 +1178,7 @@ <h5><span class="header-section-number">B.5.5.0.1</span> Check conditions</h5>
 </div>
 <div id="test-statistic-3" class="section level3">
 <h3><span class="header-section-number">B.5.6</span> Test statistic</h3>
-<p>The test statistic is a random variable based on the sample data. Here, we are interested in seeing if our observed difference in sample means (<span class="math inline">\(\bar{x}_{sac, obs} - \bar{x}_{cle, obs}\)</span> = 4960.4768194) is statistically different than 0. Assuming that conditions are met and the null hypothesis is true, we can use the <span class="math inline">\(t\)</span> distribution to standardize the difference in sample means (<span class="math inline">\(\bar{X}_{sac} - \bar{X}_{cle}\)</span>) using the approximate standard error of <span class="math inline">\(\bar{X}_{sac} - \bar{X}_{cle}\)</span> (invoking <span class="math inline">\(S_{sac}\)</span> and <span class="math inline">\(S_{cle}\)</span> as estimates of unknown <span class="math inline">\(\sigma_{sac}\)</span> and <span class="math inline">\(\sigma_{cle}\)</span>).</p>
+<p>The test statistic is a random variable based on the sample data. Here, we are interested in seeing if our observed difference in sample means (<span class="math inline">\(\bar{x}_{sac, obs} - \bar{x}_{cle, obs}\)</span> = 4960.4768) is statistically different than 0. Assuming that conditions are met and the null hypothesis is true, we can use the <span class="math inline">\(t\)</span> distribution to standardize the difference in sample means (<span class="math inline">\(\bar{X}_{sac} - \bar{X}_{cle}\)</span>) using the approximate standard error of <span class="math inline">\(\bar{X}_{sac} - \bar{X}_{cle}\)</span> (invoking <span class="math inline">\(S_{sac}\)</span> and <span class="math inline">\(S_{cle}\)</span> as estimates of unknown <span class="math inline">\(\sigma_{sac}\)</span> and <span class="math inline">\(\sigma_{cle}\)</span>).</p>
 <p><span class="math display">\[ T =\dfrac{ (\bar{X}_1 - \bar{X}_2) - 0}{ \sqrt{\dfrac{S_1^2}{n_1} + \dfrac{S_2^2}{n_2}}  } \sim t (df = min(n_1 - 1, n_2 - 1)) \]</span> where 1 = Sacramento and 2 = Cleveland with <span class="math inline">\(S_1^2\)</span> and <span class="math inline">\(S_2^2\)</span> the sample variance of the incomes of both cities, respectively, and <span class="math inline">\(n_1 = 175\)</span> for Sacramento and <span class="math inline">\(n_2 = 212\)</span> for Cleveland.</p>
 <div id="observed-test-statistic-3" class="section level4">
 <h4><span class="header-section-number">B.5.6.1</span> Observed test statistic</h4>
@@ -1163,13 +1191,13 @@ <h4><span class="header-section-number">B.5.6.1</span> Observed test statistic</
 ##  Welch Two Sample t-test
 ## 
 ## data:  sacramento$income and cleveland$income
-## t = 1.5006, df = 323.36, p-value = 0.1344
+## t = 1.5, df = 320, p-value = 0.1
 ## alternative hypothesis: true difference in means is not equal to 0
 ## 95 percent confidence interval:
-##  -1542.758 11463.712
+##  -1543 11464
 ## sample estimates:
 ## mean of x mean of y 
-##  32427.54  27467.07</code></pre>
+##     32428     27467</code></pre>
 <p>Note that the degrees of freedom reported above are different than what we used above in specifying the <strong>Test Statistic</strong>. The degrees of freedom used here is also known as the Satterthwaite approximation and involves a quite complicated formula. For most problems, the much simpler smaller of the two sample sizes minus one will suffice.</p>
 <p>While one could compute this observed test statistic by “hand”, the focus here is on the set-up of the problem and in understanding which formula for the test statistic applies.</p>
 <!--
@@ -1182,10 +1210,10 @@ <h4><span class="header-section-number">B.5.6.1</span> Observed test statistic</
 <h3><span class="header-section-number">B.5.7</span> Compute <span class="math inline">\(p\)</span>-value</h3>
 <p>The <span class="math inline">\(p\)</span>-value—the probability of observing a <span class="math inline">\(t_{174}\)</span> value of -1.501 or more extreme (in both directions) in our null distribution—is 0.13. This can also be calculated in R directly:</p>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="dv">2</span> *<span class="st"> </span><span class="kw">pt</span>(-<span class="fl">1.501</span>, <span class="dt">df =</span> <span class="kw">min</span>(<span class="dv">212</span> -<span class="st"> </span><span class="dv">1</span>, <span class="dv">175</span> -<span class="st"> </span><span class="dv">1</span>), <span class="dt">lower.tail =</span> <span class="ot">TRUE</span>)</code></pre></div>
-<pre><code>## [1] 0.135168</code></pre>
+<pre><code>## [1] 0.1352</code></pre>
 <p>We can also approximate by using the standard normal curve:</p>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="dv">2</span> *<span class="st"> </span><span class="kw">pnorm</span>(-<span class="fl">1.501</span>)</code></pre></div>
-<pre><code>## [1] 0.1333556</code></pre>
+<pre><code>## [1] 0.1334</code></pre>
 <p>Note that the 95 percent confidence interval given above matches well with the one calculated using bootstrapping.</p>
 </div>
 <div id="state-conclusion-3" class="section level3">
@@ -1264,7 +1292,7 @@ <h3><span class="header-section-number">B.6.2</span> Exploring the sample data</
 <tr class="odd">
 <td align="right">10</td>
 <td align="right">-0.0804</td>
-<td align="right">0.0522732</td>
+<td align="right">0.0523</td>
 <td align="right">-0.177</td>
 <td align="right">-0.11</td>
 <td align="right">-0.084</td>
@@ -1313,7 +1341,7 @@ <h4><span class="header-section-number">B.6.3.2</span> Randomization for Hypothe
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">null_distn %&gt;%<span class="st"> </span><span class="kw">ggplot</span>(<span class="kw">aes</span>(<span class="dt">x =</span> mean_diff)) +
 <span class="st">  </span><span class="kw">geom_histogram</span>(<span class="dt">bins =</span> <span class="dv">30</span>, <span class="dt">color =</span> <span class="st">&quot;white&quot;</span>) +
 <span class="st">  </span><span class="kw">geom_vline</span>(<span class="dt">color =</span> <span class="st">&quot;red&quot;</span>, <span class="dt">xintercept =</span> obs_diff)</code></pre></div>
-<p><img src="ismaykim_files/figure-html/unnamed-chunk-162-1.png" width="\textwidth" style="display: block; margin: auto;" /></p>
+<p><img src="ismaykim_files/figure-html/unnamed-chunk-166-1.png" alt="Histogram of the null distribution of mean_diff with a red vertical line at the observed difference" style="display: block; margin: auto;" /></p>
 <div id="calculate-p-value-4" class="section level5">
 <h5><span class="header-section-number">B.6.3.2.1</span> Calculate <span class="math inline">\(p\)</span>-value</h5>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">pvalue &lt;-<span class="st"> </span>null_distn %&gt;%
@@ -1332,11 +1360,11 @@ <h4><span class="header-section-number">B.6.3.3</span> Bootstrapping for Confide
 <span class="st">  </span><span class="kw">summarize</span>(<span class="dt">mean_diff =</span> <span class="kw">mean</span>(pair_diff))</code></pre></div>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">boot_distn %&gt;%<span class="st"> </span><span class="kw">ggplot</span>(<span class="kw">aes</span>(<span class="dt">x =</span> mean_diff)) +
 <span class="st">  </span><span class="kw">geom_histogram</span>(<span class="dt">bins =</span> <span class="dv">30</span>, <span class="dt">color =</span> <span class="st">&quot;white&quot;</span>)</code></pre></div>
-<p><img src="ismaykim_files/figure-html/unnamed-chunk-164-1.png" width="\textwidth" style="display: block; margin: auto;" /></p>
+<p><img src="ismaykim_files/figure-html/unnamed-chunk-168-1.png" alt="Histogram of the bootstrap distribution of mean_diff" style="display: block; margin: auto;" /></p>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">(ci_boot &lt;-<span class="st"> </span>boot_distn %&gt;%<span class="st"> </span><span class="kw">summarize</span>(<span class="dt">lower =</span> <span class="kw">quantile</span>(mean_diff, <span class="dt">probs =</span> <span class="fl">0.025</span>),
     <span class="dt">upper =</span> <span class="kw">quantile</span>(mean_diff, <span class="dt">probs =</span> <span class="fl">0.975</span>)))</code></pre></div>
-<pre><code>##     lower      upper
-## 1 -0.1114 -0.0504975</code></pre>
+<pre><code>##     lower   upper
+## 1 -0.1114 -0.0505</code></pre>
 <p>We see that 0 is not contained in this confidence interval as a plausible value of <span class="math inline">\(\mu_{diff}\)</span> (the unknown population parameter). This matches with our hypothesis test results of rejecting the null hypothesis. Since zero is not a plausible value of the population parameter and since the entire confidence interval falls below zero, we have evidence that surface zinc concentration levels are lower, on average, than bottom level zinc concentrations.</p>
 <p><strong>Interpretation</strong>: We are 95% confident that the true mean zinc concentration on the surface is between 0.05 and 0.11 units lower than on the bottom.</p>
 <p><strong>Note</strong>: You could also use the null distribution based on randomization with a shift to have its center at <span class="math inline">\(\bar{x}_{diff} = -0.08\)</span> instead of at 0 and calculate its percentiles. The confidence interval produced via this method should be comparable to the one done using bootstrapping above.</p>
@@ -1372,10 +1400,10 @@ <h5><span class="header-section-number">B.6.4.2.1</span> Observed test statistic
 ##  One Sample t-test
 ## 
 ## data:  zinc_diff$pair_diff
-## t = -4.8638, df = 9, p-value = 0.0004456
+## t = -4.9, df = 9, p-value = 0.0004
 ## alternative hypothesis: true mean is less than 0
 ## 95 percent confidence interval:
-##        -Inf -0.0500982
+##     -Inf -0.0501
 ## sample estimates:
 ## mean of x 
 ##   -0.0804</code></pre>
@@ -1386,7 +1414,7 @@ <h5><span class="header-section-number">B.6.4.2.1</span> Observed test statistic
 <h4><span class="header-section-number">B.6.4.3</span> Compute <span class="math inline">\(p\)</span>-value</h4>
 <p>The <span class="math inline">\(p\)</span>-value—the probability of observing a <span class="math inline">\(t_{obs}\)</span> value of -5 or less in our null distribution of a <span class="math inline">\(t\)</span> with 9 degrees of freedom—is 0.0004. This can also be calculated in R directly:</p>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">pt</span>(-<span class="dv">5</span>, <span class="dt">df =</span> <span class="kw">nrow</span>(zinc_diff) -<span class="st"> </span><span class="dv">1</span>, <span class="dt">lower.tail =</span> <span class="ot">TRUE</span>)</code></pre></div>
-<pre><code>## [1] 0.000369484</code></pre>
+<pre><code>## [1] 0.0003695</code></pre>
 </div>
 <div id="state-conclusion-4" class="section level4">
 <h4><span class="header-section-number">B.6.4.4</span> State conclusion</h4>
diff --git a/docs/C-appendixC.html b/docs/C-appendixC.html
index e6f7243a6..df2011ca3 100644
--- a/docs/C-appendixC.html
+++ b/docs/C-appendixC.html
@@ -26,7 +26,7 @@
 <meta name="author" content="Chester Ismay and Albert Y. Kim">
 
 
-<meta name="date" content="2017-01-07">
+<meta name="date" content="2017-01-10">
 
   <meta name="viewport" content="width=device-width, initial-scale=1">
   <meta name="apple-mobile-web-app-capable" content="yes">
@@ -119,9 +119,9 @@
 
 <ul class="summary">
 <li class="chapter" data-level="1" data-path="index.html"><a href="index.html"><i class="fa fa-check"></i><b>1</b> Preamble</a><ul>
-<li class="chapter" data-level="1.1" data-path="index.html"><a href="index.html#principles-of-this-book"><i class="fa fa-check"></i><b>1.1</b> Principles of this Book</a></li>
+<li class="chapter" data-level="1.1" data-path="index.html"><a href="index.html#principles-of-this-book---for-instructors"><i class="fa fa-check"></i><b>1.1</b> Principles of this Book - For Instructors</a></li>
 <li class="chapter" data-level="1.2" data-path="index.html"><a href="index.html#contribute"><i class="fa fa-check"></i><b>1.2</b> Contribute</a></li>
-<li class="chapter" data-level="1.3" data-path="index.html"><a href="index.html#getting-started"><i class="fa fa-check"></i><b>1.3</b> Getting Started</a></li>
+<li class="chapter" data-level="1.3" data-path="index.html"><a href="index.html#getting-started---for-students"><i class="fa fa-check"></i><b>1.3</b> Getting Started - For Students</a></li>
 <li class="chapter" data-level="" data-path="index.html"><a href="index.html#colophon"><i class="fa fa-check"></i>Colophon</a></li>
 </ul></li>
 <li class="chapter" data-level="2" data-path="2-intro.html"><a href="2-intro.html"><i class="fa fa-check"></i><b>2</b> Introduction</a><ul>
@@ -133,87 +133,100 @@
 </ul></li>
 <li class="part"><span><b>I Data Exploration</b></span></li>
 <li class="chapter" data-level="3" data-path="3-tidy.html"><a href="3-tidy.html"><i class="fa fa-check"></i><b>3</b> Tidy Data</a><ul>
+<li class="chapter" data-level="" data-path="3-tidy.html"><a href="3-tidy.html#needed-packages"><i class="fa fa-check"></i>Needed packages</a></li>
 <li class="chapter" data-level="3.1" data-path="3-tidy.html"><a href="3-tidy.html#what-is-tidy-data"><i class="fa fa-check"></i><b>3.1</b> What is tidy data?</a></li>
-<li class="chapter" data-level="3.2" data-path="3-tidy.html"><a href="3-tidy.html#the-nycflights13-datasets"><i class="fa fa-check"></i><b>3.2</b> The <code>nycflights13</code> datasets</a></li>
-<li class="chapter" data-level="3.3" data-path="3-tidy.html"><a href="3-tidy.html#how-is-flights-tidy"><i class="fa fa-check"></i><b>3.3</b> How is <code>flights</code> tidy?</a></li>
+<li class="chapter" data-level="3.2" data-path="3-tidy.html"><a href="3-tidy.html#datasets-in-the-nycflights13-package"><i class="fa fa-check"></i><b>3.2</b> Datasets in the <code>nycflights13</code> package</a></li>
+<li class="chapter" data-level="3.3" data-path="3-tidy.html"><a href="3-tidy.html#how-is-flights-tidy"><i class="fa fa-check"></i><b>3.3</b> How is <code>flights</code> tidy?</a><ul>
+<li class="chapter" data-level="3.3.1" data-path="3-tidy.html"><a href="3-tidy.html#identification-variables"><i class="fa fa-check"></i><b>3.3.1</b> Identification variables</a></li>
+</ul></li>
 <li class="chapter" data-level="3.4" data-path="3-tidy.html"><a href="3-tidy.html#normal-forms-of-data"><i class="fa fa-check"></i><b>3.4</b> Normal forms of data</a></li>
 <li class="chapter" data-level="3.5" data-path="3-tidy.html"><a href="3-tidy.html#whats-to-come"><i class="fa fa-check"></i><b>3.5</b> What’s to come?</a></li>
 </ul></li>
-<li class="chapter" data-level="4" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html"><i class="fa fa-check"></i><b>4</b> Data Visualization via <code id="viz">ggplot2</code></a><ul>
-<li class="chapter" data-level="" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#needed-packages"><i class="fa fa-check"></i>Needed packages</a></li>
-<li class="chapter" data-level="4.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#grammarofgraphics"><i class="fa fa-check"></i><b>4.1</b> The Grammar of Graphics</a><ul>
-<li class="chapter" data-level="4.1.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#components-of-grammar"><i class="fa fa-check"></i><b>4.1.1</b> Components of Grammar</a></li>
-<li class="chapter" data-level="4.1.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#napoleans-march-on-moscow"><i class="fa fa-check"></i><b>4.1.2</b> Napolean’s March on Moscow</a></li>
-<li class="chapter" data-level="4.1.3" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#other-components-of-the-grammar"><i class="fa fa-check"></i><b>4.1.3</b> Other Components of the Grammar</a></li>
-<li class="chapter" data-level="4.1.4" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#the-ggplot2-package"><i class="fa fa-check"></i><b>4.1.4</b> The <code>ggplot2</code> Package</a></li>
-</ul></li>
-<li class="chapter" data-level="4.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#five-named-graphs---the-5ng"><i class="fa fa-check"></i><b>4.2</b> Five Named Graphs - The 5NG</a></li>
-<li class="chapter" data-level="4.3" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#scatterplots"><i class="fa fa-check"></i><b>4.3</b> 5NG#1: Scatter-plots</a><ul>
-<li class="chapter" data-level="4.3.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#scatter-plots-via-geom_point"><i class="fa fa-check"></i><b>4.3.1</b> Scatter-plots via <code id="geompoint">geom_point</code></a></li>
-<li class="chapter" data-level="4.3.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#over-plotting"><i class="fa fa-check"></i><b>4.3.2</b> Over-Plotting</a></li>
-<li class="chapter" data-level="4.3.3" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#summary"><i class="fa fa-check"></i><b>4.3.3</b> Summary</a></li>
-</ul></li>
-<li class="chapter" data-level="4.4" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#linegraphs"><i class="fa fa-check"></i><b>4.4</b> 5NG#2: Line-graphs</a><ul>
-<li class="chapter" data-level="4.4.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#line-graphs-via-geom_line"><i class="fa fa-check"></i><b>4.4.1</b> Line-graphs via <code id="geomline">geom_line</code></a></li>
-<li class="chapter" data-level="4.4.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#summary-1"><i class="fa fa-check"></i><b>4.4.2</b> Summary</a></li>
-</ul></li>
-<li class="chapter" data-level="4.5" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#histograms"><i class="fa fa-check"></i><b>4.5</b> 5NG#3: Histograms</a><ul>
-<li class="chapter" data-level="4.5.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#histograms-via-geom_histogram"><i class="fa fa-check"></i><b>4.5.1</b> Histograms via <code id="geomhistogram">geom_histogram</code></a></li>
-<li class="chapter" data-level="4.5.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#adjustbins"><i class="fa fa-check"></i><b>4.5.2</b> Adjusting the Bins</a></li>
-<li class="chapter" data-level="4.5.3" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#summary-2"><i class="fa fa-check"></i><b>4.5.3</b> Summary</a></li>
-</ul></li>
-<li class="chapter" data-level="4.6" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#facets"><i class="fa fa-check"></i><b>4.6</b> Facets</a></li>
-<li class="chapter" data-level="4.7" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#ng4-boxplots"><i class="fa fa-check"></i><b>4.7</b> 5NG#4: Boxplots</a><ul>
-<li class="chapter" data-level="4.7.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#boxplots-via-geom_boxplot"><i class="fa fa-check"></i><b>4.7.1</b> Boxplots via <code id="geomboxplot">geom_boxplot</code></a></li>
-<li class="chapter" data-level="4.7.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#summary-3"><i class="fa fa-check"></i><b>4.7.2</b> Summary</a></li>
-</ul></li>
-<li class="chapter" data-level="4.8" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#geombar"><i class="fa fa-check"></i><b>4.8</b> 5NG#5: Barplots</a><ul>
-<li class="chapter" data-level="4.8.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#barplots-via-geom_bar"><i class="fa fa-check"></i><b>4.8.1</b> Barplots via <code>geom_bar</code></a></li>
-<li class="chapter" data-level="4.8.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#must-avoid-pie-charts"><i class="fa fa-check"></i><b>4.8.2</b> Must avoid pie charts!</a></li>
-<li class="chapter" data-level="4.8.3" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#using-barplots-to-compare-two-variables"><i class="fa fa-check"></i><b>4.8.3</b> Using barplots to compare two variables</a></li>
-<li class="chapter" data-level="4.8.4" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#summary-4"><i class="fa fa-check"></i><b>4.8.4</b> Summary</a></li>
-</ul></li>
-<li class="chapter" data-level="4.9" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#conclusion"><i class="fa fa-check"></i><b>4.9</b> Conclusion</a><ul>
-<li class="chapter" data-level="4.9.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#whats-to-come-1"><i class="fa fa-check"></i><b>4.9.1</b> What’s to come?</a></li>
-<li class="chapter" data-level="4.9.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#script-of-r-code"><i class="fa fa-check"></i><b>4.9.2</b> Script of R code</a></li>
-</ul></li>
-</ul></li>
-<li class="chapter" data-level="5" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html"><i class="fa fa-check"></i><b>5</b> Data Manipulation via <code id="manip">dplyr</code></a><ul>
-<li class="chapter" data-level="" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#needed-packages-1"><i class="fa fa-check"></i>Needed packages</a></li>
-<li class="chapter" data-level="5.1" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#the-pipe"><i class="fa fa-check"></i><b>5.1</b> The pipe <code>%&gt;%</code></a></li>
-<li class="chapter" data-level="5.2" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#four-main-verbs---the-4mv"><i class="fa fa-check"></i><b>5.2</b> Four Main Verbs - The 4MV</a><ul>
-<li class="chapter" data-level="5.2.1" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#filter-observations-using-filter"><i class="fa fa-check"></i><b>5.2.1</b> Filter observations using <code id="filter">filter</code></a></li>
-<li class="chapter" data-level="5.2.2" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#summarize-variables-using-summarize"><i class="fa fa-check"></i><b>5.2.2</b> Summarize variables using <code>summarize</code></a></li>
-<li class="chapter" data-level="5.2.3" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#create-new-variableschange-old-variables-using-mutate"><i class="fa fa-check"></i><b>5.2.3</b> Create new variables/change old variables using <code>mutate</code></a></li>
-<li class="chapter" data-level="5.2.4" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#reorder-the-data-frame-using-arrange"><i class="fa fa-check"></i><b>5.2.4</b> Reorder the data frame using <code id="arrange">arrange</code></a></li>
-</ul></li>
-<li class="chapter" data-level="5.3" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#other-verbs"><i class="fa fa-check"></i><b>5.3</b> Other verbs</a><ul>
-<li class="chapter" data-level="5.3.1" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#select-variables-using-select"><i class="fa fa-check"></i><b>5.3.1</b> Select variables using <code id="select">select</code></a></li>
-<li class="chapter" data-level="5.3.2" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#rename-variables-using-rename"><i class="fa fa-check"></i><b>5.3.2</b> Rename variables using <code id="rename">rename</code></a></li>
-<li class="chapter" data-level="5.3.3" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#find-the-top-number-of-values-using-top_n"><i class="fa fa-check"></i><b>5.3.3</b> Find the top number of values using <code>top_n</code></a></li>
-</ul></li>
-<li class="chapter" data-level="5.4" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#joiningmerging-data-frames"><i class="fa fa-check"></i><b>5.4</b> Joining/merging data frames</a></li>
-<li class="chapter" data-level="5.5" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#script-of-r-code-1"><i class="fa fa-check"></i><b>5.5</b> Script of R code</a></li>
-<li class="chapter" data-level="5.6" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#whats-to-come-2"><i class="fa fa-check"></i><b>5.6</b> What’s to come?</a></li>
+<li class="chapter" data-level="4" data-path="4-viz.html"><a href="4-viz.html"><i class="fa fa-check"></i><b>4</b> Data Visualization via ggplot2</a><ul>
+<li class="chapter" data-level="" data-path="4-viz.html"><a href="4-viz.html#needed-packages-1"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="4.1" data-path="4-viz.html"><a href="4-viz.html#grammarofgraphics"><i class="fa fa-check"></i><b>4.1</b> The Grammar of Graphics</a><ul>
+<li class="chapter" data-level="4.1.1" data-path="4-viz.html"><a href="4-viz.html#components-of-grammar"><i class="fa fa-check"></i><b>4.1.1</b> Components of Grammar</a></li>
+<li class="chapter" data-level="4.1.2" data-path="4-viz.html"><a href="4-viz.html#napoleans-march-on-moscow"><i class="fa fa-check"></i><b>4.1.2</b> Napolean’s March on Moscow</a></li>
+<li class="chapter" data-level="4.1.3" data-path="4-viz.html"><a href="4-viz.html#other-components-of-the-grammar"><i class="fa fa-check"></i><b>4.1.3</b> Other Components of the Grammar</a></li>
+<li class="chapter" data-level="4.1.4" data-path="4-viz.html"><a href="4-viz.html#the-ggplot2-package"><i class="fa fa-check"></i><b>4.1.4</b> The ggplot2 Package</a></li>
+</ul></li>
+<li class="chapter" data-level="4.2" data-path="4-viz.html"><a href="4-viz.html#five-named-graphs---the-5ng"><i class="fa fa-check"></i><b>4.2</b> Five Named Graphs - The 5NG</a></li>
+<li class="chapter" data-level="4.3" data-path="4-viz.html"><a href="4-viz.html#scatterplots"><i class="fa fa-check"></i><b>4.3</b> 5NG#1: Scatter-plots</a><ul>
+<li class="chapter" data-level="4.3.1" data-path="4-viz.html"><a href="4-viz.html#geompoint"><i class="fa fa-check"></i><b>4.3.1</b> Scatter-plots via geom_point</a></li>
+<li class="chapter" data-level="4.3.2" data-path="4-viz.html"><a href="4-viz.html#over-plotting"><i class="fa fa-check"></i><b>4.3.2</b> Over-Plotting</a></li>
+<li class="chapter" data-level="4.3.3" data-path="4-viz.html"><a href="4-viz.html#summary"><i class="fa fa-check"></i><b>4.3.3</b> Summary</a></li>
+</ul></li>
+<li class="chapter" data-level="4.4" data-path="4-viz.html"><a href="4-viz.html#linegraphs"><i class="fa fa-check"></i><b>4.4</b> 5NG#2: Line-graphs</a><ul>
+<li class="chapter" data-level="4.4.1" data-path="4-viz.html"><a href="4-viz.html#geomline"><i class="fa fa-check"></i><b>4.4.1</b> Line-graphs via geom_line</a></li>
+<li class="chapter" data-level="4.4.2" data-path="4-viz.html"><a href="4-viz.html#summary-1"><i class="fa fa-check"></i><b>4.4.2</b> Summary</a></li>
+</ul></li>
+<li class="chapter" data-level="4.5" data-path="4-viz.html"><a href="4-viz.html#histograms"><i class="fa fa-check"></i><b>4.5</b> 5NG#3: Histograms</a><ul>
+<li class="chapter" data-level="4.5.1" data-path="4-viz.html"><a href="4-viz.html#geomhistogram"><i class="fa fa-check"></i><b>4.5.1</b> Histograms via geom_histogram</a></li>
+<li class="chapter" data-level="4.5.2" data-path="4-viz.html"><a href="4-viz.html#adjustbins"><i class="fa fa-check"></i><b>4.5.2</b> Adjusting the Bins</a></li>
+<li class="chapter" data-level="4.5.3" data-path="4-viz.html"><a href="4-viz.html#summary-2"><i class="fa fa-check"></i><b>4.5.3</b> Summary</a></li>
+</ul></li>
+<li class="chapter" data-level="4.6" data-path="4-viz.html"><a href="4-viz.html#facets"><i class="fa fa-check"></i><b>4.6</b> Facets</a></li>
+<li class="chapter" data-level="4.7" data-path="4-viz.html"><a href="4-viz.html#ng4-boxplots"><i class="fa fa-check"></i><b>4.7</b> 5NG#4: Boxplots</a><ul>
+<li class="chapter" data-level="4.7.1" data-path="4-viz.html"><a href="4-viz.html#geomboxplot"><i class="fa fa-check"></i><b>4.7.1</b> Boxplots via geom_boxplot</a></li>
+<li class="chapter" data-level="4.7.2" data-path="4-viz.html"><a href="4-viz.html#summary-3"><i class="fa fa-check"></i><b>4.7.2</b> Summary</a></li>
+</ul></li>
+<li class="chapter" data-level="4.8" data-path="4-viz.html"><a href="4-viz.html#geombar"><i class="fa fa-check"></i><b>4.8</b> 5NG#5: Barplots</a><ul>
+<li class="chapter" data-level="4.8.1" data-path="4-viz.html"><a href="4-viz.html#barplots-via-geom_bar"><i class="fa fa-check"></i><b>4.8.1</b> Barplots via geom_bar</a></li>
+<li class="chapter" data-level="4.8.2" data-path="4-viz.html"><a href="4-viz.html#must-avoid-pie-charts"><i class="fa fa-check"></i><b>4.8.2</b> Must avoid pie charts!</a></li>
+<li class="chapter" data-level="4.8.3" data-path="4-viz.html"><a href="4-viz.html#using-barplots-to-compare-two-variables"><i class="fa fa-check"></i><b>4.8.3</b> Using barplots to compare two variables</a></li>
+<li class="chapter" data-level="4.8.4" data-path="4-viz.html"><a href="4-viz.html#summary-4"><i class="fa fa-check"></i><b>4.8.4</b> Summary</a></li>
+</ul></li>
+<li class="chapter" data-level="4.9" data-path="4-viz.html"><a href="4-viz.html#conclusion"><i class="fa fa-check"></i><b>4.9</b> Conclusion</a><ul>
+<li class="chapter" data-level="4.9.1" data-path="4-viz.html"><a href="4-viz.html#resources"><i class="fa fa-check"></i><b>4.9.1</b> Resources</a></li>
+<li class="chapter" data-level="4.9.2" data-path="4-viz.html"><a href="4-viz.html#script-of-r-code"><i class="fa fa-check"></i><b>4.9.2</b> Script of R code</a></li>
+<li class="chapter" data-level="4.9.3" data-path="4-viz.html"><a href="4-viz.html#whats-to-come-1"><i class="fa fa-check"></i><b>4.9.3</b> What’s to come?</a></li>
+</ul></li>
+</ul></li>
+<li class="chapter" data-level="5" data-path="5-manip.html"><a href="5-manip.html"><i class="fa fa-check"></i><b>5</b> Data Manipulation via dplyr</a><ul>
+<li class="chapter" data-level="" data-path="5-manip.html"><a href="5-manip.html#needed-packages-2"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="5.1" data-path="5-manip.html"><a href="5-manip.html#the-pipe"><i class="fa fa-check"></i><b>5.1</b> The pipe <code>%&gt;%</code></a></li>
+<li class="chapter" data-level="5.2" data-path="5-manip.html"><a href="5-manip.html#five-main-verbs---the-5mv"><i class="fa fa-check"></i><b>5.2</b> Five Main Verbs - The 5MV</a><ul>
+<li class="chapter" data-level="5.2.1" data-path="5-manip.html"><a href="5-manip.html#filter"><i class="fa fa-check"></i><b>5.2.1</b> 5MV#1: Filter observations using filter</a></li>
+<li class="chapter" data-level="5.2.2" data-path="5-manip.html"><a href="5-manip.html#mv2-summarize-variables-using-summarize"><i class="fa fa-check"></i><b>5.2.2</b> 5MV#2: Summarize variables using summarize</a></li>
+<li class="chapter" data-level="5.2.3" data-path="5-manip.html"><a href="5-manip.html#mv3-group-rows-using-group_by"><i class="fa fa-check"></i><b>5.2.3</b> 5MV#3: Group rows using group_by</a></li>
+<li class="chapter" data-level="5.2.4" data-path="5-manip.html"><a href="5-manip.html#mv4-create-new-variableschange-old-variables-using-mutate"><i class="fa fa-check"></i><b>5.2.4</b> 5MV#4: Create new variables/change old variables using mutate</a></li>
+<li class="chapter" data-level="5.2.5" data-path="5-manip.html"><a href="5-manip.html#arrange"><i class="fa fa-check"></i><b>5.2.5</b> 5MV#5: Reorder the data frame using arrange</a></li>
+</ul></li>
+<li class="chapter" data-level="5.3" data-path="5-manip.html"><a href="5-manip.html#joining-data-frames"><i class="fa fa-check"></i><b>5.3</b> Joining data frames</a><ul>
+<li class="chapter" data-level="5.3.1" data-path="5-manip.html"><a href="5-manip.html#joining-by-key-variables"><i class="fa fa-check"></i><b>5.3.1</b> Joining by Key Variables</a></li>
+<li class="chapter" data-level="5.3.2" data-path="5-manip.html"><a href="5-manip.html#joining-by-key-variables-with-different-names"><i class="fa fa-check"></i><b>5.3.2</b> Joining by Key Variables with Different Names</a></li>
+</ul></li>
+<li class="chapter" data-level="5.4" data-path="5-manip.html"><a href="5-manip.html#optional-other-verbs"><i class="fa fa-check"></i><b>5.4</b> Optional: Other verbs</a><ul>
+<li class="chapter" data-level="5.4.1" data-path="5-manip.html"><a href="5-manip.html#select"><i class="fa fa-check"></i><b>5.4.1</b> Select variables using select</a></li>
+<li class="chapter" data-level="5.4.2" data-path="5-manip.html"><a href="5-manip.html#rename"><i class="fa fa-check"></i><b>5.4.2</b> Rename variables using rename</a></li>
+<li class="chapter" data-level="5.4.3" data-path="5-manip.html"><a href="5-manip.html#find-the-top-number-of-values-using-top_n"><i class="fa fa-check"></i><b>5.4.3</b> Find the top number of values using top_n</a></li>
+</ul></li>
+<li class="chapter" data-level="5.5" data-path="5-manip.html"><a href="5-manip.html#conclusion-1"><i class="fa fa-check"></i><b>5.5</b> Conclusion</a><ul>
+<li class="chapter" data-level="5.5.1" data-path="5-manip.html"><a href="5-manip.html#resources-1"><i class="fa fa-check"></i><b>5.5.1</b> Resources</a></li>
+<li class="chapter" data-level="5.5.2" data-path="5-manip.html"><a href="5-manip.html#script-of-r-code-1"><i class="fa fa-check"></i><b>5.5.2</b> Script of R code</a></li>
+<li class="chapter" data-level="5.5.3" data-path="5-manip.html"><a href="5-manip.html#whats-to-come-2"><i class="fa fa-check"></i><b>5.5.3</b> What’s to come?</a></li>
+</ul></li>
 </ul></li>
 <li class="part"><span><b>II Inference</b></span></li>
-<li class="chapter" data-level="6" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html"><i class="fa fa-check"></i><b>6</b> Simulating Randomness via <code id="sim">mosaic</code></a><ul>
-<li class="chapter" data-level="" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#needed-packages-2"><i class="fa fa-check"></i>Needed packages</a></li>
-<li class="chapter" data-level="6.1" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#random-sampling"><i class="fa fa-check"></i><b>6.1</b> Random sampling</a><ul>
-<li class="chapter" data-level="6.1.1" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#tasting-soup"><i class="fa fa-check"></i><b>6.1.1</b> Tasting soup</a></li>
-<li class="chapter" data-level="6.1.2" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#common-terms"><i class="fa fa-check"></i><b>6.1.2</b> Common terms</a></li>
+<li class="chapter" data-level="6" data-path="6-sim.html"><a href="6-sim.html"><i class="fa fa-check"></i><b>6</b> Simulating Randomness via mosaic</a><ul>
+<li class="chapter" data-level="" data-path="6-sim.html"><a href="6-sim.html#needed-packages-3"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="6.1" data-path="6-sim.html"><a href="6-sim.html#random-sampling"><i class="fa fa-check"></i><b>6.1</b> Random sampling</a><ul>
+<li class="chapter" data-level="6.1.1" data-path="6-sim.html"><a href="6-sim.html#tasting-soup"><i class="fa fa-check"></i><b>6.1.1</b> Tasting soup</a></li>
+<li class="chapter" data-level="6.1.2" data-path="6-sim.html"><a href="6-sim.html#common-terms"><i class="fa fa-check"></i><b>6.1.2</b> Common terms</a></li>
 </ul></li>
-<li class="chapter" data-level="6.2" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#visualizing-sampling"><i class="fa fa-check"></i><b>6.2</b> Visualizing sampling</a><ul>
-<li class="chapter" data-level="6.2.1" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#sampling-distribution"><i class="fa fa-check"></i><b>6.2.1</b> Sampling distribution</a></li>
-<li class="chapter" data-level="6.2.2" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#repeated-sampling-via-do"><i class="fa fa-check"></i><b>6.2.2</b> Repeated sampling via <code>do</code></a></li>
+<li class="chapter" data-level="6.2" data-path="6-sim.html"><a href="6-sim.html#visualizing-sampling"><i class="fa fa-check"></i><b>6.2</b> Visualizing sampling</a><ul>
+<li class="chapter" data-level="6.2.1" data-path="6-sim.html"><a href="6-sim.html#sampling-distribution"><i class="fa fa-check"></i><b>6.2.1</b> Sampling distribution</a></li>
+<li class="chapter" data-level="6.2.2" data-path="6-sim.html"><a href="6-sim.html#repeated-sampling-via-do"><i class="fa fa-check"></i><b>6.2.2</b> Repeated sampling via <code>do</code></a></li>
+</ul></li>
+<li class="chapter" data-level="6.3" data-path="6-sim.html"><a href="6-sim.html#simulation"><i class="fa fa-check"></i><b>6.3</b> Simulation</a></li>
+<li class="chapter" data-level="6.4" data-path="6-sim.html"><a href="6-sim.html#review-of-mosaic-simulation-functions"><i class="fa fa-check"></i><b>6.4</b> Review of <code>mosaic</code> simulation functions</a></li>
+<li class="chapter" data-level="6.5" data-path="6-sim.html"><a href="6-sim.html#conclusion-2"><i class="fa fa-check"></i><b>6.5</b> Conclusion</a><ul>
+<li class="chapter" data-level="6.5.1" data-path="6-sim.html"><a href="6-sim.html#script-of-r-code-2"><i class="fa fa-check"></i><b>6.5.1</b> Script of R code</a></li>
+<li class="chapter" data-level="6.5.2" data-path="6-sim.html"><a href="6-sim.html#whats-to-come-3"><i class="fa fa-check"></i><b>6.5.2</b> What’s to come?</a></li>
 </ul></li>
-<li class="chapter" data-level="6.3" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#simulation"><i class="fa fa-check"></i><b>6.3</b> Simulation</a></li>
-<li class="chapter" data-level="6.4" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#review-of-mosaic-simulation-functions"><i class="fa fa-check"></i><b>6.4</b> Review of <code>mosaic</code> simulation functions</a></li>
-<li class="chapter" data-level="6.5" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#script-of-r-code-2"><i class="fa fa-check"></i><b>6.5</b> Script of R code</a></li>
-<li class="chapter" data-level="6.6" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#whats-to-come-3"><i class="fa fa-check"></i><b>6.6</b> What’s to come?</a></li>
 </ul></li>
 <li class="chapter" data-level="7" data-path="7-hypo.html"><a href="7-hypo.html"><i class="fa fa-check"></i><b>7</b> Hypothesis Testing</a><ul>
-<li class="chapter" data-level="" data-path="7-hypo.html"><a href="7-hypo.html#needed-packages-3"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="" data-path="7-hypo.html"><a href="7-hypo.html#needed-packages-4"><i class="fa fa-check"></i>Needed packages</a></li>
 <li class="chapter" data-level="7.1" data-path="7-hypo.html"><a href="7-hypo.html#when-inference-is-not-needed"><i class="fa fa-check"></i><b>7.1</b> When Inference Is Not Needed</a></li>
 <li class="chapter" data-level="7.2" data-path="7-hypo.html"><a href="7-hypo.html#basics-of-hypothesis-testing"><i class="fa fa-check"></i><b>7.2</b> Basics of Hypothesis Testing</a></li>
 <li class="chapter" data-level="7.3" data-path="7-hypo.html"><a href="7-hypo.html#trial"><i class="fa fa-check"></i><b>7.3</b> Criminal trial analogy</a><ul>
@@ -249,44 +262,50 @@
 <li class="chapter" data-level="7.8.1" data-path="7-hypo.html"><a href="7-hypo.html#example-t-test-for-two-independent-samples"><i class="fa fa-check"></i><b>7.8.1</b> EXAMPLE: <span class="math inline">\(t\)</span>-test for two independent samples</a></li>
 <li class="chapter" data-level="7.8.2" data-path="7-hypo.html"><a href="7-hypo.html#conditions-for-t-test"><i class="fa fa-check"></i><b>7.8.2</b> Conditions for t-test</a></li>
 </ul></li>
-<li class="chapter" data-level="7.9" data-path="7-hypo.html"><a href="7-hypo.html#whats-to-come-4"><i class="fa fa-check"></i><b>7.9</b> What’s to come?</a></li>
+<li class="chapter" data-level="7.9" data-path="7-hypo.html"><a href="7-hypo.html#conclusion-3"><i class="fa fa-check"></i><b>7.9</b> Conclusion</a><ul>
+<li class="chapter" data-level="7.9.1" data-path="7-hypo.html"><a href="7-hypo.html#script-of-r-code-3"><i class="fa fa-check"></i><b>7.9.1</b> Script of R code</a></li>
+<li class="chapter" data-level="7.9.2" data-path="7-hypo.html"><a href="7-hypo.html#whats-to-come-4"><i class="fa fa-check"></i><b>7.9.2</b> What’s to come?</a></li>
+</ul></li>
 </ul></li>
 <li class="chapter" data-level="8" data-path="8-ci.html"><a href="8-ci.html"><i class="fa fa-check"></i><b>8</b> Confidence Intervals</a><ul>
-<li class="chapter" data-level="" data-path="8-ci.html"><a href="8-ci.html#needed-packages-4"><i class="fa fa-check"></i>Needed packages</a></li>
-<li class="chapter" data-level="8.1" data-path="8-ci.html"><a href="8-ci.html#bootstrapping"><i class="fa fa-check"></i><b>8.1</b> Bootstrapping</a></li>
-<li class="chapter" data-level="8.2" data-path="8-ci.html"><a href="8-ci.html#mean_rating"><i class="fa fa-check"></i><b>8.2</b> mean_rating</a><ul>
-<li class="chapter" data-level="8.2.1" data-path="8-ci.html"><a href="8-ci.html#review-of-bootstrapping"><i class="fa fa-check"></i><b>8.2.1</b> Review of Bootstrapping</a></li>
-</ul></li>
-<li class="chapter" data-level="8.3" data-path="8-ci.html"><a href="8-ci.html#relation-to-hypothesis-testing"><i class="fa fa-check"></i><b>8.3</b> Relation to hypothesis testing</a></li>
-<li class="chapter" data-level="8.4" data-path="8-ci.html"><a href="8-ci.html#effect-size"><i class="fa fa-check"></i><b>8.4</b> Effect size</a></li>
-<li class="chapter" data-level="8.5" data-path="8-ci.html"><a href="8-ci.html#script-of-r-code-3"><i class="fa fa-check"></i><b>8.5</b> Script of R code</a></li>
-<li class="chapter" data-level="8.6" data-path="8-ci.html"><a href="8-ci.html#whats-to-come-5"><i class="fa fa-check"></i><b>8.6</b> What’s to come?</a></li>
-</ul></li>
-<li class="chapter" data-level="9" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html"><i class="fa fa-check"></i><b>9</b> Regression via <code id="regress">broom</code></a><ul>
-<li class="chapter" data-level="" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#needed-packages-5"><i class="fa fa-check"></i>Needed packages</a></li>
-<li class="chapter" data-level="9.1" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#example-alaskan-airlines-delays"><i class="fa fa-check"></i><b>9.1</b> EXAMPLE: Alaskan Airlines delays</a></li>
-<li class="chapter" data-level="9.2" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#correlation"><i class="fa fa-check"></i><b>9.2</b> Correlation</a><ul>
-<li class="chapter" data-level="9.2.1" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#correlation-does-not-imply-causation"><i class="fa fa-check"></i><b>9.2.1</b> Correlation does not imply causation</a></li>
-</ul></li>
-<li class="chapter" data-level="9.3" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#linear-regression"><i class="fa fa-check"></i><b>9.3</b> Linear regression</a><ul>
-<li class="chapter" data-level="9.3.1" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#understanding-linear-regression-basics"><i class="fa fa-check"></i><b>9.3.1</b> Understanding linear regression basics</a></li>
-<li class="chapter" data-level="9.3.2" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#the-equation-of-the-line"><i class="fa fa-check"></i><b>9.3.2</b> The equation of the line</a></li>
-<li class="chapter" data-level="9.3.3" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#interpretting-the-slope"><i class="fa fa-check"></i><b>9.3.3</b> Interpretting the slope</a></li>
-<li class="chapter" data-level="9.3.4" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#predicting-values"><i class="fa fa-check"></i><b>9.3.4</b> Predicting values</a></li>
-</ul></li>
-<li class="chapter" data-level="9.4" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#inference-for-regression"><i class="fa fa-check"></i><b>9.4</b> Inference for regression</a><ul>
-<li class="chapter" data-level="9.4.1" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#data-2"><i class="fa fa-check"></i><b>9.4.1</b> Data</a></li>
-<li class="chapter" data-level="9.4.2" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#test-statistic-delta-2"><i class="fa fa-check"></i><b>9.4.2</b> Test Statistic <span class="math inline">\(\delta\)</span></a></li>
-<li class="chapter" data-level="9.4.3" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#observed-effect-delta-2"><i class="fa fa-check"></i><b>9.4.3</b> Observed effect <span class="math inline">\(\delta^*\)</span></a></li>
-<li class="chapter" data-level="9.4.4" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#model-of-h_0-2"><i class="fa fa-check"></i><b>9.4.4</b> Model of <span class="math inline">\(H_0\)</span></a></li>
-<li class="chapter" data-level="9.4.5" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#simulated-data-2"><i class="fa fa-check"></i><b>9.4.5</b> Simulated Data</a></li>
-<li class="chapter" data-level="9.4.6" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#distribution-of-delta-under-h_0-2"><i class="fa fa-check"></i><b>9.4.6</b> Distribution of <span class="math inline">\(\delta\)</span> under <span class="math inline">\(H_0\)</span></a></li>
-<li class="chapter" data-level="9.4.7" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#the-p-value-2"><i class="fa fa-check"></i><b>9.4.7</b> The p-value</a></li>
-</ul></li>
-<li class="chapter" data-level="9.5" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#resid"><i class="fa fa-check"></i><b>9.5</b> Residual analysis</a></li>
-<li class="chapter" data-level="9.6" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#conditions-for-regression"><i class="fa fa-check"></i><b>9.6</b> Conditions for regression</a></li>
-<li class="chapter" data-level="9.7" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#script-of-r-code-4"><i class="fa fa-check"></i><b>9.7</b> Script of R code</a></li>
-<li class="chapter" data-level="9.8" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#whats-to-come-6"><i class="fa fa-check"></i><b>9.8</b> What’s to come?</a></li>
+<li class="chapter" data-level="" data-path="8-ci.html"><a href="8-ci.html#needed-packages-5"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="8.1" data-path="8-ci.html"><a href="8-ci.html#bootstrapping"><i class="fa fa-check"></i><b>8.1</b> Bootstrapping</a><ul>
+<li class="chapter" data-level="8.1.1" data-path="8-ci.html"><a href="8-ci.html#review-of-bootstrapping"><i class="fa fa-check"></i><b>8.1.1</b> Review of Bootstrapping</a></li>
+</ul></li>
+<li class="chapter" data-level="8.2" data-path="8-ci.html"><a href="8-ci.html#relation-to-hypothesis-testing"><i class="fa fa-check"></i><b>8.2</b> Relation to hypothesis testing</a></li>
+<li class="chapter" data-level="8.3" data-path="8-ci.html"><a href="8-ci.html#effect-size"><i class="fa fa-check"></i><b>8.3</b> Effect size</a></li>
+<li class="chapter" data-level="8.4" data-path="8-ci.html"><a href="8-ci.html#conclusion-4"><i class="fa fa-check"></i><b>8.4</b> Conclusion</a><ul>
+<li class="chapter" data-level="8.4.1" data-path="8-ci.html"><a href="8-ci.html#script-of-r-code-4"><i class="fa fa-check"></i><b>8.4.1</b> Script of R code</a></li>
+<li class="chapter" data-level="8.4.2" data-path="8-ci.html"><a href="8-ci.html#whats-to-come-5"><i class="fa fa-check"></i><b>8.4.2</b> What’s to come?</a></li>
+</ul></li>
+</ul></li>
+<li class="chapter" data-level="9" data-path="9-regress.html"><a href="9-regress.html"><i class="fa fa-check"></i><b>9</b> Regression via broom</a><ul>
+<li class="chapter" data-level="" data-path="9-regress.html"><a href="9-regress.html#needed-packages-6"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="9.1" data-path="9-regress.html"><a href="9-regress.html#example-alaskan-airlines-delays"><i class="fa fa-check"></i><b>9.1</b> EXAMPLE: Alaskan Airlines delays</a></li>
+<li class="chapter" data-level="9.2" data-path="9-regress.html"><a href="9-regress.html#correlation"><i class="fa fa-check"></i><b>9.2</b> Correlation</a><ul>
+<li class="chapter" data-level="9.2.1" data-path="9-regress.html"><a href="9-regress.html#correlation-does-not-imply-causation"><i class="fa fa-check"></i><b>9.2.1</b> Correlation does not imply causation</a></li>
+</ul></li>
+<li class="chapter" data-level="9.3" data-path="9-regress.html"><a href="9-regress.html#linear-regression"><i class="fa fa-check"></i><b>9.3</b> Linear regression</a><ul>
+<li class="chapter" data-level="9.3.1" data-path="9-regress.html"><a href="9-regress.html#understanding-linear-regression-basics"><i class="fa fa-check"></i><b>9.3.1</b> Understanding linear regression basics</a></li>
+<li class="chapter" data-level="9.3.2" data-path="9-regress.html"><a href="9-regress.html#the-equation-of-the-line"><i class="fa fa-check"></i><b>9.3.2</b> The equation of the line</a></li>
+<li class="chapter" data-level="9.3.3" data-path="9-regress.html"><a href="9-regress.html#interpreting-the-slope"><i class="fa fa-check"></i><b>9.3.3</b> Interpreting the slope</a></li>
+<li class="chapter" data-level="9.3.4" data-path="9-regress.html"><a href="9-regress.html#predicting-values"><i class="fa fa-check"></i><b>9.3.4</b> Predicting values</a></li>
+</ul></li>
+<li class="chapter" data-level="9.4" data-path="9-regress.html"><a href="9-regress.html#inference-for-regression"><i class="fa fa-check"></i><b>9.4</b> Inference for regression</a><ul>
+<li class="chapter" data-level="9.4.1" data-path="9-regress.html"><a href="9-regress.html#data-2"><i class="fa fa-check"></i><b>9.4.1</b> Data</a></li>
+<li class="chapter" data-level="9.4.2" data-path="9-regress.html"><a href="9-regress.html#test-statistic-delta-2"><i class="fa fa-check"></i><b>9.4.2</b> Test Statistic <span class="math inline">\(\delta\)</span></a></li>
+<li class="chapter" data-level="9.4.3" data-path="9-regress.html"><a href="9-regress.html#observed-effect-delta-2"><i class="fa fa-check"></i><b>9.4.3</b> Observed effect <span class="math inline">\(\delta^*\)</span></a></li>
+<li class="chapter" data-level="9.4.4" data-path="9-regress.html"><a href="9-regress.html#model-of-h_0-2"><i class="fa fa-check"></i><b>9.4.4</b> Model of <span class="math inline">\(H_0\)</span></a></li>
+<li class="chapter" data-level="9.4.5" data-path="9-regress.html"><a href="9-regress.html#simulated-data-2"><i class="fa fa-check"></i><b>9.4.5</b> Simulated Data</a></li>
+<li class="chapter" data-level="9.4.6" data-path="9-regress.html"><a href="9-regress.html#distribution-of-delta-under-h_0-2"><i class="fa fa-check"></i><b>9.4.6</b> Distribution of <span class="math inline">\(\delta\)</span> under <span class="math inline">\(H_0\)</span></a></li>
+<li class="chapter" data-level="9.4.7" data-path="9-regress.html"><a href="9-regress.html#the-p-value-2"><i class="fa fa-check"></i><b>9.4.7</b> The p-value</a></li>
+</ul></li>
+<li class="chapter" data-level="9.5" data-path="9-regress.html"><a href="9-regress.html#resid"><i class="fa fa-check"></i><b>9.5</b> Residual analysis</a></li>
+<li class="chapter" data-level="9.6" data-path="9-regress.html"><a href="9-regress.html#conditions-for-regression"><i class="fa fa-check"></i><b>9.6</b> Conditions for regression</a></li>
+<li class="chapter" data-level="9.7" data-path="9-regress.html"><a href="9-regress.html#conclusion-5"><i class="fa fa-check"></i><b>9.7</b> Conclusion</a><ul>
+<li class="chapter" data-level="9.7.1" data-path="9-regress.html"><a href="9-regress.html#script-of-r-code-5"><i class="fa fa-check"></i><b>9.7.1</b> Script of R code</a></li>
+<li class="chapter" data-level="9.7.2" data-path="9-regress.html"><a href="9-regress.html#whats-to-come-6"><i class="fa fa-check"></i><b>9.7.2</b> What’s to come?</a></li>
+</ul></li>
 </ul></li>
 <li class="part"><span><b>III Conclusion</b></span></li>
 <li class="chapter" data-level="10" data-path="10-effective-data-storytelling.html"><a href="10-effective-data-storytelling.html"><i class="fa fa-check"></i><b>10</b> Effective Data Storytelling</a><ul>
@@ -304,7 +323,8 @@
 </ul></li>
 </ul></li>
 <li class="chapter" data-level="B" data-path="B-appendixB.html"><a href="B-appendixB.html"><i class="fa fa-check"></i><b>B</b> Inference Examples</a><ul>
-<li class="chapter" data-level="B.1" data-path="B-appendixB.html"><a href="B-appendixB.html#needed-packages-6"><i class="fa fa-check"></i><b>B.1</b> Needed packages</a></li>
+<li class="chapter" data-level="" data-path="B-appendixB.html"><a href="B-appendixB.html#needed-packages-7"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="B.1" data-path="B-appendixB.html"><a href="B-appendixB.html#inference-mind-map"><i class="fa fa-check"></i><b>B.1</b> Inference Mind Map</a></li>
 <li class="chapter" data-level="B.2" data-path="B-appendixB.html"><a href="B-appendixB.html#one-mean"><i class="fa fa-check"></i><b>B.2</b> One Mean</a><ul>
 <li class="chapter" data-level="B.2.1" data-path="B-appendixB.html"><a href="B-appendixB.html#problem-statement"><i class="fa fa-check"></i><b>B.2.1</b> Problem Statement</a></li>
 <li class="chapter" data-level="B.2.2" data-path="B-appendixB.html"><a href="B-appendixB.html#competing-hypotheses"><i class="fa fa-check"></i><b>B.2.2</b> Competing Hypotheses</a></li>
@@ -352,6 +372,7 @@
 </ul></li>
 </ul></li>
 <li class="chapter" data-level="C" data-path="C-appendixC.html"><a href="C-appendixC.html"><i class="fa fa-check"></i><b>C</b> Reach for the Starts</a><ul>
+<li class="chapter" data-level="" data-path="C-appendixC.html"><a href="C-appendixC.html#needed-packages-8"><i class="fa fa-check"></i>Needed packages</a></li>
 <li class="chapter" data-level="C.1" data-path="C-appendixC.html"><a href="C-appendixC.html#sorted-barplots"><i class="fa fa-check"></i><b>C.1</b> Sorted barplots</a></li>
 <li class="chapter" data-level="C.2" data-path="C-appendixC.html"><a href="C-appendixC.html#interactive-graphics"><i class="fa fa-check"></i><b>C.2</b> Interactive graphics</a><ul>
 <li class="chapter" data-level="C.2.1" data-path="C-appendixC.html"><a href="C-appendixC.html#interactive-line-graphs"><i class="fa fa-check"></i><b>C.2.1</b> Interactive line-graphs</a></li>
@@ -377,13 +398,18 @@ <h1>
             <section class="normal" id="section-">
 <div id="appendixC" class="section level1">
 <h1><span class="header-section-number">C</span> Reach for the Starts</h1>
+<div id="needed-packages-8" class="section level2 unnumbered">
+<h2>Needed packages</h2>
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">library</span>(dplyr)
+<span class="kw">library</span>(ggplot2)
+<span class="kw">library</span>(knitr)
+<span class="kw">library</span>(dygraphs)
+<span class="kw">library</span>(nycflights13)</code></pre></div>
+</div>
 <div id="sorted-barplots" class="section level2">
 <h2><span class="header-section-number">C.1</span> Sorted barplots</h2>
 <p>Building upon the example in Section <a href="#barplots"><strong>??</strong></a>:</p>
-<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">library</span>(nycflights13)
-<span class="kw">library</span>(ggplot2)
-<span class="kw">library</span>(dplyr)
-flights_table &lt;-<span class="st"> </span><span class="kw">table</span>(flights$carrier)
+<div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">flights_table &lt;-<span class="st"> </span><span class="kw">table</span>(flights$carrier)
 flights_table</code></pre></div>
 <pre><code>## 
 ##    9E    AA    AS    B6    DL    EV    F9    FL    HA    MQ    OO    UA 
@@ -400,8 +426,8 @@ <h2><span class="header-section-number">C.1</span> Sorted barplots</h2>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r"><span class="kw">ggplot</span>(<span class="dt">data =</span> flights, <span class="dt">mapping =</span> <span class="kw">aes</span>(<span class="dt">x =</span> carrier)) +
 <span class="st">  </span><span class="kw">geom_bar</span>() +
 <span class="st">  </span><span class="kw">scale_x_discrete</span>(<span class="dt">limits =</span> <span class="kw">names</span>(sorted_flights))</code></pre></div>
-<div class="figure" style="text-align: center"><span id="fig:unnamed-chunk-170"></span>
-<img src="ismaykim_files/figure-html/unnamed-chunk-170-1.png" alt="Number of flights departing NYC in 2013 by airline - Descending numbers" width="\textwidth" />
+<div class="figure" style="text-align: center"><span id="fig:unnamed-chunk-175"></span>
+<img src="ismaykim_files/figure-html/unnamed-chunk-175-1.png" alt="Number of flights departing NYC in 2013 by airline - Descending numbers" width="\textwidth" />
 <p class="caption">
 Figure C.1: Number of flights departing NYC in 2013 by airline - Descending numbers
 </p>
@@ -426,7 +452,7 @@ <h3><span class="header-section-number">C.2.1</span> Interactive line-graphs</h3
 <p><br></p>
 <p>The syntax here is a little different than what we have covered so far. The <code>dygraph</code> function is expecting for the dates to be given as the <code>rownames</code> of the object. We then remove the <code>date</code> variable from the <code>flights_summarized</code> dataframe since it is accounted for in the <code>rownames</code>. Lastly, we run the <code>dygraph</code> function on the new dataframe that only contains the median arrival delay as a column and then provide the ability to have a selector to zoom in on the interactive plot via <code>dyRangeSelector</code>. (Note that this plot will only be interactive in the HTML version of this book.)</p>
 <!--
-**(LC9.9)** Use the interactive line-graph to determine the highest median arrival delay for flights from NYC in 2013.  What date was it and what do you think contributed to it?
+**`paste0("(LC", chap, ".", (lc <- lc + 1), ")")`** Use the interactive line-graph to determine the highest median arrival delay for flights from NYC in 2013.  What date was it and what do you think contributed to it?
 
 
 ** ** What are three specific questions that can be more easily answered by looking at Figure 4.6 instead of Figure 4.5?
@@ -434,10 +460,11 @@ <h3><span class="header-section-number">C.2.1</span> Interactive line-graphs</h3
 ***
 
 - Changing the labels of a plot (x-axis, y-axis)
+- stat = "identity" for aggregated data and barplots
 - Changing the theme for ggplots (`ggthemes` package too)
 - Adding `code_folding` and `code_download` to YAML
 - `kable` function from `knitr`
-- Reading in data from files in different formats
+- Reading in data from files in different formats - Getting Used to R book reference
 - Reshaping the data with `tidyr`
 
 -->
diff --git a/docs/images/coggle.png b/docs/images/coggle.png
new file mode 100644
index 000000000..668944334
Binary files /dev/null and b/docs/images/coggle.png differ
diff --git a/docs/images/coggleviz.png b/docs/images/coggleviz.png
new file mode 100644
index 000000000..5687bf5f8
Binary files /dev/null and b/docs/images/coggleviz.png differ
diff --git a/docs/index.html b/docs/index.html
index a00ff6721..f55476760 100644
--- a/docs/index.html
+++ b/docs/index.html
@@ -26,7 +26,7 @@
 <meta name="author" content="Chester Ismay and Albert Y. Kim">
 
 
-<meta name="date" content="2017-01-07">
+<meta name="date" content="2017-01-10">
 
   <meta name="viewport" content="width=device-width, initial-scale=1">
   <meta name="apple-mobile-web-app-capable" content="yes">
@@ -119,9 +119,9 @@
 
 <ul class="summary">
 <li class="chapter" data-level="1" data-path="index.html"><a href="index.html"><i class="fa fa-check"></i><b>1</b> Preamble</a><ul>
-<li class="chapter" data-level="1.1" data-path="index.html"><a href="index.html#principles-of-this-book"><i class="fa fa-check"></i><b>1.1</b> Principles of this Book</a></li>
+<li class="chapter" data-level="1.1" data-path="index.html"><a href="index.html#principles-of-this-book---for-instructors"><i class="fa fa-check"></i><b>1.1</b> Principles of this Book - For Instructors</a></li>
 <li class="chapter" data-level="1.2" data-path="index.html"><a href="index.html#contribute"><i class="fa fa-check"></i><b>1.2</b> Contribute</a></li>
-<li class="chapter" data-level="1.3" data-path="index.html"><a href="index.html#getting-started"><i class="fa fa-check"></i><b>1.3</b> Getting Started</a></li>
+<li class="chapter" data-level="1.3" data-path="index.html"><a href="index.html#getting-started---for-students"><i class="fa fa-check"></i><b>1.3</b> Getting Started - For Students</a></li>
 <li class="chapter" data-level="" data-path="index.html"><a href="index.html#colophon"><i class="fa fa-check"></i>Colophon</a></li>
 </ul></li>
 <li class="chapter" data-level="2" data-path="2-intro.html"><a href="2-intro.html"><i class="fa fa-check"></i><b>2</b> Introduction</a><ul>
@@ -133,87 +133,100 @@
 </ul></li>
 <li class="part"><span><b>I Data Exploration</b></span></li>
 <li class="chapter" data-level="3" data-path="3-tidy.html"><a href="3-tidy.html"><i class="fa fa-check"></i><b>3</b> Tidy Data</a><ul>
+<li class="chapter" data-level="" data-path="3-tidy.html"><a href="3-tidy.html#needed-packages"><i class="fa fa-check"></i>Needed packages</a></li>
 <li class="chapter" data-level="3.1" data-path="3-tidy.html"><a href="3-tidy.html#what-is-tidy-data"><i class="fa fa-check"></i><b>3.1</b> What is tidy data?</a></li>
-<li class="chapter" data-level="3.2" data-path="3-tidy.html"><a href="3-tidy.html#the-nycflights13-datasets"><i class="fa fa-check"></i><b>3.2</b> The <code>nycflights13</code> datasets</a></li>
-<li class="chapter" data-level="3.3" data-path="3-tidy.html"><a href="3-tidy.html#how-is-flights-tidy"><i class="fa fa-check"></i><b>3.3</b> How is <code>flights</code> tidy?</a></li>
+<li class="chapter" data-level="3.2" data-path="3-tidy.html"><a href="3-tidy.html#datasets-in-the-nycflights13-package"><i class="fa fa-check"></i><b>3.2</b> Datasets in the <code>nycflights13</code> package</a></li>
+<li class="chapter" data-level="3.3" data-path="3-tidy.html"><a href="3-tidy.html#how-is-flights-tidy"><i class="fa fa-check"></i><b>3.3</b> How is <code>flights</code> tidy?</a><ul>
+<li class="chapter" data-level="3.3.1" data-path="3-tidy.html"><a href="3-tidy.html#identification-variables"><i class="fa fa-check"></i><b>3.3.1</b> Identification variables</a></li>
+</ul></li>
 <li class="chapter" data-level="3.4" data-path="3-tidy.html"><a href="3-tidy.html#normal-forms-of-data"><i class="fa fa-check"></i><b>3.4</b> Normal forms of data</a></li>
 <li class="chapter" data-level="3.5" data-path="3-tidy.html"><a href="3-tidy.html#whats-to-come"><i class="fa fa-check"></i><b>3.5</b> What’s to come?</a></li>
 </ul></li>
-<li class="chapter" data-level="4" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html"><i class="fa fa-check"></i><b>4</b> Data Visualization via <code id="viz">ggplot2</code></a><ul>
-<li class="chapter" data-level="" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#needed-packages"><i class="fa fa-check"></i>Needed packages</a></li>
-<li class="chapter" data-level="4.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#grammarofgraphics"><i class="fa fa-check"></i><b>4.1</b> The Grammar of Graphics</a><ul>
-<li class="chapter" data-level="4.1.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#components-of-grammar"><i class="fa fa-check"></i><b>4.1.1</b> Components of Grammar</a></li>
-<li class="chapter" data-level="4.1.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#napoleans-march-on-moscow"><i class="fa fa-check"></i><b>4.1.2</b> Napolean’s March on Moscow</a></li>
-<li class="chapter" data-level="4.1.3" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#other-components-of-the-grammar"><i class="fa fa-check"></i><b>4.1.3</b> Other Components of the Grammar</a></li>
-<li class="chapter" data-level="4.1.4" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#the-ggplot2-package"><i class="fa fa-check"></i><b>4.1.4</b> The <code>ggplot2</code> Package</a></li>
-</ul></li>
-<li class="chapter" data-level="4.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#five-named-graphs---the-5ng"><i class="fa fa-check"></i><b>4.2</b> Five Named Graphs - The 5NG</a></li>
-<li class="chapter" data-level="4.3" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#scatterplots"><i class="fa fa-check"></i><b>4.3</b> 5NG#1: Scatter-plots</a><ul>
-<li class="chapter" data-level="4.3.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#scatter-plots-via-geom_point"><i class="fa fa-check"></i><b>4.3.1</b> Scatter-plots via <code id="geompoint">geom_point</code></a></li>
-<li class="chapter" data-level="4.3.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#over-plotting"><i class="fa fa-check"></i><b>4.3.2</b> Over-Plotting</a></li>
-<li class="chapter" data-level="4.3.3" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#summary"><i class="fa fa-check"></i><b>4.3.3</b> Summary</a></li>
-</ul></li>
-<li class="chapter" data-level="4.4" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#linegraphs"><i class="fa fa-check"></i><b>4.4</b> 5NG#2: Line-graphs</a><ul>
-<li class="chapter" data-level="4.4.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#line-graphs-via-geom_line"><i class="fa fa-check"></i><b>4.4.1</b> Line-graphs via <code id="geomline">geom_line</code></a></li>
-<li class="chapter" data-level="4.4.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#summary-1"><i class="fa fa-check"></i><b>4.4.2</b> Summary</a></li>
-</ul></li>
-<li class="chapter" data-level="4.5" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#histograms"><i class="fa fa-check"></i><b>4.5</b> 5NG#3: Histograms</a><ul>
-<li class="chapter" data-level="4.5.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#histograms-via-geom_histogram"><i class="fa fa-check"></i><b>4.5.1</b> Histograms via <code id="geomhistogram">geom_histogram</code></a></li>
-<li class="chapter" data-level="4.5.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#adjustbins"><i class="fa fa-check"></i><b>4.5.2</b> Adjusting the Bins</a></li>
-<li class="chapter" data-level="4.5.3" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#summary-2"><i class="fa fa-check"></i><b>4.5.3</b> Summary</a></li>
-</ul></li>
-<li class="chapter" data-level="4.6" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#facets"><i class="fa fa-check"></i><b>4.6</b> Facets</a></li>
-<li class="chapter" data-level="4.7" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#ng4-boxplots"><i class="fa fa-check"></i><b>4.7</b> 5NG#4: Boxplots</a><ul>
-<li class="chapter" data-level="4.7.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#boxplots-via-geom_boxplot"><i class="fa fa-check"></i><b>4.7.1</b> Boxplots via <code id="geomboxplot">geom_boxplot</code></a></li>
-<li class="chapter" data-level="4.7.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#summary-3"><i class="fa fa-check"></i><b>4.7.2</b> Summary</a></li>
-</ul></li>
-<li class="chapter" data-level="4.8" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#geombar"><i class="fa fa-check"></i><b>4.8</b> 5NG#5: Barplots</a><ul>
-<li class="chapter" data-level="4.8.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#barplots-via-geom_bar"><i class="fa fa-check"></i><b>4.8.1</b> Barplots via <code>geom_bar</code></a></li>
-<li class="chapter" data-level="4.8.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#must-avoid-pie-charts"><i class="fa fa-check"></i><b>4.8.2</b> Must avoid pie charts!</a></li>
-<li class="chapter" data-level="4.8.3" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#using-barplots-to-compare-two-variables"><i class="fa fa-check"></i><b>4.8.3</b> Using barplots to compare two variables</a></li>
-<li class="chapter" data-level="4.8.4" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#summary-4"><i class="fa fa-check"></i><b>4.8.4</b> Summary</a></li>
-</ul></li>
-<li class="chapter" data-level="4.9" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#conclusion"><i class="fa fa-check"></i><b>4.9</b> Conclusion</a><ul>
-<li class="chapter" data-level="4.9.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#whats-to-come-1"><i class="fa fa-check"></i><b>4.9.1</b> What’s to come?</a></li>
-<li class="chapter" data-level="4.9.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#script-of-r-code"><i class="fa fa-check"></i><b>4.9.2</b> Script of R code</a></li>
-</ul></li>
-</ul></li>
-<li class="chapter" data-level="5" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html"><i class="fa fa-check"></i><b>5</b> Data Manipulation via <code id="manip">dplyr</code></a><ul>
-<li class="chapter" data-level="" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#needed-packages-1"><i class="fa fa-check"></i>Needed packages</a></li>
-<li class="chapter" data-level="5.1" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#the-pipe"><i class="fa fa-check"></i><b>5.1</b> The pipe <code>%&gt;%</code></a></li>
-<li class="chapter" data-level="5.2" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#four-main-verbs---the-4mv"><i class="fa fa-check"></i><b>5.2</b> Four Main Verbs - The 4MV</a><ul>
-<li class="chapter" data-level="5.2.1" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#filter-observations-using-filter"><i class="fa fa-check"></i><b>5.2.1</b> Filter observations using <code id="filter">filter</code></a></li>
-<li class="chapter" data-level="5.2.2" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#summarize-variables-using-summarize"><i class="fa fa-check"></i><b>5.2.2</b> Summarize variables using <code>summarize</code></a></li>
-<li class="chapter" data-level="5.2.3" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#create-new-variableschange-old-variables-using-mutate"><i class="fa fa-check"></i><b>5.2.3</b> Create new variables/change old variables using <code>mutate</code></a></li>
-<li class="chapter" data-level="5.2.4" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#reorder-the-data-frame-using-arrange"><i class="fa fa-check"></i><b>5.2.4</b> Reorder the data frame using <code id="arrange">arrange</code></a></li>
-</ul></li>
-<li class="chapter" data-level="5.3" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#other-verbs"><i class="fa fa-check"></i><b>5.3</b> Other verbs</a><ul>
-<li class="chapter" data-level="5.3.1" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#select-variables-using-select"><i class="fa fa-check"></i><b>5.3.1</b> Select variables using <code id="select">select</code></a></li>
-<li class="chapter" data-level="5.3.2" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#rename-variables-using-rename"><i class="fa fa-check"></i><b>5.3.2</b> Rename variables using <code id="rename">rename</code></a></li>
-<li class="chapter" data-level="5.3.3" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#find-the-top-number-of-values-using-top_n"><i class="fa fa-check"></i><b>5.3.3</b> Find the top number of values using <code>top_n</code></a></li>
-</ul></li>
-<li class="chapter" data-level="5.4" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#joiningmerging-data-frames"><i class="fa fa-check"></i><b>5.4</b> Joining/merging data frames</a></li>
-<li class="chapter" data-level="5.5" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#script-of-r-code-1"><i class="fa fa-check"></i><b>5.5</b> Script of R code</a></li>
-<li class="chapter" data-level="5.6" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#whats-to-come-2"><i class="fa fa-check"></i><b>5.6</b> What’s to come?</a></li>
+<li class="chapter" data-level="4" data-path="4-viz.html"><a href="4-viz.html"><i class="fa fa-check"></i><b>4</b> Data Visualization via ggplot2</a><ul>
+<li class="chapter" data-level="" data-path="4-viz.html"><a href="4-viz.html#needed-packages-1"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="4.1" data-path="4-viz.html"><a href="4-viz.html#grammarofgraphics"><i class="fa fa-check"></i><b>4.1</b> The Grammar of Graphics</a><ul>
+<li class="chapter" data-level="4.1.1" data-path="4-viz.html"><a href="4-viz.html#components-of-grammar"><i class="fa fa-check"></i><b>4.1.1</b> Components of Grammar</a></li>
+<li class="chapter" data-level="4.1.2" data-path="4-viz.html"><a href="4-viz.html#napoleans-march-on-moscow"><i class="fa fa-check"></i><b>4.1.2</b> Napoleon’s March on Moscow</a></li>
+<li class="chapter" data-level="4.1.3" data-path="4-viz.html"><a href="4-viz.html#other-components-of-the-grammar"><i class="fa fa-check"></i><b>4.1.3</b> Other Components of the Grammar</a></li>
+<li class="chapter" data-level="4.1.4" data-path="4-viz.html"><a href="4-viz.html#the-ggplot2-package"><i class="fa fa-check"></i><b>4.1.4</b> The ggplot2 Package</a></li>
+</ul></li>
+<li class="chapter" data-level="4.2" data-path="4-viz.html"><a href="4-viz.html#five-named-graphs---the-5ng"><i class="fa fa-check"></i><b>4.2</b> Five Named Graphs - The 5NG</a></li>
+<li class="chapter" data-level="4.3" data-path="4-viz.html"><a href="4-viz.html#scatterplots"><i class="fa fa-check"></i><b>4.3</b> 5NG#1: Scatter-plots</a><ul>
+<li class="chapter" data-level="4.3.1" data-path="4-viz.html"><a href="4-viz.html#geompoint"><i class="fa fa-check"></i><b>4.3.1</b> Scatter-plots via geom_point</a></li>
+<li class="chapter" data-level="4.3.2" data-path="4-viz.html"><a href="4-viz.html#over-plotting"><i class="fa fa-check"></i><b>4.3.2</b> Over-Plotting</a></li>
+<li class="chapter" data-level="4.3.3" data-path="4-viz.html"><a href="4-viz.html#summary"><i class="fa fa-check"></i><b>4.3.3</b> Summary</a></li>
+</ul></li>
+<li class="chapter" data-level="4.4" data-path="4-viz.html"><a href="4-viz.html#linegraphs"><i class="fa fa-check"></i><b>4.4</b> 5NG#2: Line-graphs</a><ul>
+<li class="chapter" data-level="4.4.1" data-path="4-viz.html"><a href="4-viz.html#geomline"><i class="fa fa-check"></i><b>4.4.1</b> Line-graphs via geom_line</a></li>
+<li class="chapter" data-level="4.4.2" data-path="4-viz.html"><a href="4-viz.html#summary-1"><i class="fa fa-check"></i><b>4.4.2</b> Summary</a></li>
+</ul></li>
+<li class="chapter" data-level="4.5" data-path="4-viz.html"><a href="4-viz.html#histograms"><i class="fa fa-check"></i><b>4.5</b> 5NG#3: Histograms</a><ul>
+<li class="chapter" data-level="4.5.1" data-path="4-viz.html"><a href="4-viz.html#geomhistogram"><i class="fa fa-check"></i><b>4.5.1</b> Histograms via geom_histogram</a></li>
+<li class="chapter" data-level="4.5.2" data-path="4-viz.html"><a href="4-viz.html#adjustbins"><i class="fa fa-check"></i><b>4.5.2</b> Adjusting the Bins</a></li>
+<li class="chapter" data-level="4.5.3" data-path="4-viz.html"><a href="4-viz.html#summary-2"><i class="fa fa-check"></i><b>4.5.3</b> Summary</a></li>
+</ul></li>
+<li class="chapter" data-level="4.6" data-path="4-viz.html"><a href="4-viz.html#facets"><i class="fa fa-check"></i><b>4.6</b> Facets</a></li>
+<li class="chapter" data-level="4.7" data-path="4-viz.html"><a href="4-viz.html#ng4-boxplots"><i class="fa fa-check"></i><b>4.7</b> 5NG#4: Boxplots</a><ul>
+<li class="chapter" data-level="4.7.1" data-path="4-viz.html"><a href="4-viz.html#geomboxplot"><i class="fa fa-check"></i><b>4.7.1</b> Boxplots via geom_boxplot</a></li>
+<li class="chapter" data-level="4.7.2" data-path="4-viz.html"><a href="4-viz.html#summary-3"><i class="fa fa-check"></i><b>4.7.2</b> Summary</a></li>
+</ul></li>
+<li class="chapter" data-level="4.8" data-path="4-viz.html"><a href="4-viz.html#geombar"><i class="fa fa-check"></i><b>4.8</b> 5NG#5: Barplots</a><ul>
+<li class="chapter" data-level="4.8.1" data-path="4-viz.html"><a href="4-viz.html#barplots-via-geom_bar"><i class="fa fa-check"></i><b>4.8.1</b> Barplots via geom_bar</a></li>
+<li class="chapter" data-level="4.8.2" data-path="4-viz.html"><a href="4-viz.html#must-avoid-pie-charts"><i class="fa fa-check"></i><b>4.8.2</b> Must avoid pie charts!</a></li>
+<li class="chapter" data-level="4.8.3" data-path="4-viz.html"><a href="4-viz.html#using-barplots-to-compare-two-variables"><i class="fa fa-check"></i><b>4.8.3</b> Using barplots to compare two variables</a></li>
+<li class="chapter" data-level="4.8.4" data-path="4-viz.html"><a href="4-viz.html#summary-4"><i class="fa fa-check"></i><b>4.8.4</b> Summary</a></li>
+</ul></li>
+<li class="chapter" data-level="4.9" data-path="4-viz.html"><a href="4-viz.html#conclusion"><i class="fa fa-check"></i><b>4.9</b> Conclusion</a><ul>
+<li class="chapter" data-level="4.9.1" data-path="4-viz.html"><a href="4-viz.html#resources"><i class="fa fa-check"></i><b>4.9.1</b> Resources</a></li>
+<li class="chapter" data-level="4.9.2" data-path="4-viz.html"><a href="4-viz.html#script-of-r-code"><i class="fa fa-check"></i><b>4.9.2</b> Script of R code</a></li>
+<li class="chapter" data-level="4.9.3" data-path="4-viz.html"><a href="4-viz.html#whats-to-come-1"><i class="fa fa-check"></i><b>4.9.3</b> What’s to come?</a></li>
+</ul></li>
+</ul></li>
+<li class="chapter" data-level="5" data-path="5-manip.html"><a href="5-manip.html"><i class="fa fa-check"></i><b>5</b> Data Manipulation via dplyr</a><ul>
+<li class="chapter" data-level="" data-path="5-manip.html"><a href="5-manip.html#needed-packages-2"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="5.1" data-path="5-manip.html"><a href="5-manip.html#the-pipe"><i class="fa fa-check"></i><b>5.1</b> The pipe <code>%&gt;%</code></a></li>
+<li class="chapter" data-level="5.2" data-path="5-manip.html"><a href="5-manip.html#five-main-verbs---the-5mv"><i class="fa fa-check"></i><b>5.2</b> Five Main Verbs - The 5MV</a><ul>
+<li class="chapter" data-level="5.2.1" data-path="5-manip.html"><a href="5-manip.html#filter"><i class="fa fa-check"></i><b>5.2.1</b> 5MV#1: Filter observations using filter</a></li>
+<li class="chapter" data-level="5.2.2" data-path="5-manip.html"><a href="5-manip.html#mv2-summarize-variables-using-summarize"><i class="fa fa-check"></i><b>5.2.2</b> 5MV#2: Summarize variables using summarize</a></li>
+<li class="chapter" data-level="5.2.3" data-path="5-manip.html"><a href="5-manip.html#mv3-group-rows-using-group_by"><i class="fa fa-check"></i><b>5.2.3</b> 5MV#3: Group rows using group_by</a></li>
+<li class="chapter" data-level="5.2.4" data-path="5-manip.html"><a href="5-manip.html#mv4-create-new-variableschange-old-variables-using-mutate"><i class="fa fa-check"></i><b>5.2.4</b> 5MV#4: Create new variables/change old variables using mutate</a></li>
+<li class="chapter" data-level="5.2.5" data-path="5-manip.html"><a href="5-manip.html#arrange"><i class="fa fa-check"></i><b>5.2.5</b> 5MV#5: Reorder the data frame using arrange</a></li>
+</ul></li>
+<li class="chapter" data-level="5.3" data-path="5-manip.html"><a href="5-manip.html#joining-data-frames"><i class="fa fa-check"></i><b>5.3</b> Joining data frames</a><ul>
+<li class="chapter" data-level="5.3.1" data-path="5-manip.html"><a href="5-manip.html#joining-by-key-variables"><i class="fa fa-check"></i><b>5.3.1</b> Joining by Key Variables</a></li>
+<li class="chapter" data-level="5.3.2" data-path="5-manip.html"><a href="5-manip.html#joining-by-key-variables-with-different-names"><i class="fa fa-check"></i><b>5.3.2</b> Joining by Key Variables with Different Names</a></li>
+</ul></li>
+<li class="chapter" data-level="5.4" data-path="5-manip.html"><a href="5-manip.html#optional-other-verbs"><i class="fa fa-check"></i><b>5.4</b> Optional: Other verbs</a><ul>
+<li class="chapter" data-level="5.4.1" data-path="5-manip.html"><a href="5-manip.html#select"><i class="fa fa-check"></i><b>5.4.1</b> Select variables using select</a></li>
+<li class="chapter" data-level="5.4.2" data-path="5-manip.html"><a href="5-manip.html#rename"><i class="fa fa-check"></i><b>5.4.2</b> Rename variables using rename</a></li>
+<li class="chapter" data-level="5.4.3" data-path="5-manip.html"><a href="5-manip.html#find-the-top-number-of-values-using-top_n"><i class="fa fa-check"></i><b>5.4.3</b> Find the top number of values using top_n</a></li>
+</ul></li>
+<li class="chapter" data-level="5.5" data-path="5-manip.html"><a href="5-manip.html#conclusion-1"><i class="fa fa-check"></i><b>5.5</b> Conclusion</a><ul>
+<li class="chapter" data-level="5.5.1" data-path="5-manip.html"><a href="5-manip.html#resources-1"><i class="fa fa-check"></i><b>5.5.1</b> Resources</a></li>
+<li class="chapter" data-level="5.5.2" data-path="5-manip.html"><a href="5-manip.html#script-of-r-code-1"><i class="fa fa-check"></i><b>5.5.2</b> Script of R code</a></li>
+<li class="chapter" data-level="5.5.3" data-path="5-manip.html"><a href="5-manip.html#whats-to-come-2"><i class="fa fa-check"></i><b>5.5.3</b> What’s to come?</a></li>
+</ul></li>
 </ul></li>
 <li class="part"><span><b>II Inference</b></span></li>
-<li class="chapter" data-level="6" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html"><i class="fa fa-check"></i><b>6</b> Simulating Randomness via <code id="sim">mosaic</code></a><ul>
-<li class="chapter" data-level="" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#needed-packages-2"><i class="fa fa-check"></i>Needed packages</a></li>
-<li class="chapter" data-level="6.1" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#random-sampling"><i class="fa fa-check"></i><b>6.1</b> Random sampling</a><ul>
-<li class="chapter" data-level="6.1.1" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#tasting-soup"><i class="fa fa-check"></i><b>6.1.1</b> Tasting soup</a></li>
-<li class="chapter" data-level="6.1.2" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#common-terms"><i class="fa fa-check"></i><b>6.1.2</b> Common terms</a></li>
+<li class="chapter" data-level="6" data-path="6-sim.html"><a href="6-sim.html"><i class="fa fa-check"></i><b>6</b> Simulating Randomness via mosaic</a><ul>
+<li class="chapter" data-level="" data-path="6-sim.html"><a href="6-sim.html#needed-packages-3"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="6.1" data-path="6-sim.html"><a href="6-sim.html#random-sampling"><i class="fa fa-check"></i><b>6.1</b> Random sampling</a><ul>
+<li class="chapter" data-level="6.1.1" data-path="6-sim.html"><a href="6-sim.html#tasting-soup"><i class="fa fa-check"></i><b>6.1.1</b> Tasting soup</a></li>
+<li class="chapter" data-level="6.1.2" data-path="6-sim.html"><a href="6-sim.html#common-terms"><i class="fa fa-check"></i><b>6.1.2</b> Common terms</a></li>
+</ul></li>
+<li class="chapter" data-level="6.2" data-path="6-sim.html"><a href="6-sim.html#visualizing-sampling"><i class="fa fa-check"></i><b>6.2</b> Visualizing sampling</a><ul>
+<li class="chapter" data-level="6.2.1" data-path="6-sim.html"><a href="6-sim.html#sampling-distribution"><i class="fa fa-check"></i><b>6.2.1</b> Sampling distribution</a></li>
+<li class="chapter" data-level="6.2.2" data-path="6-sim.html"><a href="6-sim.html#repeated-sampling-via-do"><i class="fa fa-check"></i><b>6.2.2</b> Repeated sampling via <code>do</code></a></li>
 </ul></li>
-<li class="chapter" data-level="6.2" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#visualizing-sampling"><i class="fa fa-check"></i><b>6.2</b> Visualizing sampling</a><ul>
-<li class="chapter" data-level="6.2.1" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#sampling-distribution"><i class="fa fa-check"></i><b>6.2.1</b> Sampling distribution</a></li>
-<li class="chapter" data-level="6.2.2" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#repeated-sampling-via-do"><i class="fa fa-check"></i><b>6.2.2</b> Repeated sampling via <code>do</code></a></li>
+<li class="chapter" data-level="6.3" data-path="6-sim.html"><a href="6-sim.html#simulation"><i class="fa fa-check"></i><b>6.3</b> Simulation</a></li>
+<li class="chapter" data-level="6.4" data-path="6-sim.html"><a href="6-sim.html#review-of-mosaic-simulation-functions"><i class="fa fa-check"></i><b>6.4</b> Review of <code>mosaic</code> simulation functions</a></li>
+<li class="chapter" data-level="6.5" data-path="6-sim.html"><a href="6-sim.html#conclusion-2"><i class="fa fa-check"></i><b>6.5</b> Conclusion</a><ul>
+<li class="chapter" data-level="6.5.1" data-path="6-sim.html"><a href="6-sim.html#script-of-r-code-2"><i class="fa fa-check"></i><b>6.5.1</b> Script of R code</a></li>
+<li class="chapter" data-level="6.5.2" data-path="6-sim.html"><a href="6-sim.html#whats-to-come-3"><i class="fa fa-check"></i><b>6.5.2</b> What’s to come?</a></li>
 </ul></li>
-<li class="chapter" data-level="6.3" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#simulation"><i class="fa fa-check"></i><b>6.3</b> Simulation</a></li>
-<li class="chapter" data-level="6.4" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#review-of-mosaic-simulation-functions"><i class="fa fa-check"></i><b>6.4</b> Review of <code>mosaic</code> simulation functions</a></li>
-<li class="chapter" data-level="6.5" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#script-of-r-code-2"><i class="fa fa-check"></i><b>6.5</b> Script of R code</a></li>
-<li class="chapter" data-level="6.6" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#whats-to-come-3"><i class="fa fa-check"></i><b>6.6</b> What’s to come?</a></li>
 </ul></li>
 <li class="chapter" data-level="7" data-path="7-hypo.html"><a href="7-hypo.html"><i class="fa fa-check"></i><b>7</b> Hypothesis Testing</a><ul>
-<li class="chapter" data-level="" data-path="7-hypo.html"><a href="7-hypo.html#needed-packages-3"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="" data-path="7-hypo.html"><a href="7-hypo.html#needed-packages-4"><i class="fa fa-check"></i>Needed packages</a></li>
 <li class="chapter" data-level="7.1" data-path="7-hypo.html"><a href="7-hypo.html#when-inference-is-not-needed"><i class="fa fa-check"></i><b>7.1</b> When Inference Is Not Needed</a></li>
 <li class="chapter" data-level="7.2" data-path="7-hypo.html"><a href="7-hypo.html#basics-of-hypothesis-testing"><i class="fa fa-check"></i><b>7.2</b> Basics of Hypothesis Testing</a></li>
 <li class="chapter" data-level="7.3" data-path="7-hypo.html"><a href="7-hypo.html#trial"><i class="fa fa-check"></i><b>7.3</b> Criminal trial analogy</a><ul>
@@ -249,44 +262,50 @@
 <li class="chapter" data-level="7.8.1" data-path="7-hypo.html"><a href="7-hypo.html#example-t-test-for-two-independent-samples"><i class="fa fa-check"></i><b>7.8.1</b> EXAMPLE: <span class="math inline">\(t\)</span>-test for two independent samples</a></li>
 <li class="chapter" data-level="7.8.2" data-path="7-hypo.html"><a href="7-hypo.html#conditions-for-t-test"><i class="fa fa-check"></i><b>7.8.2</b> Conditions for t-test</a></li>
 </ul></li>
-<li class="chapter" data-level="7.9" data-path="7-hypo.html"><a href="7-hypo.html#whats-to-come-4"><i class="fa fa-check"></i><b>7.9</b> What’s to come?</a></li>
+<li class="chapter" data-level="7.9" data-path="7-hypo.html"><a href="7-hypo.html#conclusion-3"><i class="fa fa-check"></i><b>7.9</b> Conclusion</a><ul>
+<li class="chapter" data-level="7.9.1" data-path="7-hypo.html"><a href="7-hypo.html#script-of-r-code-3"><i class="fa fa-check"></i><b>7.9.1</b> Script of R code</a></li>
+<li class="chapter" data-level="7.9.2" data-path="7-hypo.html"><a href="7-hypo.html#whats-to-come-4"><i class="fa fa-check"></i><b>7.9.2</b> What’s to come?</a></li>
+</ul></li>
 </ul></li>
 <li class="chapter" data-level="8" data-path="8-ci.html"><a href="8-ci.html"><i class="fa fa-check"></i><b>8</b> Confidence Intervals</a><ul>
-<li class="chapter" data-level="" data-path="8-ci.html"><a href="8-ci.html#needed-packages-4"><i class="fa fa-check"></i>Needed packages</a></li>
-<li class="chapter" data-level="8.1" data-path="8-ci.html"><a href="8-ci.html#bootstrapping"><i class="fa fa-check"></i><b>8.1</b> Bootstrapping</a></li>
-<li class="chapter" data-level="8.2" data-path="8-ci.html"><a href="8-ci.html#mean_rating"><i class="fa fa-check"></i><b>8.2</b> mean_rating</a><ul>
-<li class="chapter" data-level="8.2.1" data-path="8-ci.html"><a href="8-ci.html#review-of-bootstrapping"><i class="fa fa-check"></i><b>8.2.1</b> Review of Bootstrapping</a></li>
-</ul></li>
-<li class="chapter" data-level="8.3" data-path="8-ci.html"><a href="8-ci.html#relation-to-hypothesis-testing"><i class="fa fa-check"></i><b>8.3</b> Relation to hypothesis testing</a></li>
-<li class="chapter" data-level="8.4" data-path="8-ci.html"><a href="8-ci.html#effect-size"><i class="fa fa-check"></i><b>8.4</b> Effect size</a></li>
-<li class="chapter" data-level="8.5" data-path="8-ci.html"><a href="8-ci.html#script-of-r-code-3"><i class="fa fa-check"></i><b>8.5</b> Script of R code</a></li>
-<li class="chapter" data-level="8.6" data-path="8-ci.html"><a href="8-ci.html#whats-to-come-5"><i class="fa fa-check"></i><b>8.6</b> What’s to come?</a></li>
-</ul></li>
-<li class="chapter" data-level="9" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html"><i class="fa fa-check"></i><b>9</b> Regression via <code id="regress">broom</code></a><ul>
-<li class="chapter" data-level="" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#needed-packages-5"><i class="fa fa-check"></i>Needed packages</a></li>
-<li class="chapter" data-level="9.1" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#example-alaskan-airlines-delays"><i class="fa fa-check"></i><b>9.1</b> EXAMPLE: Alaskan Airlines delays</a></li>
-<li class="chapter" data-level="9.2" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#correlation"><i class="fa fa-check"></i><b>9.2</b> Correlation</a><ul>
-<li class="chapter" data-level="9.2.1" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#correlation-does-not-imply-causation"><i class="fa fa-check"></i><b>9.2.1</b> Correlation does not imply causation</a></li>
-</ul></li>
-<li class="chapter" data-level="9.3" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#linear-regression"><i class="fa fa-check"></i><b>9.3</b> Linear regression</a><ul>
-<li class="chapter" data-level="9.3.1" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#understanding-linear-regression-basics"><i class="fa fa-check"></i><b>9.3.1</b> Understanding linear regression basics</a></li>
-<li class="chapter" data-level="9.3.2" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#the-equation-of-the-line"><i class="fa fa-check"></i><b>9.3.2</b> The equation of the line</a></li>
-<li class="chapter" data-level="9.3.3" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#interpretting-the-slope"><i class="fa fa-check"></i><b>9.3.3</b> Interpretting the slope</a></li>
-<li class="chapter" data-level="9.3.4" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#predicting-values"><i class="fa fa-check"></i><b>9.3.4</b> Predicting values</a></li>
-</ul></li>
-<li class="chapter" data-level="9.4" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#inference-for-regression"><i class="fa fa-check"></i><b>9.4</b> Inference for regression</a><ul>
-<li class="chapter" data-level="9.4.1" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#data-2"><i class="fa fa-check"></i><b>9.4.1</b> Data</a></li>
-<li class="chapter" data-level="9.4.2" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#test-statistic-delta-2"><i class="fa fa-check"></i><b>9.4.2</b> Test Statistic <span class="math inline">\(\delta\)</span></a></li>
-<li class="chapter" data-level="9.4.3" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#observed-effect-delta-2"><i class="fa fa-check"></i><b>9.4.3</b> Observed effect <span class="math inline">\(\delta^*\)</span></a></li>
-<li class="chapter" data-level="9.4.4" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#model-of-h_0-2"><i class="fa fa-check"></i><b>9.4.4</b> Model of <span class="math inline">\(H_0\)</span></a></li>
-<li class="chapter" data-level="9.4.5" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#simulated-data-2"><i class="fa fa-check"></i><b>9.4.5</b> Simulated Data</a></li>
-<li class="chapter" data-level="9.4.6" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#distribution-of-delta-under-h_0-2"><i class="fa fa-check"></i><b>9.4.6</b> Distribution of <span class="math inline">\(\delta\)</span> under <span class="math inline">\(H_0\)</span></a></li>
-<li class="chapter" data-level="9.4.7" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#the-p-value-2"><i class="fa fa-check"></i><b>9.4.7</b> The p-value</a></li>
-</ul></li>
-<li class="chapter" data-level="9.5" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#resid"><i class="fa fa-check"></i><b>9.5</b> Residual analysis</a></li>
-<li class="chapter" data-level="9.6" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#conditions-for-regression"><i class="fa fa-check"></i><b>9.6</b> Conditions for regression</a></li>
-<li class="chapter" data-level="9.7" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#script-of-r-code-4"><i class="fa fa-check"></i><b>9.7</b> Script of R code</a></li>
-<li class="chapter" data-level="9.8" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#whats-to-come-6"><i class="fa fa-check"></i><b>9.8</b> What’s to come?</a></li>
+<li class="chapter" data-level="" data-path="8-ci.html"><a href="8-ci.html#needed-packages-5"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="8.1" data-path="8-ci.html"><a href="8-ci.html#bootstrapping"><i class="fa fa-check"></i><b>8.1</b> Bootstrapping</a><ul>
+<li class="chapter" data-level="8.1.1" data-path="8-ci.html"><a href="8-ci.html#review-of-bootstrapping"><i class="fa fa-check"></i><b>8.1.1</b> Review of Bootstrapping</a></li>
+</ul></li>
+<li class="chapter" data-level="8.2" data-path="8-ci.html"><a href="8-ci.html#relation-to-hypothesis-testing"><i class="fa fa-check"></i><b>8.2</b> Relation to hypothesis testing</a></li>
+<li class="chapter" data-level="8.3" data-path="8-ci.html"><a href="8-ci.html#effect-size"><i class="fa fa-check"></i><b>8.3</b> Effect size</a></li>
+<li class="chapter" data-level="8.4" data-path="8-ci.html"><a href="8-ci.html#conclusion-4"><i class="fa fa-check"></i><b>8.4</b> Conclusion</a><ul>
+<li class="chapter" data-level="8.4.1" data-path="8-ci.html"><a href="8-ci.html#script-of-r-code-4"><i class="fa fa-check"></i><b>8.4.1</b> Script of R code</a></li>
+<li class="chapter" data-level="8.4.2" data-path="8-ci.html"><a href="8-ci.html#whats-to-come-5"><i class="fa fa-check"></i><b>8.4.2</b> What’s to come?</a></li>
+</ul></li>
+</ul></li>
+<li class="chapter" data-level="9" data-path="9-regress.html"><a href="9-regress.html"><i class="fa fa-check"></i><b>9</b> Regression via broom</a><ul>
+<li class="chapter" data-level="" data-path="9-regress.html"><a href="9-regress.html#needed-packages-6"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="9.1" data-path="9-regress.html"><a href="9-regress.html#example-alaskan-airlines-delays"><i class="fa fa-check"></i><b>9.1</b> EXAMPLE: Alaskan Airlines delays</a></li>
+<li class="chapter" data-level="9.2" data-path="9-regress.html"><a href="9-regress.html#correlation"><i class="fa fa-check"></i><b>9.2</b> Correlation</a><ul>
+<li class="chapter" data-level="9.2.1" data-path="9-regress.html"><a href="9-regress.html#correlation-does-not-imply-causation"><i class="fa fa-check"></i><b>9.2.1</b> Correlation does not imply causation</a></li>
+</ul></li>
+<li class="chapter" data-level="9.3" data-path="9-regress.html"><a href="9-regress.html#linear-regression"><i class="fa fa-check"></i><b>9.3</b> Linear regression</a><ul>
+<li class="chapter" data-level="9.3.1" data-path="9-regress.html"><a href="9-regress.html#understanding-linear-regression-basics"><i class="fa fa-check"></i><b>9.3.1</b> Understanding linear regression basics</a></li>
+<li class="chapter" data-level="9.3.2" data-path="9-regress.html"><a href="9-regress.html#the-equation-of-the-line"><i class="fa fa-check"></i><b>9.3.2</b> The equation of the line</a></li>
+<li class="chapter" data-level="9.3.3" data-path="9-regress.html"><a href="9-regress.html#interpreting-the-slope"><i class="fa fa-check"></i><b>9.3.3</b> Interpreting the slope</a></li>
+<li class="chapter" data-level="9.3.4" data-path="9-regress.html"><a href="9-regress.html#predicting-values"><i class="fa fa-check"></i><b>9.3.4</b> Predicting values</a></li>
+</ul></li>
+<li class="chapter" data-level="9.4" data-path="9-regress.html"><a href="9-regress.html#inference-for-regression"><i class="fa fa-check"></i><b>9.4</b> Inference for regression</a><ul>
+<li class="chapter" data-level="9.4.1" data-path="9-regress.html"><a href="9-regress.html#data-2"><i class="fa fa-check"></i><b>9.4.1</b> Data</a></li>
+<li class="chapter" data-level="9.4.2" data-path="9-regress.html"><a href="9-regress.html#test-statistic-delta-2"><i class="fa fa-check"></i><b>9.4.2</b> Test Statistic <span class="math inline">\(\delta\)</span></a></li>
+<li class="chapter" data-level="9.4.3" data-path="9-regress.html"><a href="9-regress.html#observed-effect-delta-2"><i class="fa fa-check"></i><b>9.4.3</b> Observed effect <span class="math inline">\(\delta^*\)</span></a></li>
+<li class="chapter" data-level="9.4.4" data-path="9-regress.html"><a href="9-regress.html#model-of-h_0-2"><i class="fa fa-check"></i><b>9.4.4</b> Model of <span class="math inline">\(H_0\)</span></a></li>
+<li class="chapter" data-level="9.4.5" data-path="9-regress.html"><a href="9-regress.html#simulated-data-2"><i class="fa fa-check"></i><b>9.4.5</b> Simulated Data</a></li>
+<li class="chapter" data-level="9.4.6" data-path="9-regress.html"><a href="9-regress.html#distribution-of-delta-under-h_0-2"><i class="fa fa-check"></i><b>9.4.6</b> Distribution of <span class="math inline">\(\delta\)</span> under <span class="math inline">\(H_0\)</span></a></li>
+<li class="chapter" data-level="9.4.7" data-path="9-regress.html"><a href="9-regress.html#the-p-value-2"><i class="fa fa-check"></i><b>9.4.7</b> The p-value</a></li>
+</ul></li>
+<li class="chapter" data-level="9.5" data-path="9-regress.html"><a href="9-regress.html#resid"><i class="fa fa-check"></i><b>9.5</b> Residual analysis</a></li>
+<li class="chapter" data-level="9.6" data-path="9-regress.html"><a href="9-regress.html#conditions-for-regression"><i class="fa fa-check"></i><b>9.6</b> Conditions for regression</a></li>
+<li class="chapter" data-level="9.7" data-path="9-regress.html"><a href="9-regress.html#conclusion-5"><i class="fa fa-check"></i><b>9.7</b> Conclusion</a><ul>
+<li class="chapter" data-level="9.7.1" data-path="9-regress.html"><a href="9-regress.html#script-of-r-code-5"><i class="fa fa-check"></i><b>9.7.1</b> Script of R code</a></li>
+<li class="chapter" data-level="9.7.2" data-path="9-regress.html"><a href="9-regress.html#whats-to-come-6"><i class="fa fa-check"></i><b>9.7.2</b> What’s to come?</a></li>
+</ul></li>
 </ul></li>
 <li class="part"><span><b>III Conclusion</b></span></li>
 <li class="chapter" data-level="10" data-path="10-effective-data-storytelling.html"><a href="10-effective-data-storytelling.html"><i class="fa fa-check"></i><b>10</b> Effective Data Storytelling</a><ul>
@@ -304,7 +323,8 @@
 </ul></li>
 </ul></li>
 <li class="chapter" data-level="B" data-path="B-appendixB.html"><a href="B-appendixB.html"><i class="fa fa-check"></i><b>B</b> Inference Examples</a><ul>
-<li class="chapter" data-level="B.1" data-path="B-appendixB.html"><a href="B-appendixB.html#needed-packages-6"><i class="fa fa-check"></i><b>B.1</b> Needed packages</a></li>
+<li class="chapter" data-level="" data-path="B-appendixB.html"><a href="B-appendixB.html#needed-packages-7"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="B.1" data-path="B-appendixB.html"><a href="B-appendixB.html#inference-mind-map"><i class="fa fa-check"></i><b>B.1</b> Inference Mind Map</a></li>
 <li class="chapter" data-level="B.2" data-path="B-appendixB.html"><a href="B-appendixB.html#one-mean"><i class="fa fa-check"></i><b>B.2</b> One Mean</a><ul>
 <li class="chapter" data-level="B.2.1" data-path="B-appendixB.html"><a href="B-appendixB.html#problem-statement"><i class="fa fa-check"></i><b>B.2.1</b> Problem Statement</a></li>
 <li class="chapter" data-level="B.2.2" data-path="B-appendixB.html"><a href="B-appendixB.html#competing-hypotheses"><i class="fa fa-check"></i><b>B.2.2</b> Competing Hypotheses</a></li>
@@ -352,6 +372,7 @@
 </ul></li>
 </ul></li>
 <li class="chapter" data-level="C" data-path="C-appendixC.html"><a href="C-appendixC.html"><i class="fa fa-check"></i><b>C</b> Reach for the Starts</a><ul>
+<li class="chapter" data-level="" data-path="C-appendixC.html"><a href="C-appendixC.html#needed-packages-8"><i class="fa fa-check"></i>Needed packages</a></li>
 <li class="chapter" data-level="C.1" data-path="C-appendixC.html"><a href="C-appendixC.html#sorted-barplots"><i class="fa fa-check"></i><b>C.1</b> Sorted barplots</a></li>
 <li class="chapter" data-level="C.2" data-path="C-appendixC.html"><a href="C-appendixC.html#interactive-graphics"><i class="fa fa-check"></i><b>C.2</b> Interactive graphics</a><ul>
 <li class="chapter" data-level="C.2.1" data-path="C-appendixC.html"><a href="C-appendixC.html#interactive-line-graphs"><i class="fa fa-check"></i><b>C.2.1</b> Interactive line-graphs</a></li>
@@ -379,12 +400,12 @@ <h1>
 <h1 class="title">ModernDive</h1>
 <h3 class="subtitle"><em>An Introduction to Statistical and Data Sciences via R</em></h3>
 <h4 class="author"><em>Chester Ismay and Albert Y. Kim</em></h4>
-<h4 class="date"><em>2017-01-07</em></h4>
+<h4 class="date"><em>2017-01-10</em></h4>
 </div>
 <div id="preamble" class="section level1">
 <h1><span class="header-section-number">1</span> Preamble</h1>
-<div id="principles-of-this-book" class="section level2">
-<h2><span class="header-section-number">1.1</span> Principles of this Book</h2>
+<div id="principles-of-this-book---for-instructors" class="section level2">
+<h2><span class="header-section-number">1.1</span> Principles of this Book - For Instructors</h2>
 <p>These are some principles we keep in mind. If you agree with them, this might be the book for you.</p>
 <ol style="list-style-type: decimal">
 <li><strong>Blur the lines between lecture and lab</strong>
@@ -399,7 +420,7 @@ <h2><span class="header-section-number">1.1</span> Principles of this Book</h2>
 </ul></li>
 <li><strong>It’s all about data, data, data</strong>
 <ul>
-<li>We leverage R packages for rich/complex yet easy-to-load data sets.</li>
+<li>We leverage R packages for rich/complex, yet easy-to-load data sets.</li>
 <li>We’ve heard it before: “You can’t teach <code>ggplot2</code> for data visualization in intro stats!” We, like <a href="http://varianceexplained.org/r/teach_ggplot2_to_beginners/">David Robinson</a>, are more optimistic and we’ve had success doing so.</li>
 <li><code>dplyr</code> is a <a href="http://chance.amstat.org/2015/04/setting-the-stage/">game changer</a> for data manipulation: the verb describing your desired data action <em>is</em> the command name!</li>
 </ul></li>
@@ -410,7 +431,7 @@ <h2><span class="header-section-number">1.1</span> Principles of this Book</h2>
 </ul></li>
 <li><strong>Don’t fence off students from the computation pool, throw them in!</strong>
 <ul>
-<li>Don’t teach them coding/programming per se, but computation and algorithmic thinking.</li>
+<li>Don’t teach them coding/programming per se, but computational and algorithmic thinking.</li>
 <li>Drawing Venn diagrams delineating statistics, computer science, and data science is also ever more archaic; embrace computation!</li>
 </ul></li>
 <li><strong>Complete reproducibility</strong>
@@ -419,7 +440,7 @@ <h2><span class="header-section-number">1.1</span> Principles of this Book</h2>
 <li>We encourage use of R Markdown to foster notions of reproducible research.</li>
 <li><strong>Ultimately the best textbook is one you’ve written yourself</strong>
 <ul>
-<li>You best know your audience, their background, and their priorities and you know best your own style and types of examples and problems you like best. Customizability is the ultimate end.</li>
+<li>You best know your audience, their background, and their priorities and you know best your own style and the types of examples and problems you like best. Customizability is the ultimate end.</li>
 <li>A new paradigm for textbooks? Versions, not editions? Pull requests, crowd-sourcing, and development versions?</li>
 </ul></li>
 </ul></li>
@@ -428,16 +449,16 @@ <h2><span class="header-section-number">1.1</span> Principles of this Book</h2>
 <div id="contribute" class="section level2">
 <h2><span class="header-section-number">1.2</span> Contribute</h2>
 <ul>
-<li>This book is in beta testing and is currently at Version 0.1.0. If you would like to receive periodic updates on this book and other similar projects, please fill out this <a href="https://goo.gl/forms/IxiwBeEnk72NxMMx2">Google Form</a>.</li>
-<li>The source code for this book is available for download/forking on <a href="https://github.com/ismayc/moderndiver-book">GitHub</a>. If you find typos or other errors or have suggestions on how to better word something in the book, please create a pull request too!</li>
+<li>This book is in beta testing and is currently at Version 0.1.1. If you would like to receive periodic updates on this book and other similar projects, please fill out this <a href="https://goo.gl/forms/IxiwBeEnk72NxMMx2">Google Form</a>.</li>
+<li>The source code for this book is available for download/forking on <a href="https://github.com/ismayc/moderndiver-book">GitHub</a>. If you click on the <strong>release</strong> link near the top of the page there, you can download all of the source code for whichever release version you’d like to work with and use. If you find typos or other errors or have suggestions on how to better word something in the book, please create a pull request too! We also welcome issue creation. Let’s all work together to make this book as great as possible for as many students and instructors as possible.</li>
 <li>Please feel free to modify the book as you wish for your own needs! All we ask is that you list the authors field above as “Chester Ismay, Albert Y. Kim, and YOU!”</li>
-<li>We’d also appreciate if you let us now what changes you’ve made and how you’ve used the textbook. We’d love some data on what’s working well and what’s not working so well.</li>
+<li>We’d also appreciate if you let us know what changes you’ve made and how you’ve used the textbook. We’d love some data on what’s working well and what’s not working so well.</li>
 </ul>
 </div>
-<div id="getting-started" class="section level2">
-<h2><span class="header-section-number">1.3</span> Getting Started</h2>
-<p>This book was written using the <strong>bookdown</strong> R package from Yihui Xie. In order to follow along and run the code in this book on your own, you’ll need to have access to R and RStudio. You can find more information on both of these with a simple Google search for “R” and for “RStudio.” An introduction to using R, RStudio, and R Markdown is also available in a free book <a href="http://ismayc.github.io/rbasics-book">here</a> <span class="citation">(Ismay <a href="#ref-usedtor2016">2016</a>)</span>. It is recommended that you refer back to this book frequently as it has GIF screen recordings that you can follow along with as you learn.</p>
-<p>We will keep a running list of R packages you will need to have installed to complete the analysis as well here in the <code>needed_pkgs</code> character vector. You can check if you have all of the needed packages installed by running all of the lines below. The last lines including the <code>if</code> will install them as needed (i.e., download their needed files from the internet to your hard drive).</p>
+<div id="getting-started---for-students" class="section level2">
+<h2><span class="header-section-number">1.3</span> Getting Started - For Students</h2>
+<p>This book was written using the <strong>bookdown</strong> R package from Yihui Xie <span class="citation">(Xie <a href="#ref-R-bookdown">2016</a>)</span>. In order to follow along and run the code in this book on your own, you’ll need to have access to R and RStudio. You can find more information on both of these with a simple Google search for “R” and for “RStudio.” An introduction to using R, RStudio, and R Markdown is also available in a free book <a href="http://ismayc.github.io/rbasics-book">here</a> <span class="citation">(Ismay <a href="#ref-usedtor2016">2016</a>)</span>. It is recommended that you refer back to this book frequently as it has GIF screen recordings that you can follow along with as you learn.</p>
+<p>We will keep a running list of R packages you will need to have installed to complete the analysis as well here in the <code>needed_pkgs</code> character vector. You can check if you have all of the needed packages installed by running all of the lines below in the next chunk of R code. The last lines including the <code>if</code> will install them as needed (i.e., download their needed files from the internet to your hard drive and install them for your use).</p>
 <p>You can run the <code>library</code> function on them to load them into your current analysis. Prior to each analysis where a package is needed, you will see the corresponding <code>library</code> function in the text. Make sure to check the top of the chapter to see if a package was loaded there.</p>
 <div class="sourceCode"><pre class="sourceCode r"><code class="sourceCode r">needed_pkgs &lt;-<span class="st"> </span><span class="kw">c</span>(<span class="st">&quot;nycflights13&quot;</span>, <span class="st">&quot;dplyr&quot;</span>, <span class="st">&quot;ggplot2&quot;</span>, <span class="st">&quot;knitr&quot;</span>, 
   <span class="st">&quot;okcupiddata&quot;</span>, <span class="st">&quot;dygraphs&quot;</span>, <span class="st">&quot;rmarkdown&quot;</span>, <span class="st">&quot;mosaic&quot;</span>, <span class="st">&quot;ggplot2movies&quot;</span>)
@@ -850,12 +871,15 @@ <h2>Colophon</h2>
 </tbody>
 </table>
 <p><strong>Book was last updated:</strong></p>
-<pre><code>## [1] &quot;By Chester on Saturday, January 07, 2017 11:29:21 EST&quot;</code></pre>
+<pre><code>## [1] &quot;By Chester on Tuesday, January 10, 2017 21:08:44 EST&quot;</code></pre>
 
 </div>
 </div>
 <h3>References</h3>
 <div id="refs" class="references">
+<div id="ref-R-bookdown">
+<p>Xie, Yihui. 2016. <em>Bookdown: Authoring Books and Technical Documents with R Markdown</em>. <a href="https://CRAN.R-project.org/package=bookdown" class="uri">https://CRAN.R-project.org/package=bookdown</a>.</p>
+</div>
 <div id="ref-usedtor2016">
 <p>Ismay, Chester. 2016. <em>Getting Used to R, RStudio, and R Markdown</em>. <a href="http://ismayc.github.io/rbasics-book" class="uri">http://ismayc.github.io/rbasics-book</a>.</p>
 </div>
diff --git a/docs/ismaykim.pdf b/docs/ismaykim.pdf
index 12be10103..b4084dab2 100644
Binary files a/docs/ismaykim.pdf and b/docs/ismaykim.pdf differ
diff --git a/docs/ismaykim.tex b/docs/ismaykim.tex
index dd28a7242..c5e64f31d 100644
--- a/docs/ismaykim.tex
+++ b/docs/ismaykim.tex
@@ -115,7 +115,7 @@
 \title{ModernDive}
 
 \author{Chester Ismay and Albert Y. Kim}
-\date{2017-01-07}
+\date{2017-01-10}
 
 \usepackage{booktabs}
 \usepackage{longtable}
@@ -197,7 +197,8 @@
 
 \chapter{Preamble}\label{preamble}
 
-\section{Principles of this Book}\label{principles-of-this-book}
+\section{Principles of this Book - For
+Instructors}\label{principles-of-this-book---for-instructors}
 
 These are some principles we keep in mind. If you agree with them, this
 might be the book for you.
@@ -238,7 +239,7 @@ \section{Principles of this Book}\label{principles-of-this-book}
   \begin{itemize}
   \tightlist
   \item
-    We leverage R packages for rich/complex yet easy-to-load data sets.
+    We leverage R packages for rich/complex, yet easy-to-load data sets.
   \item
     We've heard it before: ``You can't teach \texttt{ggplot2} for data
     visualization in intro stats!'' We, like
@@ -271,7 +272,7 @@ \section{Principles of this Book}\label{principles-of-this-book}
   \begin{itemize}
   \tightlist
   \item
-    Don't teach them coding/programming per se, but computation and
+    Don't teach them coding/programming per se, but computational and
     algorithmic thinking.
   \item
     Drawing Venn diagrams delineating statistics, computer science, and
@@ -296,8 +297,9 @@ \section{Principles of this Book}\label{principles-of-this-book}
     \tightlist
     \item
       You best know your audience, their background, and their
-      priorities and you know best your own style and types of examples
-      and problems you like best. Customizability is the ultimate end.
+      priorities and you know best your own style and the types of
+      examples and problems you like best. Customizability is the
+      ultimate end.
     \item
       A new paradigm for textbooks? Versions, not editions? Pull
       requests, crowd-sourcing, and development versions?
@@ -310,33 +312,39 @@ \section{Contribute}\label{contribute}
 \begin{itemize}
 \tightlist
 \item
-  This book is in beta testing and is currently at Version 0.1.0. If you
+  This book is in beta testing and is currently at Version 0.1.1. If you
   would like to receive periodic updates on this book and other similar
   projects, please fill out this
   \href{https://goo.gl/forms/IxiwBeEnk72NxMMx2}{Google Form}.
 \item
   The source code for this book is available for download/forking on
-  \href{https://github.com/ismayc/moderndiver-book}{GitHub}. If you find
-  typos or other errors or have suggestions on how to better word
-  something in the book, please create a pull request too!
+  \href{https://github.com/ismayc/moderndiver-book}{GitHub}. If you
+  click on the \textbf{release} link near the top of the page there, you
+  can download all of the source code for whichever release version
+  you'd like to work with and use. If you find typos or other errors or
+  have suggestions on how to better word something in the book, please
+  create a pull request too! We also welcome issue creation. Let's all
+  work together to make this book as great as possible for as many
+  students and instructors as possible.
 \item
   Please feel free to modify the book as you wish for your own needs!
   All we ask is that you list the authors field above as ``Chester
   Ismay, Albert Y. Kim, and YOU!''
 \item
-  We'd also appreciate if you let us now what changes you've made and
+  We'd also appreciate if you let us know what changes you've made and
   how you've used the textbook. We'd love some data on what's working
   well and what's not working so well.
 \end{itemize}
 
-\section{Getting Started}\label{getting-started}
+\section{Getting Started - For
+Students}\label{getting-started---for-students}
 
 This book was written using the \textbf{bookdown} R package from Yihui
-Xie. In order to follow along and run the code in this book on your own,
-you'll need to have access to R and RStudio. You can find more
-information on both of these with a simple Google search for ``R'' and
-for ``RStudio.'' An introduction to using R, RStudio, and R Markdown is
-also available in a free book
+Xie \citep{R-bookdown}. In order to follow along and run the code in
+this book on your own, you'll need to have access to R and RStudio. You
+can find more information on both of these with a simple Google search
+for ``R'' and for ``RStudio.'' An introduction to using R, RStudio, and
+R Markdown is also available in a free book
 \href{http://ismayc.github.io/rbasics-book}{here} \citep{usedtor2016}.
 It is recommended that you refer back to this book frequently as it has
 GIF screen recordings that you can follow along with as you learn.
@@ -344,9 +352,10 @@ \section{Getting Started}\label{getting-started}
 We will keep a running list of R packages you will need to have
 installed to complete the analysis as well here in the
 \texttt{needed\_pkgs} character vector. You can check if you have all of
-the needed packages installed by running all of the lines below. The
-last lines including the \texttt{if} will install them as needed (i.e.,
-download their needed files from the internet to your hard drive).
+the needed packages installed by running all of the lines below in the
+next chunk of R code. The last lines including the \texttt{if} will
+install them as needed (i.e., download their needed files from the
+internet to your hard drive and install them for your use).
 
 You can run the \texttt{library} function on them to load them into your
 current analysis. Prior to each analysis where a package is needed, you
@@ -452,7 +461,7 @@ \section*{Colophon}\label{colophon}
 \textbf{Book was last updated:}
 
 \begin{verbatim}
-## [1] "By Chester on Saturday, January 07, 2017 11:27:14 EST"
+## [1] "By Chester on Tuesday, January 10, 2017 21:06:31 EST"
 \end{verbatim}
 
 \chapter{Introduction}\label{intro}
@@ -477,7 +486,7 @@ \section{Preamble}\label{preamble-1}
 computers instead of focusing on memorization of formulas. The last two
 books also provide a path towards free alternatives to the traditionally
 expensive introductory statistics textbook. When looking over the vast
-number of introductory statistics textbooks we found that there wasn't
+number of introductory statistics textbooks, we found that there wasn't
 one that incorporated many of the new R packages directly into the text.
 Additionally, there wasn't an open-source, free textbook available that
 showed new learners all of the following
@@ -506,8 +515,8 @@ \section{Preamble}\label{preamble-1}
 Additionally, this book will focus on the triad of computational
 thinking, data thinking, and inferential thinking. We'll see throughout
 the book how these three modes of thinking can build effective ways to
-work with, describe, and convey statistical knowledge. In order to do
-so, you'll see the importance of literate programming to develop
+work with, to describe, and to convey statistical knowledge. In order to
+do so, you'll see the importance of literate programming to develop
 literate data science. In other words, you'll see how to write code and
 descriptions that are useful not just for a computer to execute but also
 for readers to understand exactly what a statistical analysis is doing
@@ -521,8 +530,8 @@ \section{Preamble}\label{preamble-1}
 
 \section{Three driving data sources}\label{three-driving-data-sources}
 
-Instead of hopping from one data set to the next, we've decided to focus
-throughout the book on three different data sources:
+Instead of hopping from one data set to the next in the text of this
+book, we've decided to focus throughout on three different data sources:
 
 \begin{itemize}
 \tightlist
@@ -537,7 +546,8 @@ \section{Three driving data sources}\label{three-driving-data-sources}
 By focusing on just three large data sources, it is our hope that you'll
 be able to see how each of the chapters is interconnected. You'll see
 how the data being tidy leads into data visualization and manipulation
-and how those concepts tie into inference and regression.
+in exploratory data analysis and how those concepts tie into inference
+and regression.
 
 \section{Data/science pipeline}\label{datascience-pipeline}
 
@@ -566,6 +576,8 @@ \section{Data/science pipeline}\label{datascience-pipeline}
   data modeling
 \item
   inference
+\item
+  correlation and regression
 \item
   interpretation of results
 \item
@@ -594,7 +606,7 @@ \section{Data/science pipeline}\label{datascience-pipeline}
 There's a reason so many fields require a statistics course. Scientific
 knowledge grows through an understanding of statistical significance and
 data analysis. You needn't be intimidated by statistics. It's not the
-beast that it used to be and paired with computation you'll see how
+beast that it used to be and, paired with computation, you'll see how
 reproducible research in the sciences particularly increases scientific
 knowledge.
 
@@ -613,10 +625,11 @@ \section{Reproducibility}\label{reproducibility}
 important for you to keep track of your code and well-document it to
 help yourself later and any potential collaborators as well.
 
-Copying and pasting is not the way that efficient and effective
-scientific research is conducted. It's much more important for time to
-be spent on data collection and data analysis and not on copying and
-pasting plots back and forth across a variety of programs.
+Copying and pasting results from one program into a word processor is
+not the way that efficient and effective scientific research is
+conducted. It's much more important for time to be spent on data
+collection and data analysis and not on copying and pasting plots back
+and forth across a variety of programs.
 
 In a traditional analyses if an error was made with the original data,
 we'd need to step through the entire process again: recreate the plots
@@ -634,31 +647,32 @@ \section{Reproducibility}\label{reproducibility}
 fields. Are experiments conducted in a way that another researcher could
 follow the steps and get similar results? In this book, we will focus on
 what is known as \textbf{computational reproducibility}. This refers to
-being able to pass all of one's data analysis and conclusions to someone
-else and have them get exactly the same results on their machine. This
-allows for time to be spent doing actual science and interpreting of
-results and assumptions instead of the more error prone way of starting
-from scratch or follow a list of steps that may be different from
-machine to machine.
+being able to pass all of one's data analysis, data sets, and
+conclusions to someone else and have them get exactly the same results
+on their machine. This allows for time to be spent doing actual science
+and interpreting results and assumptions instead of the more error
+prone way of starting from scratch or following a list of steps that may
+be different from machine to machine.
 
 \section{Who is this book for?}\label{who-is-this-book-for}
 
 This book is targeted at students taking a traditional intro stats class
 in a small college environment using RStudio and preferably RStudio
-Server. We assume no prerequisites: no calculus and no prior programming
-experience. This is intended to be a gentle and nice introduction to the
-practice of statistics in terms of how data scientists, statisticians,
-and other scientists analyze data and write stories about data. We have
-intentionally avoided the use of throwing formulas at you and instead
-have focused on developing statistical concepts via data visualization
-and statistical computing. We hope this is a more intuitive experience
-than the way statistics has traditionally been taught in the past (and
-how it is commonly perceived from the outside). We additionally hope
-that you see the value of reproducible research via R as you continue in
-your studies. We understand that there will initially be growing pains
-in learning to program but we are here to help you and you should know
-that there is a huge community of R users that are always happy to help
-newbies along.
+Server. We assume no prerequisites: no algebra, no calculus, and no
+prior programming experience. This is intended to be a gentle and nice
+introduction to the practice of statistics in terms of how data
+scientists, statisticians, data journalists, and other scientists
+analyze data and write stories about data. We have intentionally avoided
+throwing formulas at you as much as possible and instead have
+focused on developing statistical concepts via data visualization and
+statistical computing. We hope this is a more intuitive experience than
+the way statistics has traditionally been taught in the past (and how it
+is commonly perceived from the outside). We additionally hope that you
+see the value of reproducible research via R as you continue in your
+studies. We understand that there will initially be growing pains in
+learning to program but we are here to help you and you should know that
+there is a huge community of R users that are always happy to help
+newbies along as well.
 
 Now let's get into learning about how to create good stories about and
 with data!
@@ -670,17 +684,49 @@ \chapter{Tidy Data}\label{tidy}
 In this chapter, we'll discuss the importance of tidy data. You may
 think that this means just having your data in a spreadsheet, but you'll
 see that it is actually more specific than that. Data actually comes to
-us in a variety of formats from pictures to text and to just numbers.
-We'll focus on datasets that can be stored in a spreadsheet throughout
-this book as that is the most common way data is collected in the
-sciences.
+us in a variety of formats from pictures to text to just numbers. We'll
+focus on datasets that can be stored in a spreadsheet throughout this
+book as that is the most common way data is collected in the sciences.
 
 Having tidy data will allow us to more easily create data visualizations
 as we will see in Chapter \ref{viz}. It will also help us with
 manipulating data in Chapter \ref{manip} and in all subsequent chapters
 when we discuss statistical inference. You may not necessarily
-understand the importance for \textbf{tidy data} but it will become more
-and more apparent as we proceed through the book.
+understand the importance of \textbf{tidy data} immediately but it will
+become more and more apparent as we proceed through the book.
+
+\subsection*{Needed packages}\label{needed-packages}
+\addcontentsline{toc}{subsection}{Needed packages}
+
+At the beginning of this and all subsequent chapters, we'll always have
+a list of packages you should have installed and loaded. In particular
+we load the \texttt{nycflights13} package which we'll discuss shortly
+and the \texttt{dplyr} package for data manipulation, the subject of
+Chapter \ref{manip}.
+
+\begin{Shaded}
+\begin{Highlighting}[]
+\KeywordTok{library}\NormalTok{(nycflights13)}
+\KeywordTok{library}\NormalTok{(dplyr)}
+\end{Highlighting}
+\end{Shaded}
+
+\begin{verbatim}
+## 
+## Attaching package: 'dplyr'
+\end{verbatim}
+
+\begin{verbatim}
+## The following objects are masked from 'package:stats':
+## 
+##     filter, lag
+\end{verbatim}
+
+\begin{verbatim}
+## The following objects are masked from 'package:base':
+## 
+##     intersect, setdiff, setequal, union
+\end{verbatim}
 
 \section{What is tidy data?}\label{what-is-tidy-data}
 
@@ -705,12 +751,12 @@ \section{What is tidy data?}\label{what-is-tidy-data}
   Grant
 \end{itemize}
 
-So what does it mean for your data to be \textbf{tidy}? Put simply: it
+So what does it mean for your data to be \textbf{tidy}? Put simply, it
 means that your data is organized. But it's more than just that. It
 means that your data follows the same standard format making it easy for
 others to find elements of your data, to manipulate and transform your
-data, and for our purposes continuing with the common theme: it makes it
-easier to visualize your data and the relationships between different
+data, and, for our purposes, continuing with the common theme: it makes
+it easier to visualize your data and the relationships between different
 variables in your data.
 
 We will follow Hadley Wickham's definition of \textbf{tidy data} here
@@ -756,7 +802,8 @@ \section{What is tidy data?}\label{what-is-tidy-data}
 \end{figure}
 
 Reading over this definition, you can begin to think about datasets that
-won't follow this nice format.
+won't follow this nice format. This format of data is also known as
+``long'' format.
 
 \begin{center}\rule{0.5\linewidth}{\linethickness}\end{center}
 
@@ -775,10 +822,29 @@ \section{What is tidy data?}\label{what-is-tidy-data}
   How could the dataset be tweaked to make it \textbf{tidy}?
 \end{itemize}
 
+\textbf{(LC3.2)} Say the following table contains stock prices. How would
+you make it tidy?
+
+\begin{tabular}{l|r|r|r}
+\hline
+time & x & y & z\\
+\hline
+2009-01-01 & 0.630 & -0.972 & -3.277\\
+\hline
+2009-01-02 & -1.568 & -1.329 & 0.686\\
+\hline
+2009-01-03 & -0.899 & -0.545 & -3.828\\
+\hline
+2009-01-04 & -0.017 & -0.185 & 4.114\\
+\hline
+2009-01-05 & -0.483 & -3.478 & 0.215\\
+\hline
+\end{tabular}
+
 \begin{center}\rule{0.5\linewidth}{\linethickness}\end{center}
 
-\section{\texorpdfstring{The \texttt{nycflights13}
-datasets}{The nycflights13 datasets}}\label{the-nycflights13-datasets}
+\section{\texorpdfstring{Datasets in the \texttt{nycflights13}
+package}{Datasets in the nycflights13 package}}\label{datasets-in-the-nycflights13-package}
 
 We likely have all flown on airplanes or know someone that has. Air
 travel has become an ever-present aspect of our daily lives. If you live
@@ -791,55 +857,48 @@ \section{\texorpdfstring{The \texttt{nycflights13}
 We'd all like to arrive at our destinations on time whenever possible.
 (Unless you secretly love hanging out at airports. If you are one of
 these people, pretend for the moment that you are very much anticipating
-being at your final destination.) Hadley Wickham (herein just referred
-to as ``Hadley'') created multiple datasets containing information about
-departing flights from the New York City area in 2013
-\citep{R-nycflights13}. We will begin by loading in one of these
-datasets, the \texttt{flights} dataset, and getting an idea of its
-structure:
+being at your final destination.) Throughout this book, we're going to
+analyze data related to flights contained in the \texttt{nycflights13}
+package we loaded earlier \citep{R-nycflights13}. Specifically, this
+package contains information about all flights that departed from NYC
+(i.e.~EWR, JFK and LGA) in 2013 in 5 data sets:
 
-\begin{Shaded}
-\begin{Highlighting}[]
-\KeywordTok{library}\NormalTok{(nycflights13)}
-\KeywordTok{data}\NormalTok{(flights)}
-\end{Highlighting}
-\end{Shaded}
+\begin{itemize}
+\tightlist
+\item
+  \texttt{flights}: information on all 336,776 flights
+\item
+  \texttt{weather}: hourly meteorological data for each airport
+\item
+  \texttt{planes}: construction information about each plane
+\item
+  \texttt{airports}: airport names and locations
+\item
+  \texttt{airlines}: translation between two letter carrier codes and
+  names
+\end{itemize}
 
-The \texttt{library} function here loads the R package
-\texttt{nycflights13} into the current R environment in which you are
-working. The \texttt{data(flights)} loads in the \texttt{flights}
-dataset that is stored in the \texttt{nycflights13} package. Note that
-you'll get an error if you try to load this package in and it hasn't
-been downloaded and installed. You can ensure it is installed by running
-the code below:
+We will begin by loading in the \texttt{flights} dataset and getting an
+idea of its structure. Run the following in your console:
 
 \begin{Shaded}
 \begin{Highlighting}[]
-\NormalTok{if(!}\KeywordTok{require}\NormalTok{(nycflights13))}
-  \KeywordTok{install.packages}\NormalTok{(}\StringTok{"nycflights13"}\NormalTok{, }\DataTypeTok{repos =} \StringTok{"http://cran.rstudio.org"}\NormalTok{)}
+\KeywordTok{data}\NormalTok{(flights)}
 \end{Highlighting}
 \end{Shaded}
 
-This code checks to see if \texttt{nycflights13} is installed and, if
-not, then goes to the specified repository of
-``\url{http://cran.rstudio.org}'' and downloads the package from there
-and installs it. If it is already installed you can see it listed in the
-\textbf{Packages} tab in the bottom right portion of RStudio and the
-code will not install the package again since this is redundant and you
-won't need to do it over and over again.
+This line of code loads in the \texttt{flights} dataset that is stored
+in the \texttt{nycflights13} package. This dataset and most others
+presented in this book will be in the ``data frame'' format in R. Data
+frames are essentially spreadsheets and allow us to look at collections
+of variables that are tightly coupled together.
 
-This dataset and most others presented in this book will be in the
-\texttt{data.frame} format in R. Data frames are ways to look at
-collections of variables that are tightly coupled together. Frequently,
-the best way to get a feel for a data frame is to use the \texttt{View}
+The best way to get a feel for a data frame is to use the \texttt{View}
 function in RStudio. This command will be given throughout the book as a
-reminder, but the actual output will be hidden.
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\KeywordTok{View}\NormalTok{(flights)}
-\end{Highlighting}
-\end{Shaded}
+reminder, but the actual output will be hidden. Run
+\texttt{View(flights)} in R and look over this data frame. You should
+slowly get into the habit of always \texttt{View}ing any data frames
+that come your way.
 
 \begin{center}\rule{0.5\linewidth}{\linethickness}\end{center}
 
@@ -847,7 +906,7 @@ \section{\texorpdfstring{The \texttt{nycflights13}
 \textbf{\emph{Learning check}}
 \end{learncheck}
 
-\textbf{(LC3.2)} What does any \emph{ONE} row in this \texttt{flights}
+\textbf{(LC3.3)} What does any \emph{ONE} row in this \texttt{flights}
 dataset refer to?
 
 \begin{itemize}
@@ -878,7 +937,9 @@ \section{\texorpdfstring{The \texttt{nycflights13}
 to. In other words, this will allow you to identify what object is being
 referred to in a given row. This is often called the
 \textbf{observational unit}. The \textbf{observational unit} in this
-example is an individual flight departing New York City in 2013.
+example is an individual flight departing New York City in 2013. You can
+identify the observational unit by determining what the \textbf{thing}
+is that is being measured in each of the variables.
 
 \textbf{Note}: Frequently the first thing you should do when given a
 dataset is to
@@ -893,33 +954,37 @@ \section{\texorpdfstring{The \texttt{nycflights13}
   give the types of variables you are presented with.
 \end{itemize}
 
+The \texttt{glimpse()} command in the \texttt{dplyr} package provides us
+with much of the above information and more:
+
 \begin{Shaded}
 \begin{Highlighting}[]
-\KeywordTok{str}\NormalTok{(flights)}
+\KeywordTok{glimpse}\NormalTok{(flights)}
 \end{Highlighting}
 \end{Shaded}
 
 \begin{verbatim}
-## Classes 'tbl_df', 'tbl' and 'data.frame':    336776 obs. of  19 variables:
-##  $ year          : int  2013 2013 2013 2013 2013 2013 2013 2013 2013 2013 ...
-##  $ month         : int  1 1 1 1 1 1 1 1 1 1 ...
-##  $ day           : int  1 1 1 1 1 1 1 1 1 1 ...
-##  $ dep_time      : int  517 533 542 544 554 554 555 557 557 558 ...
-##  $ sched_dep_time: int  515 529 540 545 600 558 600 600 600 600 ...
-##  $ dep_delay     : num  2 4 2 -1 -6 -4 -5 -3 -3 -2 ...
-##  $ arr_time      : int  830 850 923 1004 812 740 913 709 838 753 ...
-##  $ sched_arr_time: int  819 830 850 1022 837 728 854 723 846 745 ...
-##  $ arr_delay     : num  11 20 33 -18 -25 12 19 -14 -8 8 ...
-##  $ carrier       : chr  "UA" "UA" "AA" "B6" ...
-##  $ flight        : int  1545 1714 1141 725 461 1696 507 5708 79 301 ...
-##  $ tailnum       : chr  "N14228" "N24211" "N619AA" "N804JB" ...
-##  $ origin        : chr  "EWR" "LGA" "JFK" "JFK" ...
-##  $ dest          : chr  "IAH" "IAH" "MIA" "BQN" ...
-##  $ air_time      : num  227 227 160 183 116 150 158 53 140 138 ...
-##  $ distance      : num  1400 1416 1089 1576 762 ...
-##  $ hour          : num  5 5 5 5 6 5 6 6 6 6 ...
-##  $ minute        : num  15 29 40 45 0 58 0 0 0 0 ...
-##  $ time_hour     : POSIXct, format: "2013-01-01 05:00:00" ...
+## Observations: 336,776
+## Variables: 19
+## $ year           <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 20...
+## $ month          <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
+## $ day            <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
+## $ dep_time       <int> 517, 533, 542, 544, 554, 554, 555, 557, 557,...
+## $ sched_dep_time <int> 515, 529, 540, 545, 600, 558, 600, 600, 600,...
+## $ dep_delay      <dbl> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2,...
+## $ arr_time       <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838...
+## $ sched_arr_time <int> 819, 830, 850, 1022, 837, 728, 854, 723, 846...
+## $ arr_delay      <dbl> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2...
+## $ carrier        <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "E...
+## $ flight         <int> 1545, 1714, 1141, 725, 461, 1696, 507, 5708,...
+## $ tailnum        <chr> "N14228", "N24211", "N619AA", "N804JB", "N66...
+## $ origin         <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "E...
+## $ dest           <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "F...
+## $ air_time       <dbl> 227, 227, 160, 183, 116, 150, 158, 53, 140, ...
+## $ distance       <dbl> 1400, 1416, 1089, 1576, 762, 719, 1065, 229,...
+## $ hour           <dbl> 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6,...
+## $ minute         <dbl> 15, 29, 40, 45, 0, 58, 0, 0, 0, 0, 0, 0, 0, ...
+## $ time_hour      <dttm> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2...
 \end{verbatim}
 
 \begin{center}\rule{0.5\linewidth}{\linethickness}\end{center}
@@ -928,33 +993,27 @@ \section{\texorpdfstring{The \texttt{nycflights13}
 \textbf{\emph{Learning check}}
 \end{learncheck}
 
-\textbf{(LC3.3)} What are some examples in this dataset of
+\textbf{(LC3.4)} What are some examples in this dataset of
 \textbf{categorical} variables? What makes them different than
 \textbf{quantitative} variables?
 
-\textbf{(LC3.4)} What does \texttt{int}, \texttt{num}, and \texttt{chr}
+\textbf{(LC3.5)} What do \texttt{int}, \texttt{dbl}, and \texttt{chr}
 mean in the output above?
 
-\textbf{(LC3.5)} How many different columns are in this dataset?
+\textbf{(LC3.6)} How many different columns are in this dataset?
 
-\textbf{(LC3.6)} How many different rows are in this dataset?
+\textbf{(LC3.7)} How many different rows are in this dataset?
 
 \begin{center}\rule{0.5\linewidth}{\linethickness}\end{center}
 
-Another way to view the properties of a dataset is to use the
-\texttt{str} function (``str'' is short for ``structure''). The
-\texttt{str} function is expecting an object for its argument. In this
-case, the object is a data frame named \texttt{flights}. You can use the
-\texttt{str} function on other objects and data frames using the syntax
-\texttt{str(object)} where \texttt{object} is the name of an object in
-R. This will give you the first few entries of each variable in a row
-after the variable. In addition, the type of the variable is given
-immediately after the \texttt{:} following each variable's name. Here,
-\texttt{int} and \texttt{num} refer to quantitative variables. In
-contrast, \texttt{chr} refers to categorical variables. One more type of
-variable is given here with the \texttt{time\_hour} variable:
-\textbf{POSIXct}. As you may suspect, this variable corresponds to a
-specific date and time of day.
+We see that \texttt{glimpse} will give you the first few entries of each
+variable in a row after the variable. In addition, the type of the
+variable is given immediately after each variable's name inside
+\texttt{\textless{}\ \textgreater{}}. Here, \texttt{int} and
+\texttt{dbl} refer to quantitative variables. In contrast, \texttt{chr}
+refers to categorical variables. One more type of variable is given here
+with the \texttt{time\_hour} variable: \textbf{dttm}. As you may
+suspect, this variable corresponds to a specific date and time of day.
 
 Another nice feature of R is the help system. You can get help in R by
 simply entering a question mark before the name of a function or an
@@ -965,7 +1024,7 @@ \section{\texorpdfstring{The \texttt{nycflights13}
 
 \begin{Shaded}
 \begin{Highlighting}[]
-\NormalTok{?str}
+\NormalTok{?glimpse}
 \NormalTok{?flights}
 \end{Highlighting}
 \end{Shaded}
@@ -988,7 +1047,7 @@ \section{\texorpdfstring{How is \texttt{flights}
 We see that \texttt{flights} has a rectangular shape with each row
 corresponding to a different flight and each column corresponding to a
 characteristic of that flight. This matches exactly with how Hadley
-defined tidy data:
+Wickham defined tidy data:
 
 \begin{enumerate}
 \def\labelenumi{\arabic{enumi}.}
@@ -1036,7 +1095,7 @@ \section{\texorpdfstring{How is \texttt{flights}
 \end{itemize}
 
 You may have been asking yourself what \texttt{carrier} refers to in the
-\texttt{str(flights)} output above. The \texttt{airlines} dataset
+\texttt{glimpse(flights)} output above. The \texttt{airlines} dataset
 provides a description of this with each airline being the observational
 unit:
 
@@ -1048,7 +1107,9 @@ \section{\texorpdfstring{How is \texttt{flights}
 \end{Shaded}
 
 \begin{verbatim}
+## # A tibble: 16 × 2
 ##    carrier                        name
+##      <chr>                       <chr>
 ## 1       9E           Endeavor Air Inc.
 ## 2       AA      American Airlines Inc.
 ## 3       AS        Alaska Airlines Inc.
@@ -1073,6 +1134,88 @@ \section{\texorpdfstring{How is \texttt{flights}
 RStudio since larger objects may take a while to print to the screen and
 it likely won't be helpful to you to have hundreds of lines outputted.
 
+\begin{center}\rule{0.5\linewidth}{\linethickness}\end{center}
+
+\begin{learncheck}
+\textbf{\emph{Learning check}}
+\end{learncheck}
+
+\textbf{(LC3.8)} Run the following block of code in R to load and view
+each of the four other data frames in the \texttt{nycflights13} package.
+Switch between the different tabs that have opened to view each of the
+four data frames. Describe in two sentences for each data frame what
+stands out to you and what the most important features are of each.
+
+\begin{Shaded}
+\begin{Highlighting}[]
+\KeywordTok{data}\NormalTok{(weather)}
+\KeywordTok{data}\NormalTok{(planes)}
+\KeywordTok{data}\NormalTok{(airports)}
+\KeywordTok{data}\NormalTok{(airlines)}
+\KeywordTok{View}\NormalTok{(weather)}
+\KeywordTok{View}\NormalTok{(planes)}
+\KeywordTok{View}\NormalTok{(airports)}
+\KeywordTok{View}\NormalTok{(airlines)}
+\end{Highlighting}
+\end{Shaded}
+
+\begin{center}\rule{0.5\linewidth}{\linethickness}\end{center}
+
+\subsection{Identification variables}\label{identification-variables}
+
+There is a subtle difference between the kinds of variables that you
+will encounter in data frames. The \texttt{airports} data frame you
+worked with above contains variables of these different kinds. Let's pull
+them apart using the \texttt{glimpse} function:
+
+\begin{Shaded}
+\begin{Highlighting}[]
+\KeywordTok{glimpse}\NormalTok{(airports)}
+\end{Highlighting}
+\end{Shaded}
+
+\begin{verbatim}
+## Observations: 1,458
+## Variables: 8
+## $ faa   <chr> "04G", "06A", "06C", "06N", "09J", "0A9", "0G6", "0G7...
+## $ name  <chr> "Lansdowne Airport", "Moton Field Municipal Airport",...
+## $ lat   <dbl> 41.13047, 32.46057, 41.98934, 41.43191, 31.07447, 36....
+## $ lon   <dbl> -80.61958, -85.68003, -88.10124, -74.39156, -81.42778...
+## $ alt   <int> 1044, 264, 801, 523, 11, 1593, 730, 492, 1000, 108, 4...
+## $ tz    <dbl> -5, -6, -6, -5, -5, -5, -5, -5, -5, -8, -5, -6, -5, -...
+## $ dst   <chr> "A", "A", "A", "A", "A", "A", "A", "A", "U", "A", "A"...
+## $ tzone <chr> "America/New_York", "America/Chicago", "America/Chica...
+\end{verbatim}
+
+The variables \texttt{faa} and \texttt{name} are what we will call
+\emph{identification variables}. They are mainly used to provide a name
+to the observational unit. Here the observational unit is an airport and
+the \texttt{faa} gives the code provided by the FAA for that airport
+while the \texttt{name} variable gives the longer more natural name of
+the airport. These ID variables differ from the other variables that are
+often called \emph{measurement} or \emph{characteristic} variables. The
+remaining variables (aside from \texttt{faa} and \texttt{name}) are of
+this type in \texttt{airports}. They don't uniquely identify the
+observational unit, but instead describe properties of the observational
+unit. For organizational purposes, it is best practice to have your
+identification variables in the far leftmost columns of your data frame.
+
+\begin{center}\rule{0.5\linewidth}{\linethickness}\end{center}
+
+\begin{learncheck}
+\textbf{\emph{Learning check}}
+\end{learncheck}
+
+\textbf{(LC3.9)} What properties of the observational unit do each of
+\texttt{lat}, \texttt{lon}, \texttt{alt}, \texttt{tz}, \texttt{dst}, and
+\texttt{tzone} describe for the \texttt{airports} data frame?
+
+\textbf{(LC3.10)} Provide the names of variables in a data frame with at
+least three variables in which one of them is an identification variable
+and the other two are not.
+
+\begin{center}\rule{0.5\linewidth}{\linethickness}\end{center}
+
 \section{Normal forms of data}\label{normal-forms-of-data}
 
 The datasets included in the \texttt{nycflights13} package are in a form
@@ -1095,12 +1238,12 @@ \section{Normal forms of data}\label{normal-forms-of-data}
 \texttt{airlines} data frame together with the \texttt{flights} data
 frame by linking together the two datasets via a common \textbf{key} of
 \texttt{"carrier"}. Note that this ``joined'' data frame is assigned to
-a new data frame called \texttt{joined\_flights}.
+a new data frame called \texttt{joined\_flights}. The \textbf{key}
+variable that we frequently join by is one of the \emph{identification
+variables} mentioned above.
 
 \begin{Shaded}
 \begin{Highlighting}[]
-\NormalTok{if(!}\KeywordTok{require}\NormalTok{(nycflights13))}
-  \KeywordTok{install.packages}\NormalTok{(}\StringTok{"nycflights13"}\NormalTok{, }\DataTypeTok{repos =} \StringTok{"http://cran.rstudio.org"}\NormalTok{)}
 \KeywordTok{library}\NormalTok{(dplyr)}
 \NormalTok{joined_flights <-}\StringTok{ }\KeywordTok{inner_join}\NormalTok{(}\DataTypeTok{x =} \NormalTok{flights, }\DataTypeTok{y =} \NormalTok{airlines, }\DataTypeTok{by =} \StringTok{"carrier"}\NormalTok{)}
 \end{Highlighting}
@@ -1113,12 +1256,11 @@ \section{Normal forms of data}\label{normal-forms-of-data}
 \end{Shaded}
 
 If we \texttt{View} this dataset, we see a new variable has been created
-called (We will see in Subsection 5.1.1 ways to change \texttt{name} to
-a more descriptive variable name.)
-
-More discussion about joining data frames together will be given in
-Chapter \ref{manip}. We will see there that the names of the columns to
-be linked need not match as they did here with \texttt{"carrier"}.
+called \texttt{name}. (We will see in Subsection \ref{rename} ways to
+change \texttt{name} to a more descriptive variable name.) More
+discussion about joining data frames together will be given in Chapter
+\ref{manip}. We will see there that the names of the columns to be
+linked need not match as they did here with \texttt{"carrier"}.
 
 \begin{center}\rule{0.5\linewidth}{\linethickness}\end{center}
 
@@ -1219,11 +1361,10 @@ \section{What's to come?}\label{whats-to-come}
 understanding how this variable varies in relation to the values of
 other variables in the dataset. We will see that visualization is often
 a powerful tool in helping us see what is going on in a dataset. It will
-be a useful way to expand on the \texttt{str} function we have seen here
-for tidy data.
+be a useful way to expand on the \texttt{glimpse} function we have seen
+here for tidy data.
 
-\chapter{\texorpdfstring{Data Visualization via
-\texttt{ggplot2}}{Data Visualization via ggplot2}}\label{data-visualization-via-ggplot2}
+\chapter{Data Visualization via ggplot2}\label{viz}
 
 In Chapter \ref{tidy}, we discussed the importance of datasets being
 \textbf{tidy}. You will see in examples here why having a tidy dataset
@@ -1245,29 +1386,30 @@ \chapter{\texorpdfstring{Data Visualization via
 relationships and interesting findings can be easily seen. As we will
 see, plots/graphics also help us to identify patterns and outliers in
 our data. We will see that a common extension of these ideas is to
-compare the distribution of one quantitative variable (i.e., what the
-spread of a variable looks like) as we go across the levels of a
-different categorical variable.
+compare the \textbf{distribution} of one quantitative variable (i.e.,
+what the spread of a variable looks like or how the variable is
+\emph{distributed} in terms of its values) as we go across the levels of
+a different categorical variable.
 
-\section*{Needed packages}\label{needed-packages}
-\addcontentsline{toc}{section}{Needed packages}
+\subsection*{Needed packages}\label{needed-packages-1}
+\addcontentsline{toc}{subsection}{Needed packages}
 
 Before we proceed with this chapter, let's load all the necessary
-packages, in particular the \texttt{nycflights13} package introduced in
-Chapter \ref{tidy} containing various data sets.
+packages.
 
 \begin{Shaded}
 \begin{Highlighting}[]
-\KeywordTok{library}\NormalTok{(dplyr)}
 \KeywordTok{library}\NormalTok{(ggplot2)}
 \KeywordTok{library}\NormalTok{(nycflights13)}
+\KeywordTok{library}\NormalTok{(knitr)}
+\KeywordTok{library}\NormalTok{(dplyr)}
 \end{Highlighting}
 \end{Shaded}
 
 \section{The Grammar of Graphics}\label{grammarofgraphics}
 
 We begin with a discussion of a theoretical framework for data
-visualization known as the ``The Grammar of Graphics'', which serves as
+visualization known as ``The Grammar of Graphics,'' which serves as
 the basis for the \texttt{ggplot2} package. Much like the way we
 construct sentences in any language using a linguistic grammar (nouns,
 verbs, subjects, objects, etc.), the theoretical framework given by
@@ -1308,7 +1450,7 @@ \subsection{Napolean's March on Moscow}\label{napoleans-march-on-moscow}
 It was one of the biggest military disasters due in large part to the
 Russian winter. In 1869, a French civil engineer named Charles Joseph
 Minard published arguably one of the greatest statistical visualizations
-of all time which summarized this march:
+of all time, which summarized this march:
 
 \begin{figure}
 
@@ -1325,7 +1467,7 @@ \subsection{Napolean's March on Moscow}\label{napoleans-march-on-moscow}
 Let's view this graphic through the lens of the Grammar of Graphics:
 
 \begin{table}
-\caption{\label{tab:unnamed-chunk-13}Grammar of Map (Top) and Line-Graph (Bottom) in Minard's Graphic of Napolean's March}
+\caption{\label{tab:unnamed-chunk-16}Grammar of Map (Top) and Line-Graph (Bottom) in Minard's Graphic of Napoleon's March}
 
 \centering
 \begin{tabular}[t]{lll}
@@ -1349,12 +1491,12 @@ \subsection{Napolean's March on Moscow}\label{napoleans-march-on-moscow}
 \end{tabular}
 \end{table}
 
-For example, the data variable \texttt{longitude} gets mapped to
+For example, the data variable \texttt{longitude} gets mapped to the
 \texttt{x} \texttt{aes}thetic of the points \texttt{geom}etric objects
 on the map while the annotated line-graph displays \texttt{date} and
 \texttt{temperature} variable information via its mapping to the
-\texttt{x} and \texttt{y} aesthetic of the line \texttt{geom}etric
-object.
+\texttt{x} and \texttt{y} \texttt{aes}thetic of the line
+\texttt{geom}etric object.
 
 \subsection{Other Components of the
 Grammar}\label{other-components-of-the-grammar}
@@ -1367,7 +1509,7 @@ \subsection{Other Components of the
   \texttt{facet}: how to break up a plot into subsets
 \item
   \texttt{stat}istical transformations: this includes smoothing, binning
-  values into a histogram, or just itself untransformed
+  values into a histogram, or just itself un-transformed as
   \texttt{"identity"}.
 \item
   \texttt{scales} both
@@ -1383,15 +1525,15 @@ \subsection{Other Components of the
   \end{itemize}
 \item
   \texttt{coord}inate system for x/y values: typically
-  \texttt{cartesian}, but can also be \texttt{polar}, \texttt{map}
+  \texttt{cartesian}, but can also be \texttt{polar} or \texttt{map}
 \item
   \texttt{position} adjustments
 \end{itemize}
 
 In this text, we will only focus on the first two: \texttt{facet}ing
 (introduced in Section \ref{facets}) and \texttt{stat}istical
-transformations (in a limited sense when consider Barplots in Section
-\ref{geombar}) ; the other components are left to a more advanced text.
+transformations (in a limited sense, when we consider Barplots in Section
+\ref{geombar}); the other components are left to a more advanced text.
 This is not a problem when producing a plot as each of these components
 has default settings.
 
@@ -1401,10 +1543,9 @@ \subsection{Other Components of the
 consistent framework that allows the user to easily tweak their
 creations as needed in order to convey a message about their data.
 
-\subsection{\texorpdfstring{The \texttt{ggplot2}
-Package}{The ggplot2 Package}}\label{the-ggplot2-package}
+\subsection{The ggplot2 Package}\label{the-ggplot2-package}
 
-We introduce Hadley Wickham's \texttt{ggplot2} package, which is an
+We next introduce Hadley Wickham's \texttt{ggplot2} package, which is an
 implementation of the Grammar of Graphics for R \citep{R-ggplot2}. You
 may have noticed that a lot of previous text in this chapter is written
 in computer font. This is because the various components of the Grammar
@@ -1424,40 +1565,6 @@ \subsection{\texorpdfstring{The \texttt{ggplot2}
 The names of the variables will be entered into the \texttt{aes}
 function as arguments where \texttt{aes} stands for ``aesthetics''.
 
-The plot given above is not a histogram, but the output does show us a
-bit of what is going on with
-\texttt{ggplot(data\ =\ weather,\ mapping\ =\ aes(x\ =\ temp))}. It is
-producing a backdrop onto which we will ``paint'' elements. We next
-proceed by adding a layer---hence, the use of the \texttt{+} symbol---to
-the plot to produce a histogram. (Note also here that we don't have to
-specify the \texttt{data\ =} and \texttt{mapping\ =} text in our
-function calls. This is covered in more detail in Chapter 5 of the
-``Getting Used to R, RStudio, and R Markdown'' book
-\citep{usedtor2016}).
-
-You are encouraged to enter \textbf{Return} on your keyboard after
-entering the \texttt{+}. As we add more and more elements, it will be
-nice to keep them indented as you see below. Note that this will not
-work if you begin the line with the \texttt{+}.
-
-An excellent resource as you begin to create plots using the
-\texttt{ggplot2} package is a cheatsheet that RStudio has put together
-entitled ``Data Visualization with ggplot2'' available
-
-\begin{itemize}
-\tightlist
-\item
-  By clicking
-  \href{https://www.rstudio.com/wp-content/uploads/2015/12/ggplot2-cheatsheet-2.0.pdf}{here}
-\item
-  or by clicking the RStudio Menu Bar -\textgreater{} Help
-  -\textgreater{} Cheatsheets -\textgreater{} ``Data Visualization with
-  ggplot2''
-\end{itemize}
-
-This covers more than what we've discussed in this chapter but provides
-nice visual descriptions of what each function produces.
-
 \section{Five Named Graphs - The 5NG}\label{five-named-graphs---the-5ng}
 
 For our purposes, we will be limiting consideration to five different
@@ -1502,20 +1609,20 @@ \section{5NG\#1: Scatter-plots}\label{scatterplots}
 \def\labelenumi{\arabic{enumi}.}
 \tightlist
 \item
-  \texttt{dep\_delay}: departure delay on the horizontal ``x'' axis
+  \texttt{dep\_delay}: departure delay on the horizontal ``x'' axis and
 \item
   \texttt{arr\_delay}: arrival delay on the vertical ``y'' axis
 \end{enumerate}
 
 for Alaska Airlines flights leaving NYC in 2013. This requires paring
 down the \texttt{flights} data frame to a smaller data frame
-\texttt{alaska\_flights} consisting of only Alaska Airlines (carrier
-code ``AS'') flights.
+\texttt{all\_alaska\_flights} consisting of only Alaska Airlines
+(carrier code ``AS'') flights.
 
 \begin{Shaded}
 \begin{Highlighting}[]
 \KeywordTok{data}\NormalTok{(flights)}
-\NormalTok{alaska_flights <-}\StringTok{ }\NormalTok{flights %>%}\StringTok{ }
+\NormalTok{all_alaska_flights <-}\StringTok{ }\NormalTok{flights %>%}\StringTok{ }
 \StringTok{  }\KeywordTok{filter}\NormalTok{(carrier ==}\StringTok{ "AS"}\NormalTok{)}
 \end{Highlighting}
 \end{Shaded}
@@ -1533,22 +1640,21 @@ \section{5NG\#1: Scatter-plots}\label{scatterplots}
 \textbf{\emph{Learning check}}
 \end{learncheck}
 
-\textbf{(LC3.1)} Take a look at both the \texttt{flights} and
-\texttt{alaska\_flights} data frames by running \texttt{View(flights)}
-and \texttt{View(alaska\_flights)} in the console. In what respect do
-these data frames differ?
+\textbf{(LC4.1)} Take a look at both the \texttt{flights} and
+\texttt{all\_alaska\_flights} data frames by running
+\texttt{View(flights)} and \texttt{View(all\_alaska\_flights)} in the
+console. In what respect do these data frames differ?
 
 \begin{center}\rule{0.5\linewidth}{\linethickness}\end{center}
 
-\subsection{\texorpdfstring{Scatter-plots via
-\texttt{geom\_point}}{Scatter-plots via geom\_point}}\label{scatter-plots-via-geom_point}
+\subsection{Scatter-plots via geom\_point}\label{geompoint}
 
 We proceed to create the scatter-plot using the \texttt{ggplot()}
 function:
 
 \begin{Shaded}
 \begin{Highlighting}[]
-\KeywordTok{ggplot}\NormalTok{(}\DataTypeTok{data=}\NormalTok{alaska_flights, }\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =} \NormalTok{dep_delay, }\DataTypeTok{y =} \NormalTok{arr_delay)) +}\StringTok{ }
+\KeywordTok{ggplot}\NormalTok{(}\DataTypeTok{data =} \NormalTok{all_alaska_flights, }\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =} \NormalTok{dep_delay, }\DataTypeTok{y =} \NormalTok{arr_delay)) +}\StringTok{ }
 \StringTok{  }\KeywordTok{geom_point}\NormalTok{()}
 \end{Highlighting}
 \end{Shaded}
@@ -1562,6 +1668,11 @@ \subsection{\texorpdfstring{Scatter-plots via
 \caption[Arrival Delays vs Departure Delays for Alaska Airlines flights from NYC in 2013]{Arrival Delays vs Departure Delays for Alaska Airlines flights from NYC in 2013}\label{fig:noalpha}
 \end{figure}
 
+You are encouraged to press \textbf{Return} on your keyboard after
+entering the \texttt{+}. As we add more and more elements, it will be
+nice to keep them indented as you see below. Note that this will not
+work if you begin the line with the \texttt{+}.
+
 Let's break this down, keeping in mind our discussion in Section
 \ref{grammarofgraphics}:
 
@@ -1575,8 +1686,8 @@ \subsection{\texorpdfstring{Scatter-plots via
   \def\labelenumi{\arabic{enumi}.}
   \tightlist
   \item
-    The \texttt{data} frame to be \texttt{alaska\_flights} by setting
-    \texttt{data=alaska\_flights}
+    The \texttt{data} frame to be \texttt{all\_alaska\_flights} by
+    setting \texttt{data\ =\ all\_alaska\_flights}
   \item
     The \texttt{aes}thetic mapping by setting
     \texttt{aes(x\ =\ dep\_delay,\ y\ =\ arr\_delay)}. Specifically
@@ -1602,7 +1713,8 @@ \subsection{\texorpdfstring{Scatter-plots via
 between \texttt{dep\_delay} and \texttt{arr\_delay}: as departure delays
 increase, arrival delays tend to also increase. We also note that the
 majority of points fall near the point (0, 0). There is a large mass of
-points clustered there.
+points clustered there. (We will work more with this data set in Chapter
+\ref{regress}, where we investigate correlation and linear regression.)
 
 \begin{center}\rule{0.5\linewidth}{\linethickness}\end{center}
 
@@ -1610,22 +1722,23 @@ \subsection{\texorpdfstring{Scatter-plots via
 \textbf{\emph{Learning check}}
 \end{learncheck}
 
-\textbf{(LC3.2)} What are some practical reasons why \texttt{dep\_delay}
+\textbf{(LC4.2)} What are some practical reasons why \texttt{dep\_delay}
 and \texttt{arr\_delay} have a positive relationship?
 
-\textbf{(LC3.3)} What variables (not necessarily in the \texttt{flights}
+\textbf{(LC4.3)} What variables (not necessarily in the \texttt{flights}
 data frame) would you expect to have a negative correlation (i.e.~a
 negative relationship) with \texttt{dep\_delay}? Why? Remember that we
 are focusing on continuous variables here.
 
-\textbf{(LC3.4)} Why do you believe there is a cluster of points near
+\textbf{(LC4.4)} Why do you believe there is a cluster of points near
 (0, 0)? What does (0, 0) correspond to in terms of the Alaskan flights?
 
-\textbf{(LC3.5)} What are some other features of the plot that stand out
+\textbf{(LC4.5)} What are some other features of the plot that stand out
 to you?
 
-\textbf{(LC3.6)} Create a new scatter-plot using different variables in
-the \texttt{alaska\_flights} data frame by modifying the example above.
+\textbf{(LC4.6)} Create a new scatter-plot using different variables in
+the \texttt{all\_alaska\_flights} data frame by modifying the example
+above.
 
 \begin{center}\rule{0.5\linewidth}{\linethickness}\end{center}
 
@@ -1651,12 +1764,12 @@ \subsection{Over-Plotting}\label{over-plotting}
 The first way of relieving over-plotting is by changing the
 \texttt{alpha} argument to \texttt{geom\_point()} which controls the
 transparency of the points. By default, this value is set to \texttt{1}.
-We can change this value to a smaller fraction to change the
-transparency of the points in the plot:
+We can change this value to a smaller fraction (greater than 0) to
+change the transparency of the points in the plot:
 
 \begin{Shaded}
 \begin{Highlighting}[]
-\KeywordTok{ggplot}\NormalTok{(}\DataTypeTok{data=}\NormalTok{alaska_flights, }\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =} \NormalTok{dep_delay, }\DataTypeTok{y =} \NormalTok{arr_delay)) +}\StringTok{ }
+\KeywordTok{ggplot}\NormalTok{(}\DataTypeTok{data =} \NormalTok{all_alaska_flights, }\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =} \NormalTok{dep_delay, }\DataTypeTok{y =} \NormalTok{arr_delay)) +}\StringTok{ }
 \StringTok{  }\KeywordTok{geom_point}\NormalTok{(}\DataTypeTok{alpha =} \FloatTok{0.2}\NormalTok{)}
 \end{Highlighting}
 \end{Shaded}
@@ -1672,7 +1785,7 @@ \subsection{Over-Plotting}\label{over-plotting}
 
 Note how this function call is identical to the one in Section
 \ref{scatterplots}, but with \texttt{geom\_point()} replaced with
-\texttt{alpha=0.2} added.
+\texttt{geom\_point(alpha\ =\ 0.2)}, i.e., with \texttt{alpha\ =\ 0.2} added.
 
 The second way of relieving over-plotting is to \textbf{jitter} the
 points a bit. In other words, we are going to add just a bit of random
@@ -1687,7 +1800,7 @@ \subsection{Over-Plotting}\label{over-plotting}
 
 \begin{Shaded}
 \begin{Highlighting}[]
-\KeywordTok{ggplot}\NormalTok{(}\DataTypeTok{data=}\NormalTok{alaska_flights, }\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =} \NormalTok{dep_delay, }\DataTypeTok{y =} \NormalTok{arr_delay)) +}\StringTok{ }
+\KeywordTok{ggplot}\NormalTok{(}\DataTypeTok{data =} \NormalTok{all_alaska_flights, }\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =} \NormalTok{dep_delay, }\DataTypeTok{y =} \NormalTok{arr_delay)) +}\StringTok{ }
 \StringTok{  }\KeywordTok{geom_jitter}\NormalTok{(}\DataTypeTok{width =} \DecValTok{30}\NormalTok{, }\DataTypeTok{height =} \DecValTok{30}\NormalTok{)}
 \end{Highlighting}
 \end{Shaded}
@@ -1715,11 +1828,11 @@ \subsection{Over-Plotting}\label{over-plotting}
 \textbf{\emph{Learning check}}
 \end{learncheck}
 
-\textbf{(LC3.7)} Why is setting the \texttt{alpha} argument value useful
+\textbf{(LC4.7)} Why is setting the \texttt{alpha} argument value useful
 with scatter-plots? What further information does it give you that a
 regular scatter-plot cannot?
 
-\textbf{(LC3.8)} After viewing the Figure \ref{fig:alpha} above, give a
+\textbf{(LC4.8)} After viewing Figure \ref{fig:alpha} above, give a
 range of arrival delays and departure delays that occur most frequently.
 How has that region changed compared to when you observed the same plot
 without the \texttt{alpha\ =\ 0.2} set in Figure \ref{fig:noalpha}?
@@ -1748,7 +1861,8 @@ \section{5NG\#2: Line-graphs}\label{linegraphs}
 represents a variable that is connected together by each day following
 the previous day. In other words, time has a natural ordering.
 Line-graphs should be avoided when there is not a clear sequential
-ordering to the explanatory variable i.e.~the x-variable.
+ordering to the explanatory variable, i.e.~the x-variable or the
+\emph{predictor} variable.
 
 Our focus turns to the \texttt{temp} variable in this \texttt{weather}
 dataset. By
@@ -1765,7 +1879,7 @@ \section{5NG\#2: Line-graphs}\label{linegraphs}
 We can see that the \texttt{temp} variable corresponds to hourly
 temperature (in Fahrenheit) recordings at weather stations near airports
 in New York City. Instead of considering all hours in 2013 for all three
-airports in NYC, let's focus in the hourly temperature at Newark airport
+airports in NYC, let's focus on the hourly temperature at Newark airport
 (\texttt{origin} code ``EWR'') for the first 15 days in January 2013.
 The \texttt{weather} data frame in the \texttt{nycflights13} package
 contains this data, but we first need to filter it to only include those
@@ -1775,15 +1889,15 @@ \section{5NG\#2: Line-graphs}\label{linegraphs}
 \begin{Highlighting}[]
 \KeywordTok{data}\NormalTok{(weather)}
 \NormalTok{early_january_weather <-}\StringTok{ }\NormalTok{weather %>%}\StringTok{ }
-\StringTok{  }\KeywordTok{filter}\NormalTok{(origin==}\StringTok{"EWR"} \NormalTok{&}\StringTok{ }\NormalTok{month ==}\StringTok{ }\DecValTok{1} \NormalTok{&}\StringTok{ }\NormalTok{day <=}\StringTok{ }\DecValTok{15}\NormalTok{)}
+\StringTok{  }\KeywordTok{filter}\NormalTok{(origin ==}\StringTok{ "EWR"} \NormalTok{&}\StringTok{ }\NormalTok{month ==}\StringTok{ }\DecValTok{1} \NormalTok{&}\StringTok{ }\NormalTok{day <=}\StringTok{ }\DecValTok{15}\NormalTok{)}
 \end{Highlighting}
 \end{Shaded}
 
-This is very similar to the previous use of the \texttt{filter} command
-in Section \ref{scatterplots}, however we now use the \texttt{\&}
-operator. The above selects only those rows in \texttt{weather} where
-\texttt{origin=="EWR"\ **and**}month=1\texttt{**and**}day \textless{}=
-15`.
+This is similar to the previous use of the \texttt{filter} command in
+Section \ref{scatterplots}; however, we now use the \texttt{\&} operator.
+The above selects only those rows in \texttt{weather} where
+\texttt{origin\ ==\ "EWR"} \textbf{and} \texttt{month\ ==\ 1}
+\textbf{and} \texttt{day\ \textless{}=\ 15}.
 
 \begin{center}\rule{0.5\linewidth}{\linethickness}\end{center}
 
@@ -1791,25 +1905,24 @@ \section{5NG\#2: Line-graphs}\label{linegraphs}
 \textbf{\emph{Learning check}}
 \end{learncheck}
 
-\textbf{(LC3.9)} Take a look at both the \texttt{weather} and
+\textbf{(LC4.9)} Take a look at both the \texttt{weather} and
 \texttt{early\_january\_weather} data frames by running
 \texttt{View(weather)} and \texttt{View(early\_january\_weather)} in the
 console. In what respect do these data frames differ?
 
-\textbf{(LC3.10)} The weather data is recorded hourly. Why does the
+\textbf{(LC4.10)} The weather data is recorded hourly. Why does the
 \texttt{time\_hour} variable correctly identify the hour of the
-measurement and not the just the \texttt{hour} variable?
+measurement whereas the \texttt{hour} variable does not?
 
 \begin{center}\rule{0.5\linewidth}{\linethickness}\end{center}
 
-\subsection{\texorpdfstring{Line-graphs via
-\texttt{geom\_line}}{Line-graphs via geom\_line}}\label{line-graphs-via-geom_line}
+\subsection{Line-graphs via geom\_line}\label{geomline}
 
 We plot a line-graph of hourly temperature using \texttt{geom\_line()}:
 
 \begin{Shaded}
 \begin{Highlighting}[]
-\KeywordTok{ggplot}\NormalTok{(}\DataTypeTok{data=}\NormalTok{early_january_weather, }\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x=}\NormalTok{time_hour, }\DataTypeTok{y=}\NormalTok{temp)) +}
+\KeywordTok{ggplot}\NormalTok{(}\DataTypeTok{data =} \NormalTok{early_january_weather, }\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =} \NormalTok{time_hour, }\DataTypeTok{y =} \NormalTok{temp)) +}
 \StringTok{  }\KeywordTok{geom_line}\NormalTok{()}
 \end{Highlighting}
 \end{Shaded}
@@ -1837,7 +1950,7 @@ \subsection{\texorpdfstring{Line-graphs via
   \tightlist
   \item
     The \texttt{data} frame to be \texttt{early\_january\_weather} by
-    setting \texttt{data=early\_january\_weather}
+    setting \texttt{data\ =\ early\_january\_weather}
   \item
     The \texttt{aes}thetic mapping by setting
     \texttt{aes(x\ =\ time\_hour,\ y\ =\ temp)}. Specifically
@@ -1866,13 +1979,13 @@ \subsection{\texorpdfstring{Line-graphs via
 \textbf{\emph{Learning check}}
 \end{learncheck}
 
-\textbf{(LC3.11)} Why should line-graphs be avoided when there is not a
+\textbf{(LC4.11)} Why should line-graphs be avoided when there is not a
 clear ordering of the horizontal axis?
 
-\textbf{(LC3.12)} Why are line-graphs frequently used when time is the
+\textbf{(LC4.12)} Why are line-graphs frequently used when time is the
 explanatory variable?
 
-\textbf{(LC3.13)} Plot a time series of a variable other than
+\textbf{(LC4.13)} Plot a time series of a variable other than
 \texttt{temp} for Newark Airport in the first 15 days of January 2013.
 
 \begin{center}\rule{0.5\linewidth}{\linethickness}\end{center}
@@ -1895,11 +2008,11 @@ \section{5NG\#3: Histograms}\label{histograms}
 
 \begin{figure}
 
-{\centering \includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-16-1} 
+{\centering \includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-19-1} 
 
 }
 
-\caption[Strip Plot of Hourly Temperature Recordings from NYC in 2013]{Strip Plot of Hourly Temperature Recordings from NYC in 2013}\label{fig:unnamed-chunk-16}
+\caption[Strip Plot of Hourly Temperature Recordings from NYC in 2013]{Strip Plot of Hourly Temperature Recordings from NYC in 2013}\label{fig:unnamed-chunk-19}
 \end{figure}
 
 This gives us a general idea of how the values of \texttt{temp} differ.
@@ -1907,8 +2020,7 @@ \section{5NG\#3: Histograms}\label{histograms}
 Fahrenheit. The area between 40 and 60 degrees appears to have more
 points plotted than outside that range.
 
-\subsection{\texorpdfstring{Histograms via
-\texttt{geom\_histogram}}{Histograms via geom\_histogram}}\label{histograms-via-geom_histogram}
+\subsection{Histograms via geom\_histogram}\label{geomhistogram}
 
 What is commonly produced instead of this strip plot is a plot known as
 a \textbf{histogram}. The \textbf{histogram} shows how many elements of
@@ -1934,11 +2046,11 @@ \subsection{\texorpdfstring{Histograms via
 
 \begin{figure}
 
-{\centering \includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-17-1} 
+{\centering \includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-20-1} 
 
 }
 
-\caption[Histogram of Hourly Temperature Recordings from NYC in 2013]{Histogram of Hourly Temperature Recordings from NYC in 2013}\label{fig:unnamed-chunk-17}
+\caption[Histogram of Hourly Temperature Recordings from NYC in 2013]{Histogram of Hourly Temperature Recordings from NYC in 2013}\label{fig:unnamed-chunk-20}
 \end{figure}
 
 Note here:
@@ -1973,25 +2085,29 @@ \subsection{Adjusting the Bins}\label{adjustbins}
 
 First, we have the power to specify how many bins we would like to put
 the data into as an argument in the \texttt{geom\_histogram} function.
-By default, this is chosen to be 30 somewhat arbitrarily we have
+By default, this is chosen to be 30 somewhat arbitrarily; we have
 received a warning above our plot that this was done.
 
 \begin{Shaded}
 \begin{Highlighting}[]
 \KeywordTok{ggplot}\NormalTok{(}\DataTypeTok{data =} \NormalTok{weather, }\DataTypeTok{mapping =} \KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =} \NormalTok{temp)) +}
-\StringTok{  }\KeywordTok{geom_histogram}\NormalTok{(}\DataTypeTok{bins =} \DecValTok{60}\NormalTok{)}
+\StringTok{  }\KeywordTok{geom_histogram}\NormalTok{(}\DataTypeTok{bins =} \DecValTok{60}\NormalTok{, }\DataTypeTok{color =} \StringTok{"white"}\NormalTok{)}
 \end{Highlighting}
 \end{Shaded}
 
 \begin{figure}
 
-{\centering \includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-18-1} 
+{\centering \includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-21-1} 
 
 }
 
-\caption[Histogram of Hourly Temperature Recordings from NYC in 2013 - 60 Bins]{Histogram of Hourly Temperature Recordings from NYC in 2013 - 60 Bins}\label{fig:unnamed-chunk-18}
+\caption[Histogram of Hourly Temperature Recordings from NYC in 2013 - 60 Bins]{Histogram of Hourly Temperature Recordings from NYC in 2013 - 60 Bins}\label{fig:unnamed-chunk-21}
 \end{figure}
 
+Note the addition of the \texttt{color} argument. If you'd like to be
+able to more easily differentiate each of the bins, you can specify the
+color of the outline as done above.
+
 Second, instead of specifying the number of bins, we can also specify
 the width of the bins by using the \texttt{binwidth} argument in the
 \texttt{geom\_histogram} function.
@@ -1999,17 +2115,17 @@ \subsection{Adjusting the Bins}\label{adjustbins}
 \begin{Shaded}
 \begin{Highlighting}[]
 \KeywordTok{ggplot}\NormalTok{(}\DataTypeTok{data =} \NormalTok{weather, }\DataTypeTok{mapping =} \KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =} \NormalTok{temp)) +}
-\StringTok{  }\KeywordTok{geom_histogram}\NormalTok{(}\DataTypeTok{binwidth =} \DecValTok{10}\NormalTok{)}
+\StringTok{  }\KeywordTok{geom_histogram}\NormalTok{(}\DataTypeTok{binwidth =} \DecValTok{10}\NormalTok{, }\DataTypeTok{color =} \StringTok{"white"}\NormalTok{)}
 \end{Highlighting}
 \end{Shaded}
 
 \begin{figure}
 
-{\centering \includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-19-1} 
+{\centering \includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-22-1} 
 
 }
 
-\caption[Histogram of Hourly Temperature Recordings from NYC in 2013 - Binwidth = 10]{Histogram of Hourly Temperature Recordings from NYC in 2013 - Binwidth = 10}\label{fig:unnamed-chunk-19}
+\caption[Histogram of Hourly Temperature Recordings from NYC in 2013 - Binwidth = 10]{Histogram of Hourly Temperature Recordings from NYC in 2013 - Binwidth = 10}\label{fig:unnamed-chunk-22}
 \end{figure}
 
 \begin{center}\rule{0.5\linewidth}{\linethickness}\end{center}
@@ -2018,16 +2134,16 @@ \subsection{Adjusting the Bins}\label{adjustbins}
 \textbf{\emph{Learning check}}
 \end{learncheck}
 
-\textbf{(LC3.14)} What does changing the number of bins from 30 to 60
+\textbf{(LC4.14)} What does changing the number of bins from 30 to 60
 tell us about the distribution of temperatures?
 
-\textbf{(LC3.15)} Would you classify the distribution of temperatures as
+\textbf{(LC4.15)} Would you classify the distribution of temperatures as
 symmetric or skewed?
 
-\textbf{(LC3.16)} What would you guess is the ``center'' value in this
+\textbf{(LC4.16)} What would you guess is the ``center'' value in this
 distribution? Why did you make that choice?
 
-\textbf{(LC3.17)} Is this data spread out greatly from the center or is
+\textbf{(LC4.17)} Is this data spread out greatly from the center or is
 it close? Why?
 
 \begin{center}\rule{0.5\linewidth}{\linethickness}\end{center}
@@ -2046,17 +2162,19 @@ \section{Facets}\label{facets}
 default, all of the small multiples will have the same vertical axis.
 
 For example, suppose we were interested in looking at how the
-temperature histograms we saw in Chapter \ref{histograms} varied by
+temperature histograms we saw in Section \ref{histograms} varied by
 month. This is what is meant by ``the distribution of a variable over
 another variable'': \texttt{temp} is one variable and \texttt{month} is
 the other variable. In order to look at histograms of \texttt{temp} for
 each month, we add a layer \texttt{facet\_wrap(\textasciitilde{}month)}.
+You can also specify how many rows you'd like the small multiple plots
+to be in using \texttt{nrow} inside of \texttt{facet\_wrap}.
 
 \begin{Shaded}
 \begin{Highlighting}[]
 \KeywordTok{ggplot}\NormalTok{(}\DataTypeTok{data =} \NormalTok{weather, }\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =} \NormalTok{temp)) +}
-\StringTok{  }\KeywordTok{geom_histogram}\NormalTok{(}\DataTypeTok{binwidth =} \DecValTok{5}\NormalTok{) +}
-\StringTok{  }\KeywordTok{facet_wrap}\NormalTok{(~month)}
+\StringTok{  }\KeywordTok{geom_histogram}\NormalTok{(}\DataTypeTok{binwidth =} \DecValTok{5}\NormalTok{, }\DataTypeTok{color =} \StringTok{"white"}\NormalTok{) +}
+\StringTok{  }\KeywordTok{facet_wrap}\NormalTok{(~}\StringTok{ }\NormalTok{month, }\DataTypeTok{nrow =} \DecValTok{4}\NormalTok{)}
 \end{Highlighting}
 \end{Shaded}
 
@@ -2078,19 +2196,19 @@ \section{Facets}\label{facets}
 \textbf{\emph{Learning check}}
 \end{learncheck}
 
-\textbf{(LC3.18)} What other things do you notice about the faceted plot
+\textbf{(LC4.18)} What other things do you notice about the faceted plot
 above? How does a faceted plot help us see how relationships between two
 variables?
 
-\textbf{(LC3.19)} What do the numbers 1-12 correspond to in the plot
+\textbf{(LC4.19)} What do the numbers 1-12 correspond to in the plot
 above? What about 25, 50, 75, 100?
 
-\textbf{(LC3.21)} For which types of datasets would these types of
+\textbf{(LC4.20)} For which types of datasets would these types of
 faceted plots not work well in comparing relationships between
 variables? Give an example describing the variability of the variables
 and other important characteristics.
 
-\textbf{(LC3.22)} Does the \texttt{temp} variable in the
+\textbf{(LC4.21)} Does the \texttt{temp} variable in the
 \texttt{weather} data set have a lot of variability? Why do you say
 that?
 
@@ -2102,13 +2220,12 @@ \section{5NG\#4: Boxplots}\label{ng4-boxplots}
 distributions of a continuous variable split by groups of a categorical
 variable as in Chapter \ref{facets}, an alternative plot called a
 \textbf{boxplot} (also called a \textbf{side-by-side boxplot}) achieves
-the same task. The \textbf{boxplot} uses the information provided in the
-\textbf{five-number summary} referred to in Appendix \ref{appendixA}. It
-gives a way to compare this summary information across the different
-levels of a categorical variable.
+the same task and is frequently preferred. The \textbf{boxplot} uses the
+information provided in the \textbf{five-number summary} referred to in
+Appendix \ref{appendixA}. It gives a way to compare this summary
+information across the different levels of a categorical variable.
 
-\subsection{\texorpdfstring{Boxplots via
-\texttt{geom\_boxplot}}{Boxplots via geom\_boxplot}}\label{boxplots-via-geom_boxplot}
+\subsection{Boxplots via geom\_boxplot}\label{geomboxplot}
 
 Let's create a boxplot to compare the monthly temperatures as we did
 above with the faceted histograms.
@@ -2171,20 +2288,20 @@ \subsection{\texorpdfstring{Boxplots via
 \textbf{\emph{Learning check}}
 \end{learncheck}
 
-\textbf{(LC3.23)} What does the dot at the bottom of the plot for May
+\textbf{(LC4.22)} What does the dot at the bottom of the plot for May
 correspond to? Explain what might have occurred in May to produce this
 point.
 
-\textbf{(LC3.24)} Which months have the highest variability in
+\textbf{(LC4.23)} Which months have the highest variability in
 temperature? What reasons do you think this is?
 
-\textbf{(LC3.25)} We looked at the distribution of a continuous variable
+\textbf{(LC4.24)} We looked at the distribution of a continuous variable
 over a categorical variable here with this boxplot. Why can't we look at
 the distribution of one continuous variable over the distribution of
-another continuous variable? Say temperature across pressure, for
+another continuous variable? Say, temperature across pressure, for
 example?
 
-\textbf{(LC3.26)} Boxplots provide a simple way to identify outliers.
+\textbf{(LC4.25)} Boxplots provide a simple way to identify outliers.
 Why may outliers be easier to identify when looking at a boxplot instead
 of a faceted histogram?
 
@@ -2200,7 +2317,7 @@ \subsection{Summary}\label{summary-3}
 looking at the width of the box and also how far out the lines stretch
 from the box. If the lines stretch far from the box but the box has a
 small width, the variability of the values closer to the center is much
-smaller than the variable of the outer ends of the variable. Lastly,
+smaller than the variability of the outer ends of the variable. Lastly,
 outliers are even more easily identified when looking at a boxplot than
 when looking at a histogram.
 
@@ -2212,8 +2329,7 @@ \section{5NG\#5: Barplots}\label{geombar}
 will be interested in how many elements from our data fall into the
 different categories of the categorical variable.
 
-\subsection{\texorpdfstring{Barplots via
-\texttt{geom\_bar}}{Barplots via geom\_bar}}\label{barplots-via-geom_bar}
+\subsection{Barplots via geom\_bar}\label{barplots-via-geom_bar}
 
 Frequently, the best way to visualize these different counts (also known
 as \textbf{frequencies}) is via a barplot. Consider the distribution of
@@ -2237,34 +2353,118 @@ \subsection{\texorpdfstring{Barplots via
 \caption[Number of flights departing NYC in 2013 by airline]{Number of flights departing NYC in 2013 by airline}\label{fig:flightsbar}
 \end{figure}
 
-We see that United Air Lines, JetBlue Airways, and ExpressJet Airlines
-had the most flights depart New York City in 2013. To get the actual
-number of flights by each airline we can use the \texttt{count} function
-in the \texttt{dplyr} package on the \texttt{carrier} variable in
-\texttt{flights}, which we will introduce formally in Chapter
-@ref\{manip\}.
+To find out which airline names correspond to these \texttt{carrier}
+codes, we can look at the
+\texttt{airlines} data frame in the \texttt{nycflights13} package. Note
+the use of the \texttt{kable} function here in the \texttt{knitr}
+package, which produces a nicely-formatted table of the values in the
+\texttt{airlines} data frame.
 
-\begin{verbatim}
-## # A tibble: 16 × 2
-##    carrier     n
-##      <chr> <int>
-## 1       9E 18460
-## 2       AA 32729
-## 3       AS   714
-## 4       B6 54635
-## 5       DL 48110
-## 6       EV 54173
-## 7       F9   685
-## 8       FL  3260
-## 9       HA   342
-## 10      MQ 26397
-## 11      OO    32
-## 12      UA 58665
-## 13      US 20536
-## 14      VX  5162
-## 15      WN 12275
-## 16      YV   601
-\end{verbatim}
+\begin{Shaded}
+\begin{Highlighting}[]
+\KeywordTok{data}\NormalTok{(airlines)}
+\KeywordTok{kable}\NormalTok{(airlines)}
+\end{Highlighting}
+\end{Shaded}
+
+\begin{tabular}{l|l}
+\hline
+carrier & name\\
+\hline
+9E & Endeavor Air Inc.\\
+\hline
+AA & American Airlines Inc.\\
+\hline
+AS & Alaska Airlines Inc.\\
+\hline
+B6 & JetBlue Airways\\
+\hline
+DL & Delta Air Lines Inc.\\
+\hline
+EV & ExpressJet Airlines Inc.\\
+\hline
+F9 & Frontier Airlines Inc.\\
+\hline
+FL & AirTran Airways Corporation\\
+\hline
+HA & Hawaiian Airlines Inc.\\
+\hline
+MQ & Envoy Air\\
+\hline
+OO & SkyWest Airlines Inc.\\
+\hline
+UA & United Air Lines Inc.\\
+\hline
+US & US Airways Inc.\\
+\hline
+VX & Virgin America\\
+\hline
+WN & Southwest Airlines Co.\\
+\hline
+YV & Mesa Airlines Inc.\\
+\hline
+\end{tabular}
+
+Going back to our barplot, we see that United Air Lines, JetBlue
+Airways, and ExpressJet Airlines had the most flights depart New York
+City in 2013. To get the actual number of flights by each airline we can
+use the \texttt{count} function in the \texttt{dplyr} package on the
+\texttt{carrier} variable in \texttt{flights}, which we will introduce
+formally in Chapter \ref{manip}.
+
+\begin{Shaded}
+\begin{Highlighting}[]
+\NormalTok{flights_table <-}\StringTok{ }\NormalTok{flights %>%}\StringTok{ }\NormalTok{dplyr::}\KeywordTok{count}\NormalTok{(carrier)}
+\NormalTok{knitr::}\KeywordTok{kable}\NormalTok{(flights_table)}
+\end{Highlighting}
+\end{Shaded}
+
+\begin{tabular}{l|r}
+\hline
+carrier & n\\
+\hline
+9E & 18460\\
+\hline
+AA & 32729\\
+\hline
+AS & 714\\
+\hline
+B6 & 54635\\
+\hline
+DL & 48110\\
+\hline
+EV & 54173\\
+\hline
+F9 & 685\\
+\hline
+FL & 3260\\
+\hline
+HA & 342\\
+\hline
+MQ & 26397\\
+\hline
+OO & 32\\
+\hline
+UA & 58665\\
+\hline
+US & 20536\\
+\hline
+VX & 5162\\
+\hline
+WN & 12275\\
+\hline
+YV & 601\\
+\hline
+\end{tabular}
+
+\textbf{Technical note}: Refer to the use of \texttt{::} in both lines
+of code above. This is another way of ensuring the correct function is
+called. A \texttt{count} function exists in a couple of different packages and
+sometimes you'll receive strange errors when a different instance of a
+function is used. This is a great way of telling R that ``I want this
+one!''. You specify the name of the package directly before the
+\texttt{::} and then the name of the function immediately after
+\texttt{::}.
 
 \begin{center}\rule{0.5\linewidth}{\linethickness}\end{center}
 
@@ -2272,16 +2472,16 @@ \subsection{\texorpdfstring{Barplots via
 \textbf{\emph{Learning check}}
 \end{learncheck}
 
-\textbf{(LC3.27)} Why are histograms inappropriate for visualizing
+\textbf{(LC4.26)} Why are histograms inappropriate for visualizing
 categorical variables?
 
-\textbf{(LC3.28)} What is the difference between histograms and
+\textbf{(LC4.27)} What is the difference between histograms and
 barplots?
 
-\textbf{(LC3.29)} How many Envoy Air flights departed NYC in 2013?
+\textbf{(LC4.28)} How many Envoy Air flights departed NYC in 2013?
 
-\textbf{(LC3.30)} What was the seventh highest airline in terms of
-departed flights from NYC in 2013? How can we better present the table
+\textbf{(LC4.29)} What was the seventh highest airline in terms of
+departed flights from NYC in 2013? How could we better present the table
 to get this answer quickly.
 
 \begin{center}\rule{0.5\linewidth}{\linethickness}\end{center}
@@ -2336,7 +2536,7 @@ \subsection{Must avoid pie charts!}\label{must-avoid-pie-charts}
 
 }
 
-\caption[The only good pie chart]{The only good pie chart}\label{fig:unnamed-chunk-21}
+\caption[The only good pie chart]{The only good pie chart}\label{fig:unnamed-chunk-25}
 \end{figure}
 
 \begin{center}\rule{0.5\linewidth}{\linethickness}\end{center}
@@ -2345,10 +2545,10 @@ \subsection{Must avoid pie charts!}\label{must-avoid-pie-charts}
 \textbf{\emph{Learning check}}
 \end{learncheck}
 
-\textbf{(LC3.31)} Why should pie charts be avoided and replaced by
+\textbf{(LC4.30)} Why should pie charts be avoided and replaced by
 barplots?
 
-\textbf{(LC3.32)} What is your opinion as to why pie charts continue to
+\textbf{(LC4.31)} What is your opinion as to why pie charts continue to
 be used?
 
 \begin{center}\rule{0.5\linewidth}{\linethickness}\end{center}
@@ -2384,6 +2584,13 @@ \subsection{Using barplots to compare two
 here is \texttt{fill\ =\ name}. Look over what was produced from the
 plot to get an idea of what this argument gives.
 
+Note that \texttt{fill} is an \texttt{aes}thetic just like \texttt{x} is
+an \texttt{aes}thetic. We need to map the \texttt{name} variable to
+this \texttt{aes}thetic. Any time you use a variable like this, you need
+to make sure it is wrapped inside the \texttt{aes} function.
+\textbf{This is a common error!} Make note of this now so you don't fall
+into this problem later.
+
 \begin{Shaded}
 \begin{Highlighting}[]
 \KeywordTok{ggplot}\NormalTok{(}\DataTypeTok{data =} \NormalTok{flights_namedports, }\DataTypeTok{mapping =} \KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =} \NormalTok{carrier, }\DataTypeTok{fill =} \NormalTok{name)) +}
@@ -2393,11 +2600,11 @@ \subsection{Using barplots to compare two
 
 \begin{figure}
 
-{\centering \includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-23-1} 
+{\centering \includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-27-1} 
 
 }
 
-\caption[Stacked barplot comparing the number of flights by carrier and airport]{Stacked barplot comparing the number of flights by carrier and airport}\label{fig:unnamed-chunk-23}
+\caption[Stacked barplot comparing the number of flights by carrier and airport]{Stacked barplot comparing the number of flights by carrier and airport}\label{fig:unnamed-chunk-27}
 \end{figure}
 
 This plot is what is known as a \textbf{stacked barplot}. While simple
@@ -2409,10 +2616,10 @@ \subsection{Using barplots to compare two
 \textbf{\emph{Learning check}}
 \end{learncheck}
 
-\textbf{(LC3.33)} What kinds of questions are not easily answered by
+\textbf{(LC4.32)} What kinds of questions are not easily answered by
 looking at the above figure?
 
-\textbf{(LC3.34)} What can you say, if anything, about the relationship
+\textbf{(LC4.33)} What can you say, if anything, about the relationship
 between airline and airport in NYC in 2013 in regards to the number of
 departing flights?
 
@@ -2430,11 +2637,11 @@ \subsection{Using barplots to compare two
 
 \begin{figure}
 
-{\centering \includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-24-1} 
+{\centering \includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-28-1} 
 
 }
 
-\caption[Side-by-side barplot comparing the number of flights by carrier and airport]{Side-by-side barplot comparing the number of flights by carrier and airport}\label{fig:unnamed-chunk-24}
+\caption[Side-by-side barplot comparing the number of flights by carrier and airport]{Side-by-side barplot comparing the number of flights by carrier and airport}\label{fig:unnamed-chunk-28}
 \end{figure}
 
 \begin{center}\rule{0.5\linewidth}{\linethickness}\end{center}
@@ -2443,10 +2650,10 @@ \subsection{Using barplots to compare two
 \textbf{\emph{Learning check}}
 \end{learncheck}
 
-\textbf{(LC3.35)} Why might the side-by-side barplot be preferable to a
+\textbf{(LC4.34)} Why might the side-by-side barplot be preferable to a
 stacked barplot in this case?
 
-\textbf{(LC3.36)} What are the disadvantages of using a side-by-side
+\textbf{(LC4.35)} What are the disadvantages of using a side-by-side
 barplot, in general?
 
 \begin{center}\rule{0.5\linewidth}{\linethickness}\end{center}
@@ -2466,11 +2673,11 @@ \subsection{Using barplots to compare two
 
 \begin{figure}
 
-{\centering \includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-25-1} 
+{\centering \includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-29-1} 
 
 }
 
-\caption[Faceted barplot comparing the number of flights by carrier and airport]{Faceted barplot comparing the number of flights by carrier and airport}\label{fig:unnamed-chunk-25}
+\caption[Faceted barplot comparing the number of flights by carrier and airport]{Faceted barplot comparing the number of flights by carrier and airport}\label{fig:unnamed-chunk-29}
 \end{figure}
 
 Note how the \texttt{facet\_grid} function arguments are written here.
@@ -2486,10 +2693,10 @@ \subsection{Using barplots to compare two
 \textbf{\emph{Learning check}}
 \end{learncheck}
 
-\textbf{(LC3.37)} Why is the faceted barplot preferred to the
+\textbf{(LC4.36)} Why is the faceted barplot preferred to the
 side-by-side and stacked barplots in this case?
 
-\textbf{(LC3.38)} What information about the different carriers at
+\textbf{(LC4.37)} What information about the different carriers at
 different airports is more easily seen in the faceted barplot?
 
 \begin{center}\rule{0.5\linewidth}{\linethickness}\end{center}
@@ -2508,33 +2715,66 @@ \subsection{Summary}\label{summary-4}
 
 \section{Conclusion}\label{conclusion}
 
-\subsection{What's to come?}\label{whats-to-come-1}
+\subsection{Resources}\label{resources}
 
-In Chapter \ref{manip}, we'll further explore data by grouping our data,
-creating summaries based on those groupings, filtering our data to match
-conditions, selecting specific columns of our data, and other
-manipulations with our data including defining new columns/variables.
-These data manipulation procedures will go hand-in-hand with the data
-visualizations you've produced here.
+An excellent resource as you begin to create plots using the
+\texttt{ggplot2} package is a cheatsheet that RStudio has put together
+entitled ``Data Visualization with ggplot2'' available
+
+\begin{itemize}
+\tightlist
+\item
+  by clicking
+  \href{https://www.rstudio.com/wp-content/uploads/2015/12/ggplot2-cheatsheet-2.0.pdf}{here}
+  or
+\item
+  by clicking the RStudio Menu Bar -\textgreater{} Help -\textgreater{}
+  Cheatsheets -\textgreater{} ``Data Visualization with
+  \texttt{ggplot2}''
+\end{itemize}
+
+This covers more than what we've discussed in this chapter but provides
+nice visual descriptions of what each function produces.
+
+In addition, we've created a mind map to help you remember which types
+of plots are most appropriate in a given situation by identifying the
+types of variables involved in the problem. It is available
+\href{https://coggle.it/diagram/V_G2gzukTDoQ-aZt-}{here} and below.
+
+\begin{figure}
+
+{\centering \includegraphics[width=2\linewidth]{images/coggleviz} 
+
+}
+
+\caption[Mind map for Data Visualization]{Mind map for Data Visualization}\label{fig:viz-map}
+\end{figure}
 
 \subsection{Script of R code}\label{script-of-r-code}
 
 An R script file of all R code used in this chapter is available
 \href{http://ismayc.github.io/moderndiver-book/04-viz.R}{here}.
 
-\chapter{\texorpdfstring{Data Manipulation via
-\texttt{dplyr}}{Data Manipulation via dplyr}}\label{data-manipulation-via-dplyr}
+\subsection{What's to come?}\label{whats-to-come-1}
+
+In Chapter \ref{manip}, we'll further explore data by grouping our data,
+creating summaries based on those groupings, filtering our data to match
+conditions, and other manipulations with our data including defining new
+columns/variables. These data manipulation procedures will go
+hand-in-hand with the data visualizations you've produced here.
+
+\chapter{Data Manipulation via dplyr}\label{manip}
 
 Let's briefly recap where we have been so far and where we are headed.
 In Chapter \ref{tidy}, we discussed what it means for data to be tidy.
 We saw that this refers to observational units corresponding to rows and
-variables being stored in columns. The entries in the data frame
-correspond to different combinations of observational units and
-variables. In the \texttt{flights} data frame, we saw that each row
-corresponded to a different flight leaving New York City. (In other
-words, the observational unit of that tidy data frame is a flight.) The
-variables are listed as columns and for \texttt{flights} they include
-both quantitative variables like \texttt{dep\_delay} and
+variables being stored in columns (one variable for every column). The
+entries in the data frame correspond to different combinations of
+observational units and variables. In the \texttt{flights} data frame,
+we saw that each row corresponds to a different flight leaving New York
+City. In other words, the observational unit of that tidy data frame is
+a flight. The variables are listed as columns and for \texttt{flights}
+they include both quantitative variables like \texttt{dep\_delay} and
 \texttt{distance} but also categorical variables like \texttt{carrier}
 and \texttt{origin}. An entry in the table corresponds to a particular
 flight on a given day and a particular value of a given variable
@@ -2547,15 +2787,23 @@ \chapter{\texorpdfstring{Data Manipulation via
 things such as changing the color by another variable or change the size
 of our points by a fourth variable given this tidy data set.
 
-In Chapter \ref{viz}, we also introduced some ways to summarize and
-manipulate data to suit your needs. This chapter focuses more on the
-details of this by giving a variety of examples using the four main
-verbs in the \texttt{dplyr} package \citep{R-dplyr}. There are more
-advanced operations that can be done than these and you'll see some
-examples of this near the end of the chapter.
+Furthermore, in Chapter \ref{viz}, we hinted at some ways to summarize
+and manipulate data to suit your needs. This chapter expands on this by
+giving a variety of examples using what we call the \emph{Five Main
+Verbs} in the \texttt{dplyr} package \citep{R-dplyr}. There are more
+advanced operations than just these and you'll see some examples of this
+near the end of the chapter.
 
-\section*{Needed packages}\label{needed-packages-1}
-\addcontentsline{toc}{section}{Needed packages}
+While at various points we specifically mention using the
+\texttt{View()} command to inspect a particular data frame, feel free to
+do so whenever. In fact, you should get into the habit of doing this for
+\emph{any} data frame you work with.
+
+\subsection*{Needed packages}\label{needed-packages-2}
+\addcontentsline{toc}{subsection}{Needed packages}
+
+Before we proceed with this chapter, let's load all the necessary
+packages.
 
 \begin{Shaded}
 \begin{Highlighting}[]
@@ -2569,53 +2817,73 @@ \section*{Needed packages}\label{needed-packages-1}
 \section{\texorpdfstring{The pipe
 \texttt{\%\textgreater{}\%}}{The pipe \%\textgreater{}\%}}\label{the-pipe}
 
-Just as the \texttt{+} sign was used to add layers to a plot created
-using \texttt{ggplot} we will use the pipe operator
-(\texttt{\%\textgreater{}\%}) to chain together \texttt{dplyr}
-functions. We read the pipe operator as ``and then''. The
+Before we introduce the five main verbs, we first introduce the pipe
+operator (\texttt{\%\textgreater{}\%}). Just as the \texttt{+} sign was
+used to add layers to a plot created using \texttt{ggplot()}, the pipe
+operator allows us to chain together \texttt{dplyr} data manipulation
+functions. The pipe operator can be read as ``\emph{then}''. The
 \texttt{\%\textgreater{}\%} operator allows us to go from one step in
-\texttt{dplyr} to the next easily so we can \texttt{filter} our data
-frame to only focus on a few rows, and then take that filtered data set,
-and \texttt{group\_by} another variable, and then lastly
-\texttt{summarize} this grouped data to calculate the mean for each
-level of the group.
+\texttt{dplyr} to the next easily so we can, for example:
+
+\begin{itemize}
+\tightlist
+\item
+  \texttt{filter} our data frame to only focus on a few rows \emph{then}
+\item
+  \texttt{group\_by} another variable to create groups \emph{then}
+\item
+  \texttt{summarize} this grouped data to calculate the mean for each
+  level of the group.
+\end{itemize}
 
 The piping syntax will be our major focus throughout the rest of this
 book and you'll find that you'll quickly be addicted to the chaining
 with some practice. If you'd like to see more examples on using
-\texttt{dplyr}, the 4MV (in addition to some other \texttt{dplyr}
+\texttt{dplyr}, the 5MV (in addition to some other \texttt{dplyr}
 verbs), and \texttt{\%\textgreater{}\%} with the \texttt{nycflights13}
 data set, you can check out Chapter 5 of Hadley and Garrett's book
 \citep{rds2016}.
 
-\section{Four Main Verbs - The 4MV}\label{four-main-verbs---the-4mv}
+\section{Five Main Verbs - The 5MV}\label{five-main-verbs---the-5mv}
 
-The \texttt{d} in \texttt{dplyr} stands for data frames so the functions
-here work when you are working with objects of the data frame type. It's
-most important for you to focus on the four most commonly used functions
-that help us manipulate and summarize data. A description of these verbs
-follows with each subsection devoted to seeing an example of that verb
-in play (or a combination of a few verbs):
+The \texttt{d} in \texttt{dplyr} stands for data frames, so the
+functions here work when you are working with objects of the data frame
+type. It's most important for you to focus on the 5MV: the five most
+commonly used functions that help us manipulate and summarize data. A
+description of these verbs follows with each subsection devoted to
+seeing an example of that verb in play (or a combination of a few
+verbs):
 
 \begin{itemize}
 \tightlist
 \item
   \texttt{filter}: Pick rows based on conditions about their values
 \item
-  \texttt{summarize}: Create summary measures of variables (or groups of
-  observations on variables using \texttt{group\_by})
+  \texttt{summarize}: Create summary measures of variables either
+
+  \begin{itemize}
+  \tightlist
+  \item
+    over the entire data frame
+  \item
+    or over groups of observations on variables using \texttt{group\_by}
+  \end{itemize}
 \item
-  \texttt{mutate}: Make a new variable in the data frame
+  \texttt{mutate}: Create a new variable in the data frame by mutating
+  existing ones
 \item
-  \texttt{arrange}: Sort the rows based on one or more variables
+  \texttt{arrange}: Arrange/sort the rows based on one or more variables
 \end{itemize}
 
 Just as we had the 5NG (The Five Named Graphs in Chapter \ref{viz} using
-\texttt{ggplot2}), we have the 4MV here (The Four Main Verbs in
-\texttt{dplyr}):
+\texttt{ggplot2}) for data visualization, we also have the 5MV here (The
+Five Main Verbs in \texttt{dplyr}) for data manipulation. All of the
+5MVs follow the same syntax with the argument before the pipe
+\texttt{\%\textgreater{}\%} being the name of the data frame and then
+the name of the verb with other arguments specifying which criteria
+you'd like the verb to work with in parentheses.
 
-\subsection{\texorpdfstring{Filter observations using
-\texttt{filter}}{Filter observations using filter}}\label{filter-observations-using-filter}
+\subsection{5MV\#1: Filter observations using filter}\label{filter}
 
 \begin{figure}
 
@@ -2626,52 +2894,48 @@ \subsection{\texorpdfstring{Filter observations using
 \caption[Filter diagram from Data Wrangling with dplyr and tidyr cheatsheet]{Filter diagram from Data Wrangling with dplyr and tidyr cheatsheet}\label{fig:filter}
 \end{figure}
 
-All of the 4MVs follow the same syntax with the argument before the pipe
-being the name of the data frame and then the name of the verb with
-other arguments specifying which criteria you'd like the verb to work
-with in parantheses.
-
 The \texttt{filter} function here works much like the ``Filter'' option
-in Microsoft Excel. It allows you to specify criteria about values of a
+in Microsoft Excel; it allows you to specify criteria about values of a
 variable in your data set and then chooses only those rows that match
 that criteria. We begin by focusing only on flights from New York City
 to Portland, Oregon. The \texttt{dest} code (or airport code) for
-Portland, Oregon is \texttt{"PDX"}:
+Portland, Oregon is \texttt{"PDX"}. Run the following and look at the
+resulting spreadsheet to ensure that only flights heading to Portland
+are chosen here:
 
 \begin{Shaded}
 \begin{Highlighting}[]
-\NormalTok{portland_flights <-}\StringTok{ }\NormalTok{flights %>%}\StringTok{ }\KeywordTok{filter}\NormalTok{(dest ==}\StringTok{ "PDX"}\NormalTok{)}
-\NormalTok{portland_flights}
+\NormalTok{portland_flights <-}\StringTok{ }\NormalTok{flights %>%}\StringTok{ }
+\StringTok{  }\KeywordTok{filter}\NormalTok{(dest ==}\StringTok{ "PDX"}\NormalTok{)}
+\KeywordTok{View}\NormalTok{(portland_flights)}
 \end{Highlighting}
 \end{Shaded}
 
-\begin{verbatim}
-## # A tibble: 1,354 × 19
-##     year month   day dep_time sched_dep_time dep_delay arr_time
-##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
-## 1   2013     1     1     1739           1740        -1     2051
-## 2   2013     1     1     1805           1757         8     2117
-## 3   2013     1     1     2052           2029        23     2349
-## 4   2013     1     2      804            805        -1     1039
-## 5   2013     1     2     1552           1550         2     1853
-## 6   2013     1     2     1727           1720         7     2042
-## 7   2013     1     2     1738           1740        -2     2028
-## 8   2013     1     2     2024           2029        -5     2314
-## 9   2013     1     3     1755           1745        10     2110
-## 10  2013     1     3     1814           1727        47     2108
-## # ... with 1,344 more rows, and 12 more variables:
-## #   sched_arr_time <int>, arr_delay <dbl>, carrier <chr>, flight <int>,
-## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
-## #   distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
-\end{verbatim}
+Note the following:
 
-Note the second equals sign here. You are almost guaranteed to make the
-mistake at least once of only including one equals sign. Let's see what
-happens when we make this error:
+\begin{itemize}
+\tightlist
+\item
+  The ordering of the commands:
+
+  \begin{itemize}
+  \tightlist
+  \item
+    Take the data frame \texttt{flights} \emph{then}
+  \item
+    \texttt{filter} the data frame so that only those where the
+    \texttt{dest} equals \texttt{"PDX"} are included.
+  \end{itemize}
+\item
+  The double equal sign \texttt{==}. You are almost guaranteed to make
+  the mistake at least once of only including one equals sign. Let's see
+  what happens when we make this error:
+\end{itemize}
 
 \begin{Shaded}
 \begin{Highlighting}[]
-\NormalTok{portland_flights <-}\StringTok{ }\NormalTok{flights %>%}\StringTok{ }\KeywordTok{filter}\NormalTok{(}\DataTypeTok{dest =} \StringTok{"PDX"}\NormalTok{)}
+\NormalTok{portland_flights <-}\StringTok{ }\NormalTok{flights %>%}\StringTok{ }
+\StringTok{  }\KeywordTok{filter}\NormalTok{(}\DataTypeTok{dest =} \StringTok{"PDX"}\NormalTok{)}
 \end{Highlighting}
 \end{Shaded}
 
@@ -2679,10 +2943,6 @@ \subsection{\texorpdfstring{Filter observations using
 Error: filter() takes unnamed arguments. Do you need `==`?
 \end{verbatim}
 
-You should run \texttt{View(pdx\_flights)} to glance at the data in
-spreadsheet form and ensure that only flights heading to Portland are
-chosen here.
-
 You can combine multiple criteria together using operators that make
 comparisons:
 
@@ -2717,111 +2977,54 @@ \subsection{\texorpdfstring{Filter observations using
 To see many of these in action, let's select all flights that left JFK
 airport heading to Burlington, Vermont (\texttt{"BTV"}) or Seattle,
 Washington (\texttt{"SEA"}) in the months of October, November, or
-December:
+December. Run the following:
 
 \begin{Shaded}
 \begin{Highlighting}[]
-\NormalTok{btv_sea_flights_fall <-}\StringTok{ }\NormalTok{flights %>%}\StringTok{ }\KeywordTok{filter}\NormalTok{(}
-                               \NormalTok{origin ==}\StringTok{ "JFK"}\NormalTok{, }
-                               \NormalTok{(dest ==}\StringTok{ "BTV"}\NormalTok{) |}\StringTok{ }\NormalTok{(dest ==}\StringTok{ "SEA"}\NormalTok{),}
-                               \NormalTok{month >=}\StringTok{ }\DecValTok{10}\NormalTok{)}
+\NormalTok{btv_sea_flights_fall <-}\StringTok{ }\NormalTok{flights %>%}\StringTok{ }
+\StringTok{  }\KeywordTok{filter}\NormalTok{(origin ==}\StringTok{ "JFK"}\NormalTok{, (dest ==}\StringTok{ "BTV"} \NormalTok{|}\StringTok{ }\NormalTok{dest ==}\StringTok{ "SEA"}\NormalTok{), month >=}\StringTok{ }\DecValTok{10}\NormalTok{)}
+\KeywordTok{View}\NormalTok{(btv_sea_flights_fall)}
 \end{Highlighting}
 \end{Shaded}
 
+Note how even though colloquially speaking one might say ``all flights
+heading to Burlington, Vermont \emph{and} Seattle, Washington'', in terms
+of computer operations, we really mean ``all flights heading to Burlington,
+Vermont \emph{or} Seattle, Washington'', because for a given row in the
+data, \texttt{dest} can be ``BTV'', ``SEA'', or something else,
+but not ``BTV'' and ``SEA'' at the same time.
+
 Another example uses the \texttt{!} to pick rows that \textbf{DON'T}
-match a condition. Here we are referring to excluding the Northern
-Hemisphere summer months of June, July, and August.
+match a condition. Here we are selecting rows corresponding to flights
+that didn't go to Burlington, VT or Seattle, WA.
 
 \begin{Shaded}
 \begin{Highlighting}[]
-\NormalTok{not_summer_flights <-}\StringTok{ }\NormalTok{flights %>%}\StringTok{ }\KeywordTok{filter}\NormalTok{(!}\KeywordTok{between}\NormalTok{(month, }\DecValTok{6}\NormalTok{, }\DecValTok{8}\NormalTok{))}
-\NormalTok{not_summer_flights}
+\NormalTok{not_BTV_SEA <-}\StringTok{ }\NormalTok{flights %>%}\StringTok{ }
+\StringTok{  }\KeywordTok{filter}\NormalTok{(!(dest ==}\StringTok{ "BTV"} \NormalTok{|}\StringTok{ }\NormalTok{dest ==}\StringTok{ "SEA"}\NormalTok{))}
+\KeywordTok{View}\NormalTok{(not_BTV_SEA)}
 \end{Highlighting}
 \end{Shaded}
 
-\begin{verbatim}
-## # A tibble: 249,781 × 19
-##     year month   day dep_time sched_dep_time dep_delay arr_time
-##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
-## 1   2013     1     1      517            515         2      830
-## 2   2013     1     1      533            529         4      850
-## 3   2013     1     1      542            540         2      923
-## 4   2013     1     1      544            545        -1     1004
-## 5   2013     1     1      554            600        -6      812
-## 6   2013     1     1      554            558        -4      740
-## 7   2013     1     1      555            600        -5      913
-## 8   2013     1     1      557            600        -3      709
-## 9   2013     1     1      557            600        -3      838
-## 10  2013     1     1      558            600        -2      753
-## # ... with 249,771 more rows, and 12 more variables:
-## #   sched_arr_time <int>, arr_delay <dbl>, carrier <chr>, flight <int>,
-## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
-## #   distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
-\end{verbatim}
+As a final note we point out that \texttt{filter()} should often be the
+first verb you'll apply to your data. This cleans your data set to only
+those rows you care about, or put differently, it narrows down the scope
+to just the observational units you care about.
 
-To check that we are correct here we can use the \texttt{count} function
-in the \texttt{dplyr} package on the \texttt{month} variable in our
-\texttt{not\_summer\_flights} data frame to ensure June, July, and
-August are not selected:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{not_summer_flights %>%}\StringTok{ }\KeywordTok{count}\NormalTok{(month)}
-\end{Highlighting}
-\end{Shaded}
-
-\begin{verbatim}
-## # A tibble: 9 × 2
-##   month     n
-##   <int> <int>
-## 1     1 27004
-## 2     2 24951
-## 3     3 28834
-## 4     4 28330
-## 5     5 28796
-## 6     9 27574
-## 7    10 28889
-## 8    11 27268
-## 9    12 28135
-\end{verbatim}
-
-The function \texttt{between} is a shortcut. We could also have written
-the following to get the same result:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{not_summer2 <-}\StringTok{ }\NormalTok{flights %>%}\StringTok{ }\KeywordTok{filter}\NormalTok{(month <=}\StringTok{ }\DecValTok{5} \NormalTok{|}\StringTok{ }\NormalTok{month >=}\StringTok{ }\DecValTok{9}\NormalTok{)}
-\NormalTok{not_summer2 %>%}\StringTok{ }\KeywordTok{count}\NormalTok{(month)}
-\end{Highlighting}
-\end{Shaded}
-
-\begin{verbatim}
-## # A tibble: 9 × 2
-##   month     n
-##   <int> <int>
-## 1     1 27004
-## 2     2 24951
-## 3     3 28834
-## 4     4 28330
-## 5     5 28796
-## 6     9 27574
-## 7    10 28889
-## 8    11 27268
-## 9    12 28135
-\end{verbatim}
+\begin{center}\rule{0.5\linewidth}{\linethickness}\end{center}
 
 \begin{learncheck}
 \textbf{\emph{Learning check}}
 \end{learncheck}
 
 \textbf{(LC5.1)} What's another way using \texttt{!} we could filter
-only the rows that are not summer months (June, July, or August) in the
-\texttt{flights} data frame?
+only the rows for flights going to neither Burlington, VT nor Seattle, WA in
+the \texttt{flights} data frame? Test this out using the code above.
 
 \begin{center}\rule{0.5\linewidth}{\linethickness}\end{center}
 
-\subsection{\texorpdfstring{Summarize variables using
-\texttt{summarize}}{Summarize variables using summarize}}\label{summarize-variables-using-summarize}
+\subsection{5MV\#2: Summarize variables using
+summarize}\label{mv2-summarize-variables-using-summarize}
 
 \begin{figure}
 
@@ -2848,7 +3051,9 @@ \subsection{\texorpdfstring{Summarize variables using
 
 \begin{Shaded}
 \begin{Highlighting}[]
-\NormalTok{weather %>%}\StringTok{ }\KeywordTok{summarize}\NormalTok{(}\DataTypeTok{mean =} \KeywordTok{mean}\NormalTok{(temp), }\DataTypeTok{std_dev =} \KeywordTok{sd}\NormalTok{(temp))}
+\NormalTok{summary_temp <-}\StringTok{ }\NormalTok{weather %>%}\StringTok{ }
+\StringTok{  }\KeywordTok{summarize}\NormalTok{(}\DataTypeTok{mean =} \KeywordTok{mean}\NormalTok{(temp), }\DataTypeTok{std_dev =} \KeywordTok{sd}\NormalTok{(temp))}
+\NormalTok{summary_temp}
 \end{Highlighting}
 \end{Shaded}
 
@@ -2859,16 +3064,21 @@ \subsection{\texorpdfstring{Summarize variables using
 ## 1    NA      NA
 \end{verbatim}
 
-What happened here? The mean and the standard deviation temperatures are
-missing? Remember that by default the \texttt{mean} and \texttt{sd}
-functions do not ignore missing values. We need to specify \texttt{TRUE}
-for the \texttt{na.rm} parameter:
+We've created a small data frame here called \texttt{summary\_temp} that
+includes both the \texttt{mean} and the \texttt{std\_dev} of the
+\texttt{temp} variable in \texttt{weather}. Notice as shown in Figures
+\ref{fig:sum1} and \ref{fig:sum2}, the data frame \texttt{weather} went
+from many rows to a single row of just the summary values in the data
+frame \texttt{summary\_temp}. But why are the mean and standard
+deviation missing, i.e. \texttt{NA}? Remember that by default the
+\texttt{mean} and \texttt{sd} functions do not ignore missing values. We
+need to specify the argument \texttt{na.rm=TRUE} (\texttt{rm} is short
+for ``remove''):
 
 \begin{Shaded}
 \begin{Highlighting}[]
 \NormalTok{summary_temp <-}\StringTok{ }\NormalTok{weather %>%}\StringTok{ }
-\StringTok{  }\KeywordTok{summarize}\NormalTok{(}\DataTypeTok{mean =} \KeywordTok{mean}\NormalTok{(temp, }\DataTypeTok{na.rm =} \OtherTok{TRUE}\NormalTok{),}
-          \DataTypeTok{std_dev =} \KeywordTok{sd}\NormalTok{(temp, }\DataTypeTok{na.rm =} \OtherTok{TRUE}\NormalTok{))}
+\StringTok{  }\KeywordTok{summarize}\NormalTok{(}\DataTypeTok{mean =} \KeywordTok{mean}\NormalTok{(temp, }\DataTypeTok{na.rm =} \OtherTok{TRUE}\NormalTok{), }\DataTypeTok{std_dev =} \KeywordTok{sd}\NormalTok{(temp, }\DataTypeTok{na.rm =} \OtherTok{TRUE}\NormalTok{))}
 \NormalTok{summary_temp}
 \end{Highlighting}
 \end{Shaded}
@@ -2880,11 +3090,8 @@ \subsection{\texorpdfstring{Summarize variables using
 ## 1 55.20351 17.78212
 \end{verbatim}
 
-We've created a small data frame here called \texttt{summary\_temp} that
-includes both the \texttt{mean} and the \texttt{std\_dev} of the
-\texttt{temp} variable in \texttt{weather}. If we'd like to access
-either of these values directly we can use the \texttt{\$} to specify a
-column in a data frame:
+If we'd like to access either of these values directly we can use the
+\texttt{\$} to specify a column in a data frame. For example:
 
 \begin{Shaded}
 \begin{Highlighting}[]
@@ -2896,20 +3103,65 @@ \subsection{\texorpdfstring{Summarize variables using
 ## [1] 55.20351
 \end{verbatim}
 
+You'll often encounter issues with missing values \texttt{NA}. In fact,
+an entire branch of the field of statistics deals with missing data.
+However, it is not good practice to include a \texttt{na.rm\ =\ TRUE} in
+your summary commands by default; you should attempt to run them without
+this argument. The idea being you should at the very least be alerted to
+the presence of missing values and consider what the impact on the
+analysis might be if you ignore these values. In other words,
+\texttt{na.rm\ =\ TRUE} should only be used when necessary.
+
+What other summary functions can we use inside the \texttt{summarize()}
+verb? Any function in R that takes a vector of values and returns just
+one. Here are just a few:
+
+\begin{itemize}
+\tightlist
+\item
+  \texttt{min()} and \texttt{max()}: the minimum and maximum values
+  respectively
+\item
+  \texttt{IQR()}: Interquartile range
+\item
+  \texttt{sum()}: the sum
+\item
+  \texttt{n()}: a count of the number of rows/observations in each
+  group. This particular summary function will make more sense in the
+  \texttt{group\_by} section.
+\end{itemize}
+
+\begin{center}\rule{0.5\linewidth}{\linethickness}\end{center}
+
+\begin{learncheck}
+\textbf{\emph{Learning check}}
+\end{learncheck}
+
+\textbf{(LC5.2)} Say a doctor is studying the effect of smoking on lung
+cancer of a large number of patients who have records measured at five
+year intervals. He notices that a large number of patients have missing
+data points because the patient has died, so he chooses to ignore these
+patients in his analysis. What is wrong with this doctor's approach?
+
+\textbf{(LC5.3)} Modify the above \texttt{summarize} function to use
+the \texttt{n()} summary function: \texttt{summarize(count=n())}. What
+does the returned value correspond to?
+
+\textbf{(LC5.4)} Why doesn't the following code work? You may want to
+run the code line by line:
+
 \begin{Shaded}
 \begin{Highlighting}[]
-\NormalTok{summary_temp$std_dev}
+\NormalTok{summary_temp <-}\StringTok{ }\NormalTok{weather %>%}\StringTok{   }
+\StringTok{  }\KeywordTok{summarize}\NormalTok{(}\DataTypeTok{mean =} \KeywordTok{mean}\NormalTok{(temp, }\DataTypeTok{na.rm =} \OtherTok{TRUE}\NormalTok{)) %>%}\StringTok{ }
+\StringTok{  }\KeywordTok{summarize}\NormalTok{(}\DataTypeTok{std_dev =} \KeywordTok{sd}\NormalTok{(temp, }\DataTypeTok{na.rm =} \OtherTok{TRUE}\NormalTok{))}
 \end{Highlighting}
 \end{Shaded}
 
-\begin{verbatim}
-## [1] 17.78212
-\end{verbatim}
+\begin{center}\rule{0.5\linewidth}{\linethickness}\end{center}
 
-It's often more useful to summarize a variable based on the groupings of
-another variable. Let's say we were interested in the mean and standard
-deviation of temperatures for each month. We believe that you will be
-amazed at just how simple this is:
+\subsection{5MV\#3: Group rows using
+group\_by}\label{mv3-group-rows-using-group_by}
 
 \begin{figure}
 
@@ -2920,13 +3172,35 @@ \subsection{\texorpdfstring{Summarize variables using
 \caption[Group by and summarize diagram from Data Wrangling with dplyr and tidyr cheatsheet]{Group by and summarize diagram from Data Wrangling with dplyr and tidyr cheatsheet}\label{fig:groupsummarize}
 \end{figure}
 
+However, it's often more useful to summarize a variable based on the
+groupings of another variable. Let's say similarly to the previous
+section, we are interested in the mean and standard deviation of
+temperatures but \emph{grouped by month}. This concept can equivalently
+be articulated as: we want the mean and standard deviation of
+temperatures
+
+\begin{enumerate}
+\def\labelenumi{\arabic{enumi}.}
+\tightlist
+\item
+  split by month.
+\item
+  sliced by month.
+\item
+  aggregated by month.
+\item
+  collapsed over month.
+\end{enumerate}
+
+We believe that you will be amazed at just how simple this is. Run the
+following code:
+
 \begin{Shaded}
 \begin{Highlighting}[]
-\NormalTok{summary_tempXmonth <-}\StringTok{ }\NormalTok{weather %>%}\StringTok{ }
+\NormalTok{summary_monthly_temp <-}\StringTok{ }\NormalTok{weather %>%}\StringTok{ }
 \StringTok{  }\KeywordTok{group_by}\NormalTok{(month) %>%}\StringTok{ }
-\StringTok{  }\KeywordTok{summarize}\NormalTok{(}\DataTypeTok{mean =} \KeywordTok{mean}\NormalTok{(temp, }\DataTypeTok{na.rm =} \OtherTok{TRUE}\NormalTok{),}
-          \DataTypeTok{std_dev =} \KeywordTok{sd}\NormalTok{(temp, }\DataTypeTok{na.rm =} \OtherTok{TRUE}\NormalTok{))}
-\NormalTok{summary_tempXmonth}
+\StringTok{  }\KeywordTok{summarize}\NormalTok{(}\DataTypeTok{mean =} \KeywordTok{mean}\NormalTok{(temp, }\DataTypeTok{na.rm =} \OtherTok{TRUE}\NormalTok{), }\DataTypeTok{std_dev =} \KeywordTok{sd}\NormalTok{(temp, }\DataTypeTok{na.rm =} \OtherTok{TRUE}\NormalTok{))}
+\NormalTok{summary_monthly_temp}
 \end{Highlighting}
 \end{Shaded}
 
@@ -2948,15 +3222,27 @@ \subsection{\texorpdfstring{Summarize variables using
 ## 12    12 38.36811  9.940822
 \end{verbatim}
 
-By simply grouping the \texttt{weather} data set by \texttt{month} first
-and then passing this new data frame into \texttt{summarize} we get a
-resulting data frame that shows the mean and standard deviation
-temperature for each month in New York City.
+This code is identical to the previous code that created
+\texttt{summary\_temp}, but there is an extra \texttt{group\_by(month)}
+spliced in between. By simply grouping the \texttt{weather} data set by
+\texttt{month} first and then passing this new data frame into
+\texttt{summarize} we get a resulting data frame that shows the mean and
+standard deviation temperature for each month in New York City. Since
+each row in \texttt{summary\_monthly\_temp} represents a summary of
+different rows in \texttt{weather}, the observational units have
+changed.
+
+It is important to note that \texttt{group\_by} doesn't actually change
+the data frame. It simply sets \emph{meta-data} (data about the data),
+specifically the group structure of the data. It is only after we apply
+the \texttt{summarize} function that the data frame actually changes. If
+we would like to remove this group structure meta-data, we can pipe a
+resulting data frame into the \texttt{ungroup()} function.
 
-Another useful function is the \texttt{n} function which gives a count
-of how many entries appeared in the groupings. Suppose we'd like to get
-a sense for how many flights departed each of the three airports in New
-York City:
+We now revisit the \texttt{n()} counting summary function we introduced
+in the previous section. For example, suppose we'd like to get a sense
+for how many flights departed each of the three airports in New York
+City:
 
 \begin{Shaded}
 \begin{Highlighting}[]
@@ -2978,28 +3264,54 @@ \subsection{\texorpdfstring{Summarize variables using
 
 We see that Newark (\texttt{"EWR"}) had the most flights departing in
 2013 followed by \texttt{"JFK"} and lastly by LaGuardia
-(\texttt{"LGA"}).
+(\texttt{"LGA"}). Note there is a subtle but important difference
+between \texttt{sum()} and \texttt{n()}. While \texttt{sum()} simply
+adds up a large set of numbers, the latter counts the number of times
+each of many different values occurs.
+
+You are not limited to grouping by one variable! Say you wanted to know
+the number of flights leaving each of the three New York City airports
+\emph{for each month}. We can also group by a second variable
+\texttt{month}: \texttt{group\_by(origin,\ month)}. Run the following:
+
+\begin{Shaded}
+\begin{Highlighting}[]
+\NormalTok{by_monthly_origin <-}\StringTok{ }\NormalTok{flights %>%}\StringTok{ }
+\StringTok{  }\KeywordTok{group_by}\NormalTok{(origin, month) %>%}\StringTok{ }
+\StringTok{  }\KeywordTok{summarize}\NormalTok{(}\DataTypeTok{count =} \KeywordTok{n}\NormalTok{())}
+\KeywordTok{View}\NormalTok{(by_monthly_origin)}
+\end{Highlighting}
+\end{Shaded}
+
+\begin{center}\rule{0.5\linewidth}{\linethickness}\end{center}
 
 \begin{learncheck}
 \textbf{\emph{Learning check}}
 \end{learncheck}
 
-\textbf{(LC5.2)} Recall from Chapter \ref{viz} when we looked at plots
+\textbf{(LC5.5)} Recall from Chapter \ref{viz} when we looked at plots
 of temperatures by months in NYC. What does the standard deviation
-column in the \texttt{summary\_tempXmonth} data frame tell us about
+column in the \texttt{summary\_monthly\_temp} data frame tell us about
 temperatures in New York City throughout the year?
 
-\textbf{(LC5.3)} What code would be required to get the mean and
+\textbf{(LC5.6)} What code would be required to get the mean and
 standard deviation temperature for each day in 2013 for NYC?
 
-\textbf{(LC5.4)} How could we identify how many flights left each of the
-three airports in each of the months of 2013?
+\textbf{(LC5.7)} Recreate \texttt{by\_monthly\_origin}, but instead of
+grouping via \texttt{group\_by(origin,\ month)}, group variables in a
+different order \texttt{group\_by(month,\ origin)}. What differs in the
+resulting data set?
+
+\textbf{(LC5.8)} How could we identify how many flights left each of the
+three airports for each \emph{carrier}?
+
+\textbf{(LC5.9)} How does the \texttt{filter} operation differ from a
+\texttt{group\_by} followed by a \texttt{summarize}?
 
 \begin{center}\rule{0.5\linewidth}{\linethickness}\end{center}
 
-\subsection{\texorpdfstring{Create new variables/change old variables
-using
-\texttt{mutate}}{Create new variables/change old variables using mutate}}\label{create-new-variableschange-old-variables-using-mutate}
+\subsection{5MV\#4: Create new variables/change old variables using
+mutate}\label{mv4-create-new-variableschange-old-variables-using-mutate}
 
 \begin{figure}
 
@@ -3023,26 +3335,36 @@ \subsection{\texorpdfstring{Create new variables/change old variables
 
 \begin{Shaded}
 \begin{Highlighting}[]
-\NormalTok{flights <-}\StringTok{ }\NormalTok{flights %>%}\StringTok{ }\KeywordTok{mutate}\NormalTok{(}\DataTypeTok{gain =} \NormalTok{arr_delay -}\StringTok{ }\NormalTok{dep_delay)}
+\NormalTok{flights <-}\StringTok{ }\NormalTok{flights %>%}\StringTok{ }
+\StringTok{  }\KeywordTok{mutate}\NormalTok{(}\DataTypeTok{gain =} \NormalTok{arr_delay -}\StringTok{ }\NormalTok{dep_delay)}
 \end{Highlighting}
 \end{Shaded}
 
-We can now look at summary measures of this \texttt{gain} variable and
-even plot it in the form of a histogram:
+Why did we overwrite \texttt{flights} instead of assigning the resulting
+data frame to a new object, like \texttt{flights\_with\_gain}? As a
+rough rule of thumb, as long as you are not losing information that you
+might need later, it's acceptable practice to overwrite data frames.
+However, if you overwrite existing variables and/or change the
+observational units, recovering the original information might prove
+difficult. In this case, it might make sense to create a new data
+object.
+
+Let's look at summary measures of this \texttt{gain} variable and even
+plot it in the form of a histogram:
 
 \begin{Shaded}
 \begin{Highlighting}[]
 \NormalTok{gain_summary <-}\StringTok{ }\NormalTok{flights %>%}\StringTok{ }
 \StringTok{  }\KeywordTok{summarize}\NormalTok{(}
-          \DataTypeTok{min =} \KeywordTok{min}\NormalTok{(gain, }\DataTypeTok{na.rm =} \OtherTok{TRUE}\NormalTok{),}
-          \DataTypeTok{q1 =} \KeywordTok{quantile}\NormalTok{(gain, }\FloatTok{0.25}\NormalTok{, }\DataTypeTok{na.rm =} \OtherTok{TRUE}\NormalTok{),}
-          \DataTypeTok{median =} \KeywordTok{quantile}\NormalTok{(gain, }\FloatTok{0.5}\NormalTok{, }\DataTypeTok{na.rm =} \OtherTok{TRUE}\NormalTok{),}
-          \DataTypeTok{q3 =} \KeywordTok{quantile}\NormalTok{(gain, }\FloatTok{0.75}\NormalTok{, }\DataTypeTok{na.rm =} \OtherTok{TRUE}\NormalTok{),}
-          \DataTypeTok{max =} \KeywordTok{max}\NormalTok{(gain, }\DataTypeTok{na.rm =} \OtherTok{TRUE}\NormalTok{),}
-          \DataTypeTok{mean =} \KeywordTok{mean}\NormalTok{(gain, }\DataTypeTok{na.rm =} \OtherTok{TRUE}\NormalTok{),}
-          \DataTypeTok{sd =} \KeywordTok{sd}\NormalTok{(gain, }\DataTypeTok{na.rm =} \OtherTok{TRUE}\NormalTok{),}
-          \DataTypeTok{missing =} \KeywordTok{sum}\NormalTok{(}\KeywordTok{is.na}\NormalTok{(gain))}
-\NormalTok{)}
+    \DataTypeTok{min =} \KeywordTok{min}\NormalTok{(gain, }\DataTypeTok{na.rm =} \OtherTok{TRUE}\NormalTok{),}
+    \DataTypeTok{q1 =} \KeywordTok{quantile}\NormalTok{(gain, }\FloatTok{0.25}\NormalTok{, }\DataTypeTok{na.rm =} \OtherTok{TRUE}\NormalTok{),}
+    \DataTypeTok{median =} \KeywordTok{quantile}\NormalTok{(gain, }\FloatTok{0.5}\NormalTok{, }\DataTypeTok{na.rm =} \OtherTok{TRUE}\NormalTok{),}
+    \DataTypeTok{q3 =} \KeywordTok{quantile}\NormalTok{(gain, }\FloatTok{0.75}\NormalTok{, }\DataTypeTok{na.rm =} \OtherTok{TRUE}\NormalTok{),}
+    \DataTypeTok{max =} \KeywordTok{max}\NormalTok{(gain, }\DataTypeTok{na.rm =} \OtherTok{TRUE}\NormalTok{),}
+    \DataTypeTok{mean =} \KeywordTok{mean}\NormalTok{(gain, }\DataTypeTok{na.rm =} \OtherTok{TRUE}\NormalTok{),}
+    \DataTypeTok{sd =} \KeywordTok{sd}\NormalTok{(gain, }\DataTypeTok{na.rm =} \OtherTok{TRUE}\NormalTok{),}
+    \DataTypeTok{missing =} \KeywordTok{sum}\NormalTok{(}\KeywordTok{is.na}\NormalTok{(gain))}
+  \NormalTok{)}
 \NormalTok{gain_summary}
 \end{Highlighting}
 \end{Shaded}
@@ -3059,7 +3381,6 @@ \subsection{\texorpdfstring{Create new variables/change old variables
 
 \begin{Shaded}
 \begin{Highlighting}[]
-\KeywordTok{library}\NormalTok{(ggplot2)}
 \KeywordTok{ggplot}\NormalTok{(}\DataTypeTok{data =} \NormalTok{flights, }\DataTypeTok{mapping =} \KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =} \NormalTok{gain)) +}
 \StringTok{  }\KeywordTok{geom_histogram}\NormalTok{(}\DataTypeTok{color =} \StringTok{"white"}\NormalTok{, }\DataTypeTok{bins =} \DecValTok{20}\NormalTok{)}
 \end{Highlighting}
@@ -3067,11 +3388,11 @@ \subsection{\texorpdfstring{Create new variables/change old variables
 
 \begin{figure}
 
-{\centering \includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-41-1} 
+{\centering \includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-45-1} 
 
 }
 
-\caption[Histogram of gain variable]{Histogram of gain variable}\label{fig:unnamed-chunk-41}
+\caption[Histogram of gain variable]{Histogram of gain variable}\label{fig:unnamed-chunk-45}
 \end{figure}
 
 We can also create multiple columns at once and even refer to columns
@@ -3080,11 +3401,12 @@ \subsection{\texorpdfstring{Create new variables/change old variables
 
 \begin{Shaded}
 \begin{Highlighting}[]
-\NormalTok{flights_plus <-}\StringTok{ }\NormalTok{flights %>%}\StringTok{ }\KeywordTok{mutate}\NormalTok{(}
-  \DataTypeTok{gain =} \NormalTok{arr_delay -}\StringTok{ }\NormalTok{dep_delay,}
-  \DataTypeTok{hours =} \NormalTok{air_time /}\StringTok{ }\DecValTok{60}\NormalTok{,}
-  \DataTypeTok{gain_per_hour =} \NormalTok{gain /}\StringTok{ }\NormalTok{hours}
-\NormalTok{)}
+\NormalTok{flights <-}\StringTok{ }\NormalTok{flights %>%}\StringTok{ }
+\StringTok{  }\KeywordTok{mutate}\NormalTok{(}
+    \DataTypeTok{gain =} \NormalTok{arr_delay -}\StringTok{ }\NormalTok{dep_delay,}
+    \DataTypeTok{hours =} \NormalTok{air_time /}\StringTok{ }\DecValTok{60}\NormalTok{,}
+    \DataTypeTok{gain_per_hour =} \NormalTok{gain /}\StringTok{ }\NormalTok{hours}
+  \NormalTok{)}
 \end{Highlighting}
 \end{Shaded}
 
@@ -3094,33 +3416,32 @@ \subsection{\texorpdfstring{Create new variables/change old variables
 \textbf{\emph{Learning check}}
 \end{learncheck}
 
-\textbf{(LC5.5)} What do positive values of the \texttt{gain} variable
-in \texttt{flights\_plus} correspond to? What about negative values? And
-what about a zero value?
+\textbf{(LC5.10)} What do positive values of the \texttt{gain} variable
+in \texttt{flights} correspond to? What about negative values? And what
+about a zero value?
 
-\textbf{(LC5.6)} Could we create the \texttt{dep\_delay} and
+\textbf{(LC5.11)} Could we create the \texttt{dep\_delay} and
 \texttt{arr\_delay} columns by simply subtracting \texttt{dep\_time}
 from \texttt{sched\_dep\_time} and similarly for arrivals? Try the code
 out and explain any differences between the result and what actually
 appears in \texttt{flights}.
 
-\textbf{(LC5.7)} What can we say about the distribution of
+\textbf{(LC5.12)} What can we say about the distribution of
 \texttt{gain}? Describe it in a few sentences using the plot and the
 \texttt{gain\_summary} data frame values.
 
 \begin{center}\rule{0.5\linewidth}{\linethickness}\end{center}
 
-\subsection{\texorpdfstring{Reorder the data frame using
-\texttt{arrange}}{Reorder the data frame using arrange}}\label{reorder-the-data-frame-using-arrange}
+\subsection{5MV\#5: Reorder the data frame using arrange}\label{arrange}
 
 As you may have thought about with the data frames we've worked with so
 far in the book, one of the most common things you'd like to do is sort
-the data frames by a specific column. Have you ever been asked to
-calculate a median by hand? This requires you to put the data in order
-from smallest to highest in value. The \texttt{dplyr} package has a
-function called \texttt{arrange} that we will use to sort/reorder our
-data according to the values of the specified variable. This is most
-frequently used after we have used the \texttt{group\_by} and
+the data frames by a specific variable in a column. Have you ever been
+asked to calculate a median by hand? This requires you to put the data
+in order from smallest to highest in value. The \texttt{dplyr} package
+has a function called \texttt{arrange} that we will use to sort/reorder
+our data according to the values of the specified variable. This is
+often used after we have used the \texttt{group\_by} and
 \texttt{summarize} functions as we will see.
 
 Let's suppose we were interested in determining the most frequent
@@ -3153,13 +3474,13 @@ \subsection{\texorpdfstring{Reorder the data frame using
 \end{verbatim}
 
 You'll see that by default the values of \texttt{dest} are displayed in
-alphabetical order here. Remember to use \texttt{View()} in the R
-Console to look at all the values of \texttt{freq\_dest} in spreadsheet
-format. We are interested in finding those airports that appear most:
+alphabetical order here. We are interested in finding those airports
+that appear most:
 
 \begin{Shaded}
 \begin{Highlighting}[]
-\NormalTok{freq_dest %>%}\StringTok{ }\KeywordTok{arrange}\NormalTok{(num_flights)}
+\NormalTok{freq_dest %>%}\StringTok{ }
+\StringTok{  }\KeywordTok{arrange}\NormalTok{(num_flights)}
 \end{Highlighting}
 \end{Shaded}
 
@@ -3187,7 +3508,8 @@ \subsection{\texorpdfstring{Reorder the data frame using
 
 \begin{Shaded}
 \begin{Highlighting}[]
-\NormalTok{freq_dest %>%}\StringTok{ }\KeywordTok{arrange}\NormalTok{(}\KeywordTok{desc}\NormalTok{(num_flights))}
+\NormalTok{freq_dest %>%}\StringTok{ }
+\StringTok{  }\KeywordTok{arrange}\NormalTok{(}\KeywordTok{desc}\NormalTok{(num_flights))}
 \end{Highlighting}
 \end{Shaded}
 
@@ -3208,466 +3530,337 @@ \subsection{\texorpdfstring{Reorder the data frame using
 ## # ... with 95 more rows
 \end{verbatim}
 
-\begin{center}\rule{0.5\linewidth}{\linethickness}\end{center}
+\section{Joining data frames}\label{joining-data-frames}
+
+Another common task is joining/merging two different data sets. For
+example, in the \texttt{flights} data, the variable \texttt{carrier}
+lists the carrier code for the different flights. While \texttt{"UA"}
+and \texttt{"AA"} might be somewhat easy to guess for some (United and
+American Airlines), what are ``VX'', ``HA'', and ``B6''? This
+information is provided in a separate data frame \texttt{airlines}.
 
-\section{Other verbs}\label{other-verbs}
+\begin{Shaded}
+\begin{Highlighting}[]
+\KeywordTok{View}\NormalTok{(airlines)}
+\end{Highlighting}
+\end{Shaded}
+
+We see that in \texttt{airlines}, \texttt{carrier} is the carrier code
+while \texttt{name} is the full name of the airline. Using this table,
+we can see that ``VX'', ``HA'', and ``B6'' correspond to Virgin America,
+Hawaiian Airlines, and JetBlue respectively. However, will we have to
+continually look up the carrier's name for each flight in the
+\texttt{airlines} data set? No! Instead of having to manually do this,
+we can have R automatically do this ``looking up'' for us.
 
-\subsection{\texorpdfstring{Select variables using
-\texttt{select}}{Select variables using select}}\label{select-variables-using-select}
+Note that the values in the variable \texttt{carrier} in
+\texttt{flights} match the values in the variable \texttt{carrier} in
+\texttt{airlines}. In this case, we can use the variable
+\texttt{carrier} as a \emph{key variable} to join/merge/match the two
+data frames by. Hadley and Garrett \citep{rds2016} created the following
+diagram to help us understand how the different data sets are linked:
 
 \begin{figure}
 
-{\centering \includegraphics[width=\textwidth]{images/select} 
+{\centering \includegraphics[width=\textwidth]{images/relational-nycflights} 
 
 }
 
-\caption[Select diagram from Data Wrangling with dplyr and tidyr cheatsheet]{Select diagram from Data Wrangling with dplyr and tidyr cheatsheet}\label{fig:selectfig}
+\caption[Data relationships in nycflights13 from R for Data Science]{Data relationships in nycflights13 from R for Data Science}\label{fig:reldiagram}
 \end{figure}
 
-We've seen that the \texttt{flights} data frame in the
-\texttt{nycflights13} package contains many different variables (19 in
-fact). You can identify this by running the \texttt{dim} function or the
-\texttt{ncol} function:
+\subsection{Joining by Key Variables}\label{joining-by-key-variables}
 
-\begin{Shaded}
-\begin{Highlighting}[]
-\KeywordTok{data}\NormalTok{(flights)}
-\KeywordTok{dim}\NormalTok{(flights)}
-\end{Highlighting}
-\end{Shaded}
-
-\begin{verbatim}
-## [1] 336776     19
-\end{verbatim}
+In both \texttt{flights} and \texttt{airlines}, the key variable we want
+to join/merge/match the two data frames with has the same name in both
+data sets: \texttt{carrier}. We make use of the \texttt{inner\_join()}
+function to join by the variable \texttt{carrier}.
 
 \begin{Shaded}
 \begin{Highlighting}[]
-\KeywordTok{ncol}\NormalTok{(flights)}
+\NormalTok{flights_joined <-}\StringTok{ }\NormalTok{flights %>%}\StringTok{ }
+\StringTok{  }\KeywordTok{inner_join}\NormalTok{(airlines, }\DataTypeTok{by=}\StringTok{"carrier"}\NormalTok{)}
+\KeywordTok{View}\NormalTok{(flights)}
+\KeywordTok{View}\NormalTok{(flights_joined)}
 \end{Highlighting}
 \end{Shaded}
 
-\begin{verbatim}
-## [1] 19
-\end{verbatim}
-
-One of these variables is \texttt{year}. If you remember the original
-description of the \texttt{flights} data frame (or by running
-\texttt{?flights}), you'll remember that this data correspond to flights
-in 2013 departing New York City. The \texttt{year} variable isn't really
-a variable here in that it doesn't vary\ldots{} \texttt{flights}
-actually comes from a larger data set that covers many years. We may
-want to remove the \texttt{year} variable from our data set since it
-won't be helpful for analysis in this case. To do so easily, we use the
-\texttt{select} variable:
+We observe that \texttt{flights} and \texttt{flights\_joined} are
+identical except that \texttt{flights\_joined} has an additional
+variable \texttt{name} whose values were drawn from \texttt{airlines}.
 
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{flights_small <-}\StringTok{ }\NormalTok{flights %>%}\StringTok{ }\KeywordTok{select}\NormalTok{( -year)}
-\KeywordTok{names}\NormalTok{(flights_small)}
-\end{Highlighting}
-\end{Shaded}
+A visual representation of the \texttt{inner\_join} is given below
+\citep{rds2016}:
 
-\begin{verbatim}
-##  [1] "month"          "day"            "dep_time"       "sched_dep_time"
-##  [5] "dep_delay"      "arr_time"       "sched_arr_time" "arr_delay"     
-##  [9] "carrier"        "flight"         "tailnum"        "origin"        
-## [13] "dest"           "air_time"       "distance"       "hour"          
-## [17] "minute"         "time_hour"
-\end{verbatim}
+\begin{figure}
 
-The \texttt{names} function gives a listing of all the columns in a data
-frame. We see that \texttt{year} has been removed. This was done using a
-\texttt{-} in front of the name of the column we'd like to remove.
+{\centering \includegraphics[width=\textwidth]{images/join-inner} 
 
-We could also select specific columns (instead of deselecting columns)
-by listing them out:
+}
 
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{flight_dep_times <-}\StringTok{ }\NormalTok{flights %>%}\StringTok{ }\KeywordTok{select}\NormalTok{(month, day, dep_time, sched_dep_time)}
-\NormalTok{flight_dep_times}
-\end{Highlighting}
-\end{Shaded}
+\caption[Diagram of inner join from R for Data Science]{Diagram of inner join from R for Data Science}\label{fig:ijdiagram}
+\end{figure}
 
-\begin{verbatim}
-## # A tibble: 336,776 × 4
-##    month   day dep_time sched_dep_time
-##    <int> <int>    <int>          <int>
-## 1      1     1      517            515
-## 2      1     1      533            529
-## 3      1     1      542            540
-## 4      1     1      544            545
-## 5      1     1      554            600
-## 6      1     1      554            558
-## 7      1     1      555            600
-## 8      1     1      557            600
-## 9      1     1      557            600
-## 10     1     1      558            600
-## # ... with 336,766 more rows
-\end{verbatim}
+There are more complex joins available, but in our experience the
+\texttt{inner\_join} will solve nearly all of the problems you'll face.
 
-Or we could specify a ranges of columns:
+\subsection{Joining by Key Variables with Different
+Names}\label{joining-by-key-variables-with-different-names}
 
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{flight_arr_times <-}\StringTok{ }\NormalTok{flights %>%}\StringTok{ }\KeywordTok{select}\NormalTok{(month:day, arr_time:sched_arr_time)}
-\NormalTok{flight_arr_times}
-\end{Highlighting}
-\end{Shaded}
+Say instead, you are interested in all the destinations of flights from
+NYC in 2013 and ask yourself:
 
-\begin{verbatim}
-## # A tibble: 336,776 × 4
-##    month   day arr_time sched_arr_time
-##    <int> <int>    <int>          <int>
-## 1      1     1      830            819
-## 2      1     1      850            830
-## 3      1     1      923            850
-## 4      1     1     1004           1022
-## 5      1     1      812            837
-## 6      1     1      740            728
-## 7      1     1      913            854
-## 8      1     1      709            723
-## 9      1     1      838            846
-## 10     1     1      753            745
-## # ... with 336,766 more rows
-\end{verbatim}
+\begin{itemize}
+\tightlist
+\item
+  ``What cities are these airports in?''
+\item
+  ``Is \texttt{"ORD"} Orlando?''
+\item
+  ``Where is \texttt{"FLL"}?''
+\end{itemize}
 
-The \texttt{select} function can also be used to reorder columns in
-combination with the \texttt{everything} helper function. Let's suppose
-we'd like the \texttt{hour}, \texttt{minute}, and \texttt{time\_hour}
-variables, which appear at the end of the \texttt{flights} data set, to
-actually appear immediately after the \texttt{day} variable:
+The \texttt{airports} data frame contains airport codes:
 
 \begin{Shaded}
 \begin{Highlighting}[]
-\NormalTok{flights_reorder <-}\StringTok{ }\NormalTok{flights %>%}\StringTok{ }\KeywordTok{select}\NormalTok{(month:day, hour:time_hour, }\KeywordTok{everything}\NormalTok{())}
-\KeywordTok{names}\NormalTok{(flights_reorder)}
+\KeywordTok{View}\NormalTok{(airports)}
 \end{Highlighting}
 \end{Shaded}
 
-\begin{verbatim}
-##  [1] "month"          "day"            "hour"           "minute"        
-##  [5] "time_hour"      "year"           "dep_time"       "sched_dep_time"
-##  [9] "dep_delay"      "arr_time"       "sched_arr_time" "arr_delay"     
-## [13] "carrier"        "flight"         "tailnum"        "origin"        
-## [17] "dest"           "air_time"       "distance"
-\end{verbatim}
+However, looking at both the \texttt{airports} and \texttt{flights} and
+the visual representation of the relations between the data frames in
+Figure \ref{fig:reldiagram}, we see that in:
 
-Lastly, the helper functions \texttt{starts\_with}, \texttt{ends\_with},
-and \texttt{contains} can be used to choose column names that match
-those conditions:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{flights_begin_a <-}\StringTok{ }\NormalTok{flights %>%}\StringTok{ }\KeywordTok{select}\NormalTok{(}\KeywordTok{starts_with}\NormalTok{(}\StringTok{"a"}\NormalTok{))}
-\NormalTok{flights_begin_a}
-\end{Highlighting}
-\end{Shaded}
+\begin{itemize}
+\tightlist
+\item
+  \texttt{airports} the airport code is in the variable \texttt{faa}
+\item
+  \texttt{flights} the airport codes are in the variables \texttt{origin} and \texttt{dest}
+\end{itemize}
 
-\begin{verbatim}
-## # A tibble: 336,776 × 3
-##    arr_time arr_delay air_time
-##       <int>     <dbl>    <dbl>
-## 1       830        11      227
-## 2       850        20      227
-## 3       923        33      160
-## 4      1004       -18      183
-## 5       812       -25      116
-## 6       740        12      150
-## 7       913        19      158
-## 8       709       -14       53
-## 9       838        -8      140
-## 10      753         8      138
-## # ... with 336,766 more rows
-\end{verbatim}
+So to join these two data sets, our \texttt{inner\_join} operation
+involves a \texttt{by} argument that accounts for the different names:
 
 \begin{Shaded}
 \begin{Highlighting}[]
-\NormalTok{flights_delays <-}\StringTok{ }\NormalTok{flights %>%}\StringTok{ }\KeywordTok{select}\NormalTok{(}\KeywordTok{ends_with}\NormalTok{(}\StringTok{"delay"}\NormalTok{))}
-\NormalTok{flights_delays}
+\NormalTok{flights %>%}\StringTok{ }
+\StringTok{  }\KeywordTok{inner_join}\NormalTok{(airports, }\DataTypeTok{by =} \KeywordTok{c}\NormalTok{(}\StringTok{"dest"} \NormalTok{=}\StringTok{ "faa"}\NormalTok{))}
 \end{Highlighting}
 \end{Shaded}
 
-\begin{verbatim}
-## # A tibble: 336,776 × 2
-##    dep_delay arr_delay
-##        <dbl>     <dbl>
-## 1          2        11
-## 2          4        20
-## 3          2        33
-## 4         -1       -18
-## 5         -6       -25
-## 6         -4        12
-## 7         -5        19
-## 8         -3       -14
-## 9         -3        -8
-## 10        -2         8
-## # ... with 336,766 more rows
-\end{verbatim}
+Let's construct the sequence of commands that computes the number of
+flights from NYC to each destination but also includes information about
+each destination airport:
 
 \begin{Shaded}
 \begin{Highlighting}[]
-\NormalTok{flights_time <-}\StringTok{ }\NormalTok{flights %>%}\StringTok{ }\KeywordTok{select}\NormalTok{(}\KeywordTok{contains}\NormalTok{(}\StringTok{"time"}\NormalTok{))}
-\NormalTok{flights_time}
+\NormalTok{named_dests <-}\StringTok{ }\NormalTok{flights %>%}
+\StringTok{  }\KeywordTok{group_by}\NormalTok{(dest) %>%}
+\StringTok{  }\KeywordTok{summarize}\NormalTok{(}\DataTypeTok{num_flights =} \KeywordTok{n}\NormalTok{()) %>%}
+\StringTok{  }\KeywordTok{arrange}\NormalTok{(}\KeywordTok{desc}\NormalTok{(num_flights)) %>%}
+\StringTok{  }\KeywordTok{inner_join}\NormalTok{(airports, }\DataTypeTok{by =} \KeywordTok{c}\NormalTok{(}\StringTok{"dest"} \NormalTok{=}\StringTok{ "faa"}\NormalTok{)) %>%}
+\StringTok{  }\KeywordTok{rename}\NormalTok{(}\DataTypeTok{airport_name =} \NormalTok{name)}
+\KeywordTok{View}\NormalTok{(named_dests)}
 \end{Highlighting}
 \end{Shaded}
 
-\begin{verbatim}
-## # A tibble: 336,776 × 6
-##    dep_time sched_dep_time arr_time sched_arr_time air_time
-##       <int>          <int>    <int>          <int>    <dbl>
-## 1       517            515      830            819      227
-## 2       533            529      850            830      227
-## 3       542            540      923            850      160
-## 4       544            545     1004           1022      183
-## 5       554            600      812            837      116
-## 6       554            558      740            728      150
-## 7       555            600      913            854      158
-## 8       557            600      709            723       53
-## 9       557            600      838            846      140
-## 10      558            600      753            745      138
-## # ... with 336,766 more rows, and 1 more variables: time_hour <dttm>
-\end{verbatim}
-
-\subsection{\texorpdfstring{Rename variables using
-\texttt{rename}}{Rename variables using rename}}\label{rename-variables-using-rename}
-
-Another useful function is \texttt{rename}, which as you may suspect
-renames one column to another name. Suppose we wanted \texttt{dep\_time}
-and \texttt{arr\_time} to be \texttt{departure\_time} and
-\texttt{arrival\_time} instead in the \texttt{flights\_time} data frame:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\NormalTok{flights_time <-}\StringTok{ }\NormalTok{flights_time %>%}\StringTok{ }
-\StringTok{  }\KeywordTok{rename}\NormalTok{(}\DataTypeTok{departure_time =} \NormalTok{dep_time,}
-         \DataTypeTok{arrival_time =} \NormalTok{arr_time)}
-\KeywordTok{names}\NormalTok{(flights_time)}
-\end{Highlighting}
-\end{Shaded}
+In case you didn't know, \texttt{"ORD"} is the airport code of Chicago
+O'Hare airport and \texttt{"FLL"} is the main airport in Fort
+Lauderdale, Florida, which we can now see in our
+\texttt{named\_dests} data frame.
 
-\begin{verbatim}
-## [1] "departure_time" "sched_dep_time" "arrival_time"   "sched_arr_time"
-## [5] "air_time"       "time_hour"
-\end{verbatim}
+\begin{center}\rule{0.5\linewidth}{\linethickness}\end{center}
 
-It's easy to forget if the new name comes before or after the equals
-sign. I usually remember this as ``New Before, Old After'' or NBOA.
+\begin{learncheck}
+\textbf{\emph{Learning check}}
+\end{learncheck}
 
-You'll receive an error if you try to do it the other way:
+\textbf{(LC5.13)} Looking at Figure \ref{fig:reldiagram}, when joining
+\texttt{flights} and \texttt{weather}, or in other words matching the
+hourly weather values with each flight, why do we need to join by all of
+\texttt{year}, \texttt{month}, \texttt{day}, \texttt{hour}, and
+\texttt{origin}, and not just \texttt{hour}?
 
-\begin{verbatim}
-Error: Unknown variables: departure_time, arrival_time.
-\end{verbatim}
+\textbf{(LC5.14)} What surprises you about the top 10 destinations from
+NYC in 2013?
 
 \begin{center}\rule{0.5\linewidth}{\linethickness}\end{center}
 
-\begin{learncheck}
-\textbf{\emph{Learning check}}
-\end{learncheck}
+\section{Optional: Other verbs}\label{optional-other-verbs}
 
-\textbf{(LC5.8)} What are some ways to select all three of the
-\texttt{dest}, \texttt{air\_time}, and \texttt{distance} variables from
-\texttt{flights}? Give the code showing how to do this in at least three
-different ways.
+\subsection{Select variables using select}\label{select}
 
-\textbf{(LC5.9)} How could one use \texttt{starts\_with},
-\texttt{ends\_with}, and \texttt{contains} to select columns from the
-\texttt{flights} data frame? Provide three different examples in total:
-one for \texttt{starts\_with}, one for \texttt{ends\_with}, and one for
-\texttt{contains}.
+\begin{figure}
 
-\textbf{(LC5.10)} Why might we want to use the \texttt{select} function
-on a data frame?
+{\centering \includegraphics[width=\textwidth]{images/select} 
 
-\begin{center}\rule{0.5\linewidth}{\linethickness}\end{center}
+}
 
-\subsection{\texorpdfstring{Find the top number of values using
-\texttt{top\_n}}{Find the top number of values using top\_n}}\label{find-the-top-number-of-values-using-top_n}
+\caption[Select diagram from Data Wrangling with dplyr and tidyr cheatsheet]{Select diagram from Data Wrangling with dplyr and tidyr cheatsheet}\label{fig:selectfig}
+\end{figure}
 
-We can also use the \texttt{top\_n} function which automatically tells
-us the most frequent \texttt{num\_flights}. We specify the top 10
-airports here:
+We've seen that the \texttt{flights} data frame in the
+\texttt{nycflights13} package contains many different variables. The
+\texttt{names} function gives a listing of all the columns in a data
+frame; in our case you would run \texttt{names(flights)}. You can also
+identify these variables by running the \texttt{glimpse} function in the
+\texttt{dplyr} package:
 
 \begin{Shaded}
 \begin{Highlighting}[]
-\NormalTok{freq_dest %>%}\StringTok{ }\KeywordTok{top_n}\NormalTok{(}\DataTypeTok{n =} \DecValTok{10}\NormalTok{, }\DataTypeTok{wt =} \NormalTok{num_flights)}
+\KeywordTok{glimpse}\NormalTok{(flights)}
 \end{Highlighting}
 \end{Shaded}
 
-\begin{verbatim}
-## # A tibble: 10 × 2
-##     dest num_flights
-##    <chr>       <int>
-## 1    ATL       17215
-## 2    BOS       15508
-## 3    CLT       14064
-## 4    DCA        9705
-## 5    FLL       12055
-## 6    LAX       16174
-## 7    MCO       14082
-## 8    MIA       11728
-## 9    ORD       17283
-## 10   SFO       13331
-\end{verbatim}
-
-We'll still need to arrange this by \texttt{num\_flights} though:
+However, suppose you only want to consider two of these variables, say
+\texttt{carrier} and \texttt{flight}. You can \texttt{select} these:
 
 \begin{Shaded}
 \begin{Highlighting}[]
-\NormalTok{freq_dest %>%}\StringTok{ }\KeywordTok{top_n}\NormalTok{(}\DataTypeTok{n =} \DecValTok{10}\NormalTok{, }\DataTypeTok{wt =} \NormalTok{num_flights) %>%}\StringTok{ }
-\StringTok{  }\KeywordTok{arrange}\NormalTok{(}\KeywordTok{desc}\NormalTok{(num_flights))}
+\NormalTok{flights %>%}\StringTok{ }
+\StringTok{  }\KeywordTok{select}\NormalTok{(carrier, flight)}
 \end{Highlighting}
 \end{Shaded}
 
-\begin{verbatim}
-## # A tibble: 10 × 2
-##     dest num_flights
-##    <chr>       <int>
-## 1    ORD       17283
-## 2    ATL       17215
-## 3    LAX       16174
-## 4    BOS       15508
-## 5    MCO       14082
-## 6    CLT       14064
-## 7    SFO       13331
-## 8    FLL       12055
-## 9    MIA       11728
-## 10   DCA        9705
-\end{verbatim}
-
-\textbf{Note:} Remember that I didn't pull the \texttt{n} and
-\texttt{wt} arguments out of thin air. They can be found by using the
-\texttt{?} function on \texttt{top\_n}.
-
-We can go one stop further and tie together the group\_by and summarize
-functions we used to find the most frequent flights:
+Another one of these variables is \texttt{year}. If you remember the
+original description of the \texttt{flights} data frame (or by running
+\texttt{?flights}), you'll remember that this data correspond to flights
+in 2013 departing New York City. The \texttt{year} variable isn't really
+a variable here in that it doesn't vary\ldots{} \texttt{flights}
+actually comes from a larger data set that covers many years. We may
+want to remove the \texttt{year} variable from our data set since it
+won't be helpful for analysis in this case. We can deselect
+\texttt{year} by using the \texttt{-} sign:
 
 \begin{Shaded}
 \begin{Highlighting}[]
-\NormalTok{ten_freq_dests <-}\StringTok{ }\NormalTok{flights %>%}
-\StringTok{  }\KeywordTok{group_by}\NormalTok{(dest) %>%}
-\StringTok{  }\KeywordTok{summarize}\NormalTok{(}\DataTypeTok{num_flights =} \KeywordTok{n}\NormalTok{()) %>%}
-\StringTok{  }\KeywordTok{top_n}\NormalTok{(}\DataTypeTok{n =} \DecValTok{10}\NormalTok{) %>%}
-\StringTok{  }\KeywordTok{arrange}\NormalTok{(}\KeywordTok{desc}\NormalTok{(num_flights))}
+\NormalTok{flights_no_year <-}\StringTok{ }\NormalTok{flights %>%}\StringTok{ }
+\StringTok{  }\KeywordTok{select}\NormalTok{(-year)}
+\KeywordTok{names}\NormalTok{(flights_no_year)}
 \end{Highlighting}
 \end{Shaded}
 
-\begin{verbatim}
-## Selecting by num_flights
-\end{verbatim}
-
-\begin{learncheck}
-\textbf{\emph{Learning check}}
-\end{learncheck}
+Or we could specify a range of columns:
 
-\textbf{\texttt{paste0("(LC",\ chap,\ ".",\ (lc\ \textless{}-\ lc\ +\ 1),\ ")")}}
-Create a new data frame that shows the top 5 airports with the largest
-arrival delays from NYC in 2013.
+\begin{Shaded}
+\begin{Highlighting}[]
+\NormalTok{flight_arr_times <-}\StringTok{ }\NormalTok{flights %>%}\StringTok{ }
+\StringTok{  }\KeywordTok{select}\NormalTok{(month:day, arr_time:sched_arr_time)}
+\NormalTok{flight_arr_times}
+\end{Highlighting}
+\end{Shaded}
 
-\section{Joining/merging data frames}\label{joiningmerging-data-frames}
+The \texttt{select} function can also be used to reorder columns in
+combination with the \texttt{everything} helper function. Let's suppose
+we'd like the \texttt{hour}, \texttt{minute}, and \texttt{time\_hour}
+variables, which appear at the end of the \texttt{flights} data set, to
+actually appear immediately after the \texttt{day} variable:
 
-Something you may have thought to yourself as you looked at the most
-freqent destinations of flights from NYC in 2013 is
+\begin{Shaded}
+\begin{Highlighting}[]
+\NormalTok{flights_reorder <-}\StringTok{ }\NormalTok{flights %>%}\StringTok{ }
+\StringTok{  }\KeywordTok{select}\NormalTok{(month:day, hour:time_hour, }\KeywordTok{everything}\NormalTok{())}
+\KeywordTok{names}\NormalTok{(flights_reorder)}
+\end{Highlighting}
+\end{Shaded}
 
-\begin{itemize}
-\tightlist
-\item
-  ``What cities are these airports in?''
-\item
-  ``Is \texttt{"ORD"} Orlando?''
-\item
-  ``Where is \texttt{"FLL"}?
-\end{itemize}
+In this case \texttt{everything()} picks up all remaining variables.
+Lastly, the helper functions \texttt{starts\_with}, \texttt{ends\_with},
+and \texttt{contains} can be used to choose column names that match
+those conditions:
 
-The \texttt{nycflights13} data package contains multiple data frames.
-Instead of having to manually look up different values of airport names
-corresponding to airport codes like \texttt{ORD}, we can have R
-automatically do this ``looking up'' for us. To do so, we'll need to
-tell R how to match one data frame to another data frame. Let's first
-check out the \texttt{airports} data frame inside of R:
+\begin{Shaded}
+\begin{Highlighting}[]
+\NormalTok{flights_begin_a <-}\StringTok{ }\NormalTok{flights %>%}\StringTok{ }
+\StringTok{  }\KeywordTok{select}\NormalTok{(}\KeywordTok{starts_with}\NormalTok{(}\StringTok{"a"}\NormalTok{))}
+\NormalTok{flights_begin_a}
+\end{Highlighting}
+\end{Shaded}
 
 \begin{Shaded}
 \begin{Highlighting}[]
-\KeywordTok{View}\NormalTok{(airports)}
+\NormalTok{flights_delays <-}\StringTok{ }\NormalTok{flights %>%}\StringTok{ }
+\StringTok{  }\KeywordTok{select}\NormalTok{(}\KeywordTok{ends_with}\NormalTok{(}\StringTok{"delay"}\NormalTok{))}
+\NormalTok{flights_delays}
 \end{Highlighting}
 \end{Shaded}
 
-The first column \texttt{faa} corresponds to the airport codes that we
-saw in \texttt{dest} in our \texttt{flights} and subsequent
-\texttt{ten\_freq\_dests} data sets. Hadley and Garrett \citep{rds2016}
-created the following diagram to help us understand how the different
-data sets are linked:
-
-\begin{figure}
-
-{\centering \includegraphics[width=\textwidth]{images/relational-nycflights} 
-
-}
-
-\caption[Data relationships in nycflights13 from R for Data Science]{Data relationships in nycflights13 from R for Data Science}\label{fig:reldiagram}
-\end{figure}
-
-We see from \texttt{View(airports)} that \texttt{airports} contains a
-lot of other information about 1458. We are only really interested here
-in the \texttt{faa} and \texttt{name} columns. Let's use the
-\texttt{select} function to only use those variables:
-
 \begin{Shaded}
 \begin{Highlighting}[]
-\NormalTok{airports_small <-}\StringTok{ }\NormalTok{airports %>%}\StringTok{ }\KeywordTok{select}\NormalTok{(faa, name)}
+\NormalTok{flights_time <-}\StringTok{ }\NormalTok{flights %>%}\StringTok{ }
+\StringTok{  }\KeywordTok{select}\NormalTok{(}\KeywordTok{contains}\NormalTok{(}\StringTok{"time"}\NormalTok{))}
+\NormalTok{flights_time}
 \end{Highlighting}
 \end{Shaded}
 
-So if we identify the names of the airports we can use the
-\texttt{inner\_join} function to bring two different data frames
-together. Note that we will also rename the subsequent column
-\texttt{name} as \texttt{airport\_name}:
+\subsection{Rename variables using rename}\label{rename}
+
+Another useful function is \texttt{rename}, which as you may suspect
+renames one column to another name. Suppose we wanted \texttt{dep\_time}
+and \texttt{arr\_time} to be \texttt{departure\_time} and
+\texttt{arrival\_time} instead in the \texttt{flights\_time} data frame:
 
 \begin{Shaded}
 \begin{Highlighting}[]
-\NormalTok{named_freq_dests <-}\StringTok{ }\NormalTok{ten_freq_dests %>%}
-\StringTok{  }\KeywordTok{inner_join}\NormalTok{(airports_small, }\DataTypeTok{by =} \KeywordTok{c}\NormalTok{(}\StringTok{"dest"} \NormalTok{=}\StringTok{ "faa"}\NormalTok{)) %>%}
-\StringTok{  }\KeywordTok{rename}\NormalTok{(}\DataTypeTok{airport_name =} \NormalTok{name)}
-\NormalTok{named_freq_dests}
+\NormalTok{flights_time_new <-}\StringTok{ }\NormalTok{flights %>%}\StringTok{ }
+\StringTok{  }\KeywordTok{select}\NormalTok{(}\KeywordTok{contains}\NormalTok{(}\StringTok{"time"}\NormalTok{)) %>%}\StringTok{ }
+\StringTok{  }\KeywordTok{rename}\NormalTok{(}\DataTypeTok{departure_time =} \NormalTok{dep_time,}
+         \DataTypeTok{arrival_time =} \NormalTok{arr_time)}
+\KeywordTok{names}\NormalTok{(flights_time_new)}
 \end{Highlighting}
 \end{Shaded}
 
+It's easy to forget if the new name comes before or after the equals
+sign. I usually remember this as ``New Before, Old After'' or NBOA.
+You'll receive an error if you try to do it the other way:
+
 \begin{verbatim}
-## # A tibble: 10 × 3
-##     dest num_flights                       airport_name
-##    <chr>       <int>                              <chr>
-## 1    ORD       17283                 Chicago Ohare Intl
-## 2    ATL       17215    Hartsfield Jackson Atlanta Intl
-## 3    LAX       16174                   Los Angeles Intl
-## 4    BOS       15508 General Edward Lawrence Logan Intl
-## 5    MCO       14082                       Orlando Intl
-## 6    CLT       14064             Charlotte Douglas Intl
-## 7    SFO       13331                 San Francisco Intl
-## 8    FLL       12055     Fort Lauderdale Hollywood Intl
-## 9    MIA       11728                         Miami Intl
-## 10   DCA        9705      Ronald Reagan Washington Natl
+Error: Unknown variables: departure_time, arrival_time.
 \end{verbatim}
 
-In case you didn't know, \texttt{"ORD"} is the airport code of Chicago
-O'Hare airport and \texttt{"FLL"} is the main airport in Fort
-Lauderdale, Florida, which we can now see in our
-\texttt{named\_freq\_dests} data frame.
+\subsection{Find the top number of values using
+top\_n}\label{find-the-top-number-of-values-using-top_n}
 
-A visual representation of the \texttt{inner\_join} is given below
-\citep{rds2016}:
+We can also use the \texttt{top\_n} function which automatically tells
+us the most frequent \texttt{num\_flights}. We specify the top 10
+airports here:
 
-\begin{figure}
+\begin{Shaded}
+\begin{Highlighting}[]
+\NormalTok{named_dests %>%}\StringTok{ }
+\StringTok{  }\KeywordTok{top_n}\NormalTok{(}\DataTypeTok{n =} \DecValTok{10}\NormalTok{, }\DataTypeTok{wt =} \NormalTok{num_flights)}
+\end{Highlighting}
+\end{Shaded}
 
-{\centering \includegraphics[width=\textwidth]{images/join-inner} 
+We'll still need to arrange this by \texttt{num\_flights} though:
 
-}
+\begin{Shaded}
+\begin{Highlighting}[]
+\NormalTok{named_dests  %>%}\StringTok{ }
+\StringTok{  }\KeywordTok{top_n}\NormalTok{(}\DataTypeTok{n =} \DecValTok{10}\NormalTok{, }\DataTypeTok{wt =} \NormalTok{num_flights) %>%}\StringTok{ }
+\StringTok{  }\KeywordTok{arrange}\NormalTok{(}\KeywordTok{desc}\NormalTok{(num_flights))}
+\end{Highlighting}
+\end{Shaded}
 
-\caption[Diagram of inner join from R for Data Science]{Diagram of inner join from R for Data Science}\label{fig:ijdiagram}
-\end{figure}
+\textbf{Note:} Remember that I didn't pull the \texttt{n} and
+\texttt{wt} arguments out of thin air. They can be found by using the
+\texttt{?} function on \texttt{top\_n}.
 
-There are more complex joins available, but the \texttt{inner\_join}
-will solve nearly all of the problems you'll face in our experience.
+We can go one step further and tie together the \texttt{group\_by} and
+\texttt{summarize} functions we used to find the most frequent flights:
+
+\begin{Shaded}
+\begin{Highlighting}[]
+\NormalTok{ten_freq_dests <-}\StringTok{ }\NormalTok{flights %>%}
+\StringTok{  }\KeywordTok{group_by}\NormalTok{(dest) %>%}
+\StringTok{  }\KeywordTok{summarize}\NormalTok{(}\DataTypeTok{num_flights =} \KeywordTok{n}\NormalTok{()) %>%}
+\StringTok{  }\KeywordTok{top_n}\NormalTok{(}\DataTypeTok{n =} \DecValTok{10}\NormalTok{) %>%}
+\StringTok{  }\KeywordTok{arrange}\NormalTok{(}\KeywordTok{desc}\NormalTok{(num_flights))}
+\KeywordTok{View}\NormalTok{(ten_freq_dests)}
+\end{Highlighting}
+\end{Shaded}
 
 \begin{center}\rule{0.5\linewidth}{\linethickness}\end{center}
 
@@ -3675,32 +3868,57 @@ \section{Joining/merging data frames}\label{joiningmerging-data-frames}
 \textbf{\emph{Learning check}}
 \end{learncheck}
 
-\textbf{(LC5.11)} What happens when you try to \texttt{inner\_join} the
-\texttt{ten\_freq\_dests} data frame with \texttt{airports} instead of
-\texttt{airports\_small}? How might one use this result to answer
-further questions about the top 10 destinations?
+\textbf{(LC5.15)} What are some ways to select all three of the
+\texttt{dest}, \texttt{air\_time}, and \texttt{distance} variables from
+\texttt{flights}? Give the code showing how to do this in at least three
+different ways.
+
+\textbf{(LC5.16)} How could one use \texttt{starts\_with},
+\texttt{ends\_with}, and \texttt{contains} to select columns from the
+\texttt{flights} data frame? Provide three different examples in total:
+one for \texttt{starts\_with}, one for \texttt{ends\_with}, and one for
+\texttt{contains}.
+
+\textbf{(LC5.17)} Why might we want to use the \texttt{select} function
+on a data frame?
 
-\textbf{(LC5.12)} What surprises you about the top 10 destinations from
-NYC in 2013?
+\textbf{(LC5.18)}
+Create a new data frame that shows the top 5 airports with the largest
+arrival delays from NYC in 2013.
 
 \begin{center}\rule{0.5\linewidth}{\linethickness}\end{center}
 
+\section{Conclusion}\label{conclusion-1}
+
+\subsection{Resources}\label{resources-1}
+
 As we saw with the RStudio cheatsheet on
 \href{https://www.rstudio.com/wp-content/uploads/2015/12/ggplot2-cheatsheet-2.0.pdf}{data
 visualization}, RStudio has also created a cheatsheet for data
 manipulation entitled ``Data Wrangling with dplyr and tidyr'' available
-\href{https://www.rstudio.com/wp-content/uploads/2015/02/data-wrangling-cheatsheet.pdf}{here}.
+
+\begin{itemize}
+\tightlist
+\item
+  By clicking
+  \href{https://www.rstudio.com/wp-content/uploads/2015/02/data-wrangling-cheatsheet.pdf}{here}
+\item
+  Or by clicking the RStudio Menu Bar -\textgreater{} Help
+  -\textgreater{} Cheatsheets -\textgreater{} ``Data Manipulation with
+  \texttt{dplyr}, \texttt{tidyr}''
+\end{itemize}
+
 We will focus only on the \texttt{dplyr} functions in this book, but you
 are encouraged to also explore \texttt{tidyr} if you are presented with
 data that is not in the tidy format that we have specified as the
 preferred option for our purposes.
 
-\section{Script of R code}\label{script-of-r-code-1}
+\subsection{Script of R code}\label{script-of-r-code-1}
 
 An R script file of all R code used in this chapter is available
 \href{http://ismayc.github.io/moderndiver-book/05-manip.R}{here}.
 
-\section{What's to come?}\label{whats-to-come-2}
+\subsection{What's to come?}\label{whats-to-come-2}
 
 This concludes the \textbf{Data Exploration} unit of this book. You
 should be pretty proficient in both plotting variables (or multiple
@@ -3719,16 +3937,15 @@ \section{What's to come?}\label{whats-to-come-2}
 
 \part{Inference}\label{part-inference}
 
-\chapter{\texorpdfstring{Simulating Randomness via
-\texttt{mosaic}}{Simulating Randomness via mosaic}}\label{simulating-randomness-via-mosaic}
+\chapter{Simulating Randomness via mosaic}\label{sim}
 
 In this chapter we will introduce new concepts that will serve as the
 basis for the remainder of the text: \textbf{sampling} and
 \textbf{resampling}. We will see that the tools that you learned in the
-Data Exploration part of this book (tidy data, data manipulation, and
-data visualization) will also play an important role here. As mentioned
-before, the concepts all build into a culmination allowing you to create
-better stories with data.
+Data Exploration part of this book (tidy data, data visualization, and
+data manipulation) will also play an important role here. As mentioned
+before, the concepts throughout this text all build into a culmination
+allowing you to create better stories with data.
 
 We begin with some helpful definitions that will help us better
 understand why statistical inference exists and why it is needed. We
@@ -3745,8 +3962,8 @@ \chapter{\texorpdfstring{Simulating Randomness via
 different functions introduced in the \texttt{mosaic} package in this
 chapter.
 
-\section*{Needed packages}\label{needed-packages-2}
-\addcontentsline{toc}{section}{Needed packages}
+\subsection*{Needed packages}\label{needed-packages-3}
+\addcontentsline{toc}{subsection}{Needed packages}
 
 \begin{Shaded}
 \begin{Highlighting}[]
@@ -3802,20 +4019,6 @@ \subsection{Tasting soup}\label{tasting-soup}
 of soup from? Is there anything we should do to the soup before we
 taste? Is one taste enough?
 
-\begin{center}\rule{0.5\linewidth}{\linethickness}\end{center}
-
-\begin{learncheck}
-\textbf{\emph{Learning check}}
-\end{learncheck}
-
-\textbf{(LC6.1)} Explain in your own words how tasting soup relates to
-the concepts of sampling covered here.
-
-\textbf{(LC6.2)} Describe a different scenario (not food or drink
-related) that is analogous to sampling concepts covered here.
-
-\begin{center}\rule{0.5\linewidth}{\linethickness}\end{center}
-
 \subsection{Common terms}\label{common-terms}
 
 The process of sampling brings with it many common terms that we define
@@ -3872,6 +4075,20 @@ \subsection{Common terms}\label{common-terms}
 
 \begin{center}\rule{0.5\linewidth}{\linethickness}\end{center}
 
+\begin{center}\rule{0.5\linewidth}{\linethickness}\end{center}
+
+\begin{learncheck}
+\textbf{\emph{Learning check}}
+\end{learncheck}
+
+\textbf{(LC6.1)} Explain in your own words how tasting soup relates to
+the concepts of sampling covered here.
+
+\textbf{(LC6.2)} Describe a different scenario (not food or drink
+related) that is analogous to sampling concepts covered here.
+
+\begin{center}\rule{0.5\linewidth}{\linethickness}\end{center}
+
 Let's explore these terms for our tasting soup example:
 
 \emph{Population} - the entire container of soup that we have cooked.
@@ -3947,16 +4164,17 @@ \section{Visualizing sampling}\label{visualizing-sampling}
 
 Let's explore how sampling and these other terms relate to working with
 data and data visualization. Here we introduce the \texttt{okcupiddata}
-R package. Note that permission to use this data to create the R package
-was explicitly granted by OkCupid. More information about this package
-is available \href{https://github.com/rudeboybert/okcupiddata}{here}.
-The \texttt{profiles} data frame in this R data package contains data
-about 59,946 OkCupid users who were living within 25 miles of San
-Francisco, had active profiles on June 26, 2012, were online in the
-previous year, and had at least one picture in their profile. We will be
-focusing on the \texttt{height} variable, which corresponds to
-self-reported heights of the individual on their profile. Note that this
-is measured in inches.
+R package \citep{R-okcupiddata}. Note that permission to use this data
+to create the R package was explicitly granted by OkCupid. More
+information about this package is available
+\href{https://github.com/rudeboybert/okcupiddata}{here}. The
+\texttt{profiles} data frame in this R data package contains data about
+59,946 OkCupid users who were living within 25 miles of San Francisco,
+had active profiles on June 26, 2012, were online in the previous year,
+and had at least one picture in their profile. We will be focusing on
+the \texttt{height} variable, which corresponds to a self-reported
+height for each individual on their profile. Note that this is measured
+in inches.
 
 \begin{Shaded}
 \begin{Highlighting}[]
@@ -4003,8 +4221,7 @@ \section{Visualizing sampling}\label{visualizing-sampling}
 \begin{Shaded}
 \begin{Highlighting}[]
 \KeywordTok{library}\NormalTok{(dplyr)}
-\NormalTok{profiles_subset <-}\StringTok{ }\NormalTok{profiles %>%}\StringTok{ }
-\StringTok{  }\KeywordTok{filter}\NormalTok{(}\KeywordTok{between}\NormalTok{(height, }\DecValTok{55}\NormalTok{, }\DecValTok{85}\NormalTok{))}
+\NormalTok{profiles_subset <-}\StringTok{ }\NormalTok{profiles %>%}\StringTok{ }\KeywordTok{filter}\NormalTok{(}\KeywordTok{between}\NormalTok{(height, }\DecValTok{55}\NormalTok{, }\DecValTok{85}\NormalTok{))}
 \end{Highlighting}
 \end{Shaded}
 
@@ -4013,7 +4230,6 @@ \section{Visualizing sampling}\label{visualizing-sampling}
 
 \begin{Shaded}
 \begin{Highlighting}[]
-\KeywordTok{library}\NormalTok{(ggplot2)}
 \KeywordTok{ggplot}\NormalTok{(}\DataTypeTok{data =} \NormalTok{profiles_subset, }\DataTypeTok{mapping =} \KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =} \NormalTok{height)) +}
 \StringTok{  }\KeywordTok{geom_histogram}\NormalTok{(}\DataTypeTok{bins =} \DecValTok{20}\NormalTok{, }\DataTypeTok{color =} \StringTok{"white"}\NormalTok{)}
 \end{Highlighting}
@@ -4040,20 +4256,20 @@ \section{Visualizing sampling}\label{visualizing-sampling}
 same random sample when they run the code above. It is a way of
 interfacing with the pseudo-random number generation scheme that R uses
 to generate ``random'' numbers. If that command was not run, you'd
-obtain a different random sample if you ran the code above for the first
-time.
+obtain a different random sample than someone else if you ran the code
+above for the first time.
 
 We have introduced the \texttt{resample} function from the
-\texttt{mosaic} package here. This function can be used for both
-sampling with and without replacement. Here we have chosen to sample
-without replacement. In other words, after the first row is chosen from
-the \texttt{profiles\_subset} data frame at random it is kept out of the
-further 99 samples. Let's now visualize the 100 values of the
-\texttt{height} variable in the \texttt{profiles\_sample1} data frame.
-To keep this visualization on the same horizontal scale as our original
-population presented in \texttt{profiles\_subset} we can use the
-\texttt{coord\_cartesian} function along with the \texttt{c} function to
-specify the limits on the horizontal axis.
+\texttt{mosaic} package here \citep{R-mosaic}. This function can be used
+for both sampling with and without replacement. Here we have chosen to
+sample without replacement. In other words, after the first row is
+chosen from the \texttt{profiles\_subset} data frame at random it is
+kept out of the further 99 samples. Let's now visualize the 100 values
+of the \texttt{height} variable in the \texttt{profiles\_sample1} data
+frame. To keep this visualization on the same horizontal scale as our
+original population presented in \texttt{profiles\_subset} we can use
+the \texttt{coord\_cartesian} function along with the \texttt{c}
+function to specify the limits on the horizontal axis.
 
 \begin{Shaded}
 \begin{Highlighting}[]
@@ -4095,11 +4311,11 @@ \section{Visualizing sampling}\label{visualizing-sampling}
 Remember that a sample can never truly quantify all of the properties of
 a population since it contains less data and, thus, less information. We
 can use the overall shape as a good guess as to the representativeness
-of the sample in regards to the population. We see that the above two
-random samples of size 100 have roughly the same shape as the original
-population \texttt{height} data. Let's next explore what is known as a
-convenience sample and how its distribution compares to the population
-distribution.
+of the sample with regard to the population, though. We see that the above
+two random samples of size 100 have roughly the same shape as the
+original population \texttt{height} data. Let's next explore what is
+known as a convenience sample and how its distribution compares to the
+population distribution.
 
 A \textbf{convenience sample} is a sample that is chosen conveniently by
 the person selecting the sample. While certainly less work, convenience
@@ -4111,8 +4327,7 @@ \section{Visualizing sampling}\label{visualizing-sampling}
 
 \begin{Shaded}
 \begin{Highlighting}[]
-\NormalTok{profiles_sample3 <-}\StringTok{ }\NormalTok{profiles_subset %>%}\StringTok{ }
-\StringTok{  }\KeywordTok{filter}\NormalTok{(height >=}\StringTok{ }\DecValTok{72}\NormalTok{)}
+\NormalTok{profiles_sample3 <-}\StringTok{ }\NormalTok{profiles_subset %>%}\StringTok{ }\KeywordTok{filter}\NormalTok{(height >=}\StringTok{ }\DecValTok{72}\NormalTok{)}
 \KeywordTok{ggplot}\NormalTok{(}\DataTypeTok{data =} \NormalTok{profiles_sample3, }\DataTypeTok{mapping =} \KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =} \NormalTok{height)) +}
 \StringTok{  }\KeywordTok{geom_histogram}\NormalTok{(}\DataTypeTok{bins =} \DecValTok{20}\NormalTok{, }\DataTypeTok{color =} \StringTok{"white"}\NormalTok{, }\DataTypeTok{fill =} \StringTok{"blue"}\NormalTok{) +}
 \StringTok{  }\KeywordTok{coord_cartesian}\NormalTok{(}\DataTypeTok{xlim =} \KeywordTok{c}\NormalTok{(}\DecValTok{55}\NormalTok{, }\DecValTok{85}\NormalTok{))}
@@ -4122,11 +4337,11 @@ \section{Visualizing sampling}\label{visualizing-sampling}
 \begin{center}\includegraphics[width=\textwidth]{ismaykim_files/figure-latex/sample-profiles3-1} \end{center}
 
 This is a clear example of a sample that is not representative of the
-population. The population \texttt{height} is roughly symmetric, whereas
-this distribution is right-skewed. Further, since it only selects large
-heights it has completely excluded the small and middle heights. We have
-seen here that data visualization provides an excellent tool in judging
-the representativeness of a sample.
+population. The population \texttt{height} variable is roughly
+symmetric, whereas this distribution is right-skewed. Further, since it
+only selects large heights it has completely excluded the small and
+middle heights. We have seen here that data visualization provides an
+excellent tool in judging the representativeness of a sample.
 
 \subsection{Sampling distribution}\label{sampling-distribution}
 
@@ -4229,14 +4444,15 @@ \subsection{\texorpdfstring{Repeated sampling via
 Note how the range of sample mean height values is much more narrow than
 the original range of \texttt{height} in the \texttt{profiles\_subset}
 data frame. We also see a characteristic shape to this distribution of
-\texttt{sample\_mean}: the normal curve. This idea is commonly
+\texttt{mean\_height}: the normal curve. This idea is commonly
 associated with statistics and you hopefully have a good sense of how
 this distribution comes about. As before, if you aren't quite sure of
 this yet, go back and explore the shiny app above a bit more. We see
 that many values for the sample mean appear near the center of the
-distribution and a few values out in the tails providing the bell-shaped
-distribution linked with the normal distribution. You'll see more
-examples of this in the chapters to come and in the appendices.
+distribution and a few values appear out in the tails providing the
+bell-shaped distribution linked with the normal distribution. You'll see
+more examples of this in the chapters to come and in Appendix
+\ref{appendixB}.
 
 \begin{center}\rule{0.5\linewidth}{\linethickness}\end{center}
 
@@ -4333,10 +4549,10 @@ \section{Simulation}\label{simulation}
 It's amazing that there is no actual evidence that such an event
 actually took place. This problem is a great introduction into inference
 though and we can proceed by testing to see how likely it is for a
-person to guess correctly, say, 9 out of 10 times assuming that that
-person is just guessing. In other words, is the person just lucky or do
-we have reason to suspect that they can actually detect whether milk was
-put in first or not?
+person to guess correctly, say, 9 out of 10 times, assuming that person
+is just guessing. In other words, is the person just lucky or do we have
+reason to suspect that they can actually detect whether milk was put in
+first or not?
 
 We need to think about this problem from the standpoint of hypothesis
 testing. First, we'll need to identify some important parts of a
@@ -4433,10 +4649,11 @@ \section{Simulation}\label{simulation}
 If you look at the output above for our simulation of 13 student
 guesses, we can begin to get a sense for what an ``expected'' sample
 proportion of successes may be. Around five out of 10 seems to be the
-most likely value. What does this say about our assumed \(\hat{p}\) of
-9/10? To better answer this question, we can simulate 10,000 student
-guesses and then look at the distribution of the simulated sample
-proportion of successes, also known as the \textbf{null distribution}.
+most likely value. What does this say about what we actually observed
+with a success rate of 9/10? To better answer this question, we can
+simulate 10,000 student guesses and then look at the distribution of the
+simulated sample proportion of successes, also known as the \textbf{null
+distribution}.
 
 \begin{Shaded}
 \begin{Highlighting}[]
@@ -4466,7 +4683,7 @@ \section{Simulation}\label{simulation}
 \end{verbatim}
 
 We can see here that we have created a count of how many of each of the
-10,000 sets of 10 flips resulted in 0, 1, 2, \ldots{}, up to 10 heads.
+10,000 sets of 10 flips resulted in 0, 1, 2, \(\ldots\), up to 10 heads.
 Note the use of the \texttt{group\_by} and \texttt{summarize} functions
 from Chapter \ref{manip} here.
 
@@ -4490,11 +4707,11 @@ \section{Simulation}\label{simulation}
 
 \begin{figure}
 
-{\centering \includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-66-1} 
+{\centering \includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-71-1} 
 
 }
 
-\caption[Histogram of number of heads in simulation - needs tweaking]{Histogram of number of heads in simulation - needs tweaking}\label{fig:unnamed-chunk-66}
+\caption[Histogram of number of heads in simulation - needs tweaking]{Histogram of number of heads in simulation - needs tweaking}\label{fig:unnamed-chunk-71}
 \end{figure}
 
 The horizontal axis labels are a little confusing here. What does 2.5
@@ -4514,11 +4731,11 @@ \section{Simulation}\label{simulation}
 
 \begin{figure}
 
-{\centering \includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-67-1} 
+{\centering \includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-72-1} 
 
 }
 
-\caption[Barplot of number of heads in simulation]{Barplot of number of heads in simulation}\label{fig:unnamed-chunk-67}
+\caption[Barplot of number of heads in simulation]{Barplot of number of heads in simulation}\label{fig:unnamed-chunk-72}
 \end{figure}
 
 You'll frequently need to make this conversion to \texttt{factor} when
@@ -4530,11 +4747,11 @@ \section{Simulation}\label{simulation}
 1, 2, \(\ldots\), 10.
 
 Again, note that the distribution of the number of heads follows what appears
-to be a normal distribution. We'll see that if appropriate
-conditions/assumptions are met with the data that we can expect to see a
-normal distribution result. When these conditions aren't met, the
-simulation methodology we've presented here still works well whereas the
-traditional normal-based methods start to fall apart.
+to be a normal distribution. We'll see in a related example that if
+appropriate conditions/assumptions are met with the data, we can expect
+to see a normal distribution result. When these conditions aren't
+met, the simulation methodology we've presented here still works well
+whereas the traditional normal-based methods start to fall apart.
 
 We will delve further into hypothesis testing in the next few chapters.
 This null distribution in combination with the \textbf{sampling
@@ -4590,12 +4807,14 @@ \section{\texorpdfstring{Review of \texttt{mosaic} simulation
 
 \begin{center}\rule{0.5\linewidth}{\linethickness}\end{center}
 
-\section{Script of R code}\label{script-of-r-code-2}
+\section{Conclusion}\label{conclusion-2}
+
+\subsection{Script of R code}\label{script-of-r-code-2}
 
 An R script file of all R code used in this chapter is available
 \href{http://ismayc.github.io/moderndiver-book/06-sim.R}{here}.
 
-\section{What's to come?}\label{whats-to-come-3}
+\subsection{What's to come?}\label{whats-to-come-3}
 
 This chapter has served as an introduction into inferential techniques
 that will be discussed in greater detail in Chapter \ref{hypo} for
@@ -4621,8 +4840,8 @@ \chapter{Hypothesis Testing}\label{hypo}
 for each variation, but the important idea is to understand the general
 framework so that you can apply it to more specific problems. We believe
 that this approach is much better in the long-term than teaching you
-specific tests and confidence intervals rigorously. You can find full
-worked out examples for five common hypothesis tests and their
+specific tests and confidence intervals rigorously. You can find
+fully worked-out examples for five common hypothesis tests and their
 corresponding confidence intervals in Appendix \ref{appendixB}. We
 recommend that you carefully review these examples as they also cover
 how the general frameworks apply to traditional normal-based
@@ -4634,14 +4853,13 @@ \chapter{Hypothesis Testing}\label{hypo}
 of restrictions and further advance computational thinking, which is one
 big reason for their emphasis throughout this textbook.
 
-\section*{Needed packages}\label{needed-packages-3}
-\addcontentsline{toc}{section}{Needed packages}
+\subsection*{Needed packages}\label{needed-packages-4}
+\addcontentsline{toc}{subsection}{Needed packages}
 
 \begin{Shaded}
 \begin{Highlighting}[]
 \KeywordTok{library}\NormalTok{(dplyr)}
 \KeywordTok{library}\NormalTok{(ggplot2)}
-\KeywordTok{library}\NormalTok{(okcupiddata)}
 \KeywordTok{library}\NormalTok{(mosaic)}
 \KeywordTok{library}\NormalTok{(knitr)}
 \KeywordTok{library}\NormalTok{(nycflights13)}
@@ -4728,7 +4946,7 @@ \section{When Inference Is Not
 \end{Highlighting}
 \end{Shaded}
 
-\begin{center}\includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-72-1} \end{center}
+\begin{center}\includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-77-1} \end{center}
 
 Since there is no overlap at all, we can conclude that the
 \texttt{air\_time} for San Francisco flights is statistically greater
@@ -4808,12 +5026,12 @@ \section{Criminal trial analogy}\label{trial}
 We can think of hypothesis testing in the same context as a criminal
 trial in the United States. A criminal trial in the United States is a
 familiar situation in which a choice between two contradictory claims
-must be made. 1. The accuser of the crime must be judged either guilty
-or not guilty.
+must be made.
 
 \begin{enumerate}
 \def\labelenumi{\arabic{enumi}.}
-\setcounter{enumi}{1}
+\item
+  The person accused of the crime must be judged either guilty or not guilty.
 \item
   Under the U.S. system of justice, the individual on trial is initially
   presumed not guilty.
@@ -4899,9 +5117,16 @@ \section{Types of Errors in Hypothesis
   a guilty person is set free (found not guilty).
 \end{itemize}
 
-The possible errors in a hypothesis test are - rejecting \(H_0\) when in
-fact \(H_0\) is true (Type I Error) - failing to reject \(H_0\) when in
-fact \(H_0\) is false (Type II Error)
+The possible errors in a hypothesis test are
+
+\begin{itemize}
+\tightlist
+\item
+  rejecting \(H_0\) when in fact \(H_0\) is true (Type I Error) or
+\item
+  failing to reject \(H_0\) when in fact \(H_0\) is false (Type II
+  Error).
+\end{itemize}
 
 The risk of error is the price researchers pay for basing an inference
 about a population on a sample. With any reasonable sample-based
@@ -4917,7 +5142,7 @@ \section{Types of Errors in Hypothesis
 
 }
 
-\caption[Type I and Type II errors]{Type I and Type II errors}\label{fig:unnamed-chunk-73}
+\caption[Type I and Type II errors]{Type I and Type II errors}\label{fig:unnamed-chunk-78}
 \end{figure}
 
 If we are using sample data to make inferences about a parameter, we run
@@ -4969,8 +5194,8 @@ \section{Types of Errors in Hypothesis
 failing to reject \(H_0\) when we should -- will \emph{increase}! Thus,
 as \(\alpha\) decreases, \(\beta\) increases and as \(\alpha\)
 increases, \(\beta\) decreases. We, therefore, need to strike a balance
-in \(\alpha\) and \(\beta\) and the common values of 0.05, 0.01, and
-0.10 usually lead to a nice balance.
+between \(\alpha\) and \(\beta\) and the common values for \(\alpha\) of
+0.05, 0.01, and 0.10 usually lead to a nice balance.
 
 \begin{center}\rule{0.5\linewidth}{\linethickness}\end{center}
 
@@ -4978,8 +5203,8 @@ \section{Types of Errors in Hypothesis
 \textbf{\emph{Learning check}}
 \end{learncheck}
 
-\textbf{(LC7.2)} Reproduce the table above, but for a hypothesis test,
-instead of the one provided for a criminal trial.
+\textbf{(LC7.2)} Reproduce the table above about errors, but for a
+hypothesis test, instead of the one provided for a criminal trial.
 
 \begin{center}\rule{0.5\linewidth}{\linethickness}\end{center}
 
@@ -4989,7 +5214,7 @@ \subsection{Logic of Hypothesis
 \begin{itemize}
 \tightlist
 \item
-  Take a random sample (or samples) from a population (or two
+  Take a random sample (or samples) from a population (or multiple
   populations)
 \item
   If the sample data are consistent with the null hypothesis, do not
@@ -5005,24 +5230,12 @@ \section{Statistical Significance}\label{statistical-significance}
 
 The idea that sample results are more extreme than we would reasonably
 expect to see by random chance if the null hypothesis were true is the
-fundamental idea behind statistical hypothesis tests. If data as extreme
-would be very unlikely if the null hypothesis were true, we say the data
-are \textbf{statistically significant}. Statistically significant data
-provide convincing evidence against the null hypothesis in favor of the
-alternative, and allow us to generalize our sample results to the claim
-about the population.
-
-\begin{center}\rule{0.5\linewidth}{\linethickness}\end{center}
-
-\textbf{Definition: Statistical Significance}
-
-When results as extreme as the observed sample statistic are unlikely to
-occur by random chance alone (assuming the null hypothesis is true), we
-say the sample results/statistics are \emph{statistically significant}.
-If our sample is statistically significant, we have convincing evidence
-against \(H_0\) and in favor of \(H_a\).
-
-\begin{center}\rule{0.5\linewidth}{\linethickness}\end{center}
+fundamental idea behind statistical hypothesis tests. If data at least
+as extreme would be very unlikely if the null hypothesis were true, we
+say the data are \textbf{statistically significant}. Statistically
+significant data provide convincing evidence against the null hypothesis
+in favor of the alternative, and allow us to generalize our sample
+results to the claim about the population.
 
 \begin{learncheck}
 \textbf{\emph{Learning check}}
@@ -5052,7 +5265,7 @@ \section{EXAMPLE: Revisiting the Lady Tasting
 \caption[Hypothesis Testing Framework]{Hypothesis Testing Framework}\label{fig:htdowney2}
 \end{figure}
 
-We will now walk-through how each of the steps to the diagram apply to
+We will now walk through how each of the steps in the diagram applies to
 determining whether the lady tasting tea was actually better than chance
 at determining whether or not milk was added first. We will see that the
 process of creating a null distribution is a statistical way to
@@ -5060,7 +5273,7 @@ \section{EXAMPLE: Revisiting the Lady Tasting
 
 \subsection{Data}\label{data}
 
-Let's assume as we did in Chapter \ref{sim}, that the lady is correct in
+Let's assume as we did in Chapter \ref{sim} that the lady is correct in
 determining whether milk was added first or not in 9 out of 10 trials.
 Our data, therefore, may look something like
 
@@ -5116,7 +5329,7 @@ \subsection{Simulated Data}\label{simulated-data}
 
 We now want to use this null hypothesis to simulate the test statistic
 assuming that the null hypothesis is true. Therefore, we want to figure
-out a way to simulate in 10 trials, getting either the choice Correct or
+out a way to simulate 10 trials, getting either the choice Correct or
 Incorrect, assuming that the probability of success (getting it Correct)
 in any given trial is 0.5.
 
@@ -5128,15 +5341,20 @@ \subsection{Simulated Data}\label{simulated-data}
 tactile, hands on experiment can help.
 
 In this case, flipping a fair coin is a great way to simulate this
-process. To simulate 10 trials, we could flip the fair coin and record
-Heads as Correct and Tails as Incorrect.
+process. This simulates how the sample could be collected assuming the
+null hypothesis is true. To simulate 10 trials, we could flip the fair
+coin and record Heads as Correct and Tails as Incorrect.
 
 Some simulated data using this coin flipping procedure may look like the
 following. Note that this data frame is not tidy, but is a convenient
 way to look at the results of the simulation in this wide format. The
 numbers on the far left correspond to the number of the trial.
 
-\begin{tabular}{l|l|l|l}
+\begin{table}
+
+\caption{\label{tab:sample-table}A table of three sets of 10 coin flips}
+\centering
+\begin{tabular}[t]{l|l|l|l}
 \hline
   & sample1 & sample2 & sample3\\
 \hline
@@ -5161,6 +5379,7 @@ \subsection{Simulated Data}\label{simulated-data}
 10 & Incorrect & Correct & Incorrect\\
 \hline
 \end{tabular}
+\end{table}
 
 We then use the formula for the \textbf{Test Statistic} to determine the
 simulated test statistic for each of these simulated samples. So in this
@@ -5171,11 +5390,11 @@ \subsection{Simulated Data}\label{simulated-data}
 \subsection{\texorpdfstring{Distribution of \(\delta\) under
 \(H_0\)}{Distribution of \textbackslash{}delta under H\_0}}\label{distribution-of-delta-under-h_0}
 
-We could continue this process say 10,000 times by flipping a coin in
+We could continue this process, say, 10,000 times by flipping a coin in
 sets of 10 for 10,000 repetitions and counting and taking note of how
 many heads out of 10 we have for each set. It's at this point that you
-realize that a computer can do this procedure much faster and more
-efficient than the tactile experiment with a coin.
+surely realize that a computer can do this procedure much faster and
+more efficiently than the tactile experiment with a coin.
 
 Recall that we've already created the distribution of 10,000 such coin
 flips and we've stored these values in the \texttt{heads} variable in
@@ -5189,7 +5408,7 @@ \subsection{\texorpdfstring{Distribution of \(\delta\) under
 \end{Highlighting}
 \end{Shaded}
 
-\begin{center}\includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-77-1} \end{center}
+\begin{center}\includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-81-1} \end{center}
 
 \subsection{The p-value}\label{the-p-value}
 
@@ -5213,7 +5432,7 @@ \subsection{The p-value}\label{the-p-value}
 this problem. Note that ``more extreme'' in this case corresponds to
 looking at values of 9 or greater since our alternative hypothesis
 invokes a right-tail test corresponding to a ``greater than'' hypothesis
-of \(H_a: \pi > 0.5\). In other words, we are looking to see how likely
+of \(H_a: \tau > 5\). In other words, we are looking to see how likely
 it is for the lady to pick 9 or more correct instead of 9 or less
 correct. We'd like to go in the right direction.
 
@@ -5235,235 +5454,14 @@ \subsection{The p-value}\label{the-p-value}
   us to this naming.
 \item
   We are working with the \texttt{simGuesses} data frame here so that
-  comes immediately before the pipe operator.
+  comes immediately before the pipe operator.
 \item
   We would like to only focus on the rows in our \texttt{simGuesses}
   data frame that have \texttt{heads} values of 9 or 10. This represents
   simulated statistics ``as extreme or more extreme'' than what we
-  observed (9 correct guesses out of 10). Let's get a glimpse of what we
-  have up to this point:
-
-\begin{Shaded}
-\begin{Highlighting}[]
-\KeywordTok{kable}\NormalTok{(simGuesses %>%}\StringTok{ }\KeywordTok{filter}\NormalTok{(heads >=}\StringTok{ }\DecValTok{9}\NormalTok{))    }
-\end{Highlighting}
-\end{Shaded}
-
-  \begin{tabular}{r|r|r|r}
-  \hline
-  n & heads & tails & prop\\
-  \hline
-  10 & 9 & 1 & 0.9\\
-  \hline
-  10 & 10 & 0 & 1.0\\
-  \hline
-  10 & 9 & 1 & 0.9\\
-  \hline
-  10 & 9 & 1 & 0.9\\
-  \hline
-  10 & 10 & 0 & 1.0\\
-  \hline
-  10 & 9 & 1 & 0.9\\
-  \hline
-  10 & 9 & 1 & 0.9\\
-  \hline
-  10 & 9 & 1 & 0.9\\
-  \hline
-  10 & 9 & 1 & 0.9\\
-  \hline
-  10 & 9 & 1 & 0.9\\
-  \hline
-  10 & 9 & 1 & 0.9\\
-  \hline
-  10 & 9 & 1 & 0.9\\
-  \hline
-  10 & 9 & 1 & 0.9\\
-  \hline
-  10 & 9 & 1 & 0.9\\
-  \hline
-  10 & 9 & 1 & 0.9\\
-  \hline
-  10 & 9 & 1 & 0.9\\
-  \hline
-  10 & 9 & 1 & 0.9\\
-  \hline
-  10 & 9 & 1 & 0.9\\
-  \hline
-  10 & 9 & 1 & 0.9\\
-  \hline
-  10 & 9 & 1 & 0.9\\
-  \hline
-  10 & 9 & 1 & 0.9\\
-  \hline
-  10 & 9 & 1 & 0.9\\
-  \hline
-  10 & 10 & 0 & 1.0\\
-  \hline
-  10 & 10 & 0 & 1.0\\
-  \hline
-  10 & 10 & 0 & 1.0\\
-  \hline
-  10 & 9 & 1 & 0.9\\
-  \hline
-  10 & 9 & 1 & 0.9\\
-  \hline
-  10 & 9 & 1 & 0.9\\
-  \hline
-  10 & 9 & 1 & 0.9\\
-  \hline
-  10 & 9 & 1 & 0.9\\
-  \hline
-  10 & 9 & 1 & 0.9\\
-  \hline
-  10 & 9 & 1 & 0.9\\
-  \hline
-  10 & 10 & 0 & 1.0\\
-  \hline
-  10 & 9 & 1 & 0.9\\
-  \hline
-  10 & 9 & 1 & 0.9\\
-  \hline
-  10 & 10 & 0 & 1.0\\
-  \hline
-  10 & 9 & 1 & 0.9\\
-  \hline
-  10 & 9 & 1 & 0.9\\
-  \hline
-  10 & 9 & 1 & 0.9\\
-  \hline
-  10 & 9 & 1 & 0.9\\
-  \hline
-  10 & 9 & 1 & 0.9\\
-  \hline
-  10 & 9 & 1 & 0.9\\
-  \hline
-  10 & 9 & 1 & 0.9\\
-  \hline
-  10 & 9 & 1 & 0.9\\
-  \hline
-  10 & 9 & 1 & 0.9\\
-  \hline
-  10 & 9 & 1 & 0.9\\
-  \hline
-  10 & 9 & 1 & 0.9\\
-  \hline
-  10 & 9 & 1 & 0.9\\
-  \hline
-  10 & 9 & 1 & 0.9\\
-  \hline
-  10 & 9 & 1 & 0.9\\
-  \hline
-  10 & 9 & 1 & 0.9\\
-  \hline
-  10 & 9 & 1 & 0.9\\
-  \hline
-  10 & 9 & 1 & 0.9\\
-  \hline
-  10 & 9 & 1 & 0.9\\
-  \hline
-  10 & 9 & 1 & 0.9\\
-  \hline
-  10 & 10 & 0 & 1.0\\
-  \hline
-  10 & 10 & 0 & 1.0\\
-  \hline
-  10 & 9 & 1 & 0.9\\
-  \hline
-  10 & 9 & 1 & 0.9\\
-  \hline
-  10 & 9 & 1 & 0.9\\
-  \hline
-  10 & 9 & 1 & 0.9\\
-  \hline
-  10 & 9 & 1 & 0.9\\
-  \hline
-  10 & 9 & 1 & 0.9\\
-  \hline
-  10 & 9 & 1 & 0.9\\
-  \hline
-  10 & 9 & 1 & 0.9\\
-  \hline
-  10 & 10 & 0 & 1.0\\
-  \hline
-  10 & 9 & 1 & 0.9\\
-  \hline
-  10 & 9 & 1 & 0.9\\
-  \hline
-  10 & 9 & 1 & 0.9\\
-  \hline
-  10 & 9 & 1 & 0.9\\
-  \hline
-  10 & 9 & 1 & 0.9\\
-  \hline
-  10 & 9 & 1 & 0.9\\
-  \hline
-  10 & 9 & 1 & 0.9\\
-  \hline
-  10 & 9 & 1 & 0.9\\
-  \hline
-  10 & 9 & 1 & 0.9\\
-  \hline
-  10 & 9 & 1 & 0.9\\
-  \hline
-  10 & 9 & 1 & 0.9\\
-  \hline
-  10 & 9 & 1 & 0.9\\
-  \hline
-  10 & 10 & 0 & 1.0\\
-  \hline
-  10 & 9 & 1 & 0.9\\
-  \hline
-  10 & 9 & 1 & 0.9\\
-  \hline
-  10 & 10 & 0 & 1.0\\
-  \hline
-  10 & 9 & 1 & 0.9\\
-  \hline
-  10 & 9 & 1 & 0.9\\
-  \hline
-  10 & 9 & 1 & 0.9\\
-  \hline
-  10 & 10 & 0 & 1.0\\
-  \hline
-  10 & 10 & 0 & 1.0\\
-  \hline
-  10 & 9 & 1 & 0.9\\
-  \hline
-  10 & 9 & 1 & 0.9\\
-  \hline
-  10 & 9 & 1 & 0.9\\
-  \hline
-  10 & 9 & 1 & 0.9\\
-  \hline
-  10 & 9 & 1 & 0.9\\
-  \hline
-  10 & 9 & 1 & 0.9\\
-  \hline
-  10 & 9 & 1 & 0.9\\
-  \hline
-  10 & 9 & 1 & 0.9\\
-  \hline
-  10 & 9 & 1 & 0.9\\
-  \hline
-  10 & 9 & 1 & 0.9\\
-  \hline
-  10 & 9 & 1 & 0.9\\
-  \hline
-  10 & 9 & 1 & 0.9\\
-  \hline
-  10 & 9 & 1 & 0.9\\
-  \hline
-  10 & 9 & 1 & 0.9\\
-  \hline
-  10 & 9 & 1 & 0.9\\
-  \hline
-  10 & 9 & 1 & 0.9\\
-  \hline
-  10 & 9 & 1 & 0.9\\
-  \hline
-  10 & 9 & 1 & 0.9\\
-  \hline
-  \end{tabular}
+  observed (9 correct guesses out of 10). To get a glimpse of what we
+  have up to this point, run
+  \texttt{simGuesses\ \%\textgreater{}\%\ filter(heads\ \textgreater{}=\ 9)\ \%\textgreater{}\%\ View()}.
 \item
   Now that we have changed the focus to only those rows that have number
   of heads out of 10 flips corresponding to 9 or more, we count how many
@@ -5479,7 +5477,7 @@ \subsection{The p-value}\label{the-p-value}
 have evidence supporting the conclusion that the person is actually
 better than just guessing at random at determining whether milk has been
 added first or not. To better visualize this we can also make use of
-pink shading on the histogram corresponding to the \(p\)-value:
+blue shading on the histogram corresponding to the \(p\)-value:
 
 \begin{Shaded}
 \begin{Highlighting}[]
@@ -5492,23 +5490,20 @@ \subsection{The p-value}\label{the-p-value}
 
 \begin{figure}
 
-{\centering \includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-80-1} 
+{\centering \includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-83-1} 
 
 }
 
-\caption[Barplot of heads with p-value highlighted]{Barplot of heads with p-value highlighted}\label{fig:unnamed-chunk-80}
+\caption[Barplot of heads with p-value highlighted]{Barplot of heads with p-value highlighted}\label{fig:unnamed-chunk-83}
 \end{figure}
 
 This helps us better see just how few of the values of \texttt{heads}
-are at our observed value or more extreme.
-
-We'll see in Chapters \ref{hypo} and \ref{ci} that this idea of a
-\(p\)-value can be extended to the more traditional methods using normal
-and \(t\) distributions in the traditional way that introductory
-statistics has been presented. These traditional methods were used
-because statisticians haven't always been able to do 10,000 simulations
-on the computer within seconds. We'll elaborate on this more in these
-later chapters.
+are at our observed value or more extreme. This idea of a \(p\)-value
+can be extended to the more traditional methods using normal and \(t\)
+distributions, which is how introductory statistics has usually
+been presented. These traditional methods were used because
+statisticians haven't always been able to do 10,000 simulations on the
+computer within seconds. We'll elaborate on this more in a few sections.
 
 \begin{center}\rule{0.5\linewidth}{\linethickness}\end{center}
 
@@ -5516,13 +5511,16 @@ \subsection{The p-value}\label{the-p-value}
 \textbf{\emph{Learning check}}
 \end{learncheck}
 
-\textbf{(LC7.6)} What is meant by ``pseudo-random number generation?''
+\textbf{(LC7.6)} How could we make Table \ref{tab:sample-table} into a
+tidy data frame?
 
-\textbf{(LC7.7)} How can simulation be used to help us address the
+\textbf{(LC7.7)} What is meant by ``pseudo-random number generation?''
+
+\textbf{(LC7.8)} How can simulation be used to help us address the
 question of whether or not an observed result is statistically
 significant?
 
-\textbf{(LC7.8)} In Chapter \ref{viz}, we noted that barplots should be
+\textbf{(LC7.9)} In Chapter \ref{viz}, we noted that barplots should be
 used when creating a plot of categorical variables. Why are we using
 barplots to make a plot of a numerical variable \texttt{heads} in this
 chapter?
@@ -5539,7 +5537,8 @@ \subsection{Randomization/Permutation}\label{randomizationpermutation}
 means using the Greek symbol \(\mu\) (pronounced ``mu''). Thus, we will
 be looking to see if one group ``out-performs'' another group. This is
 quite possibly the most common type of statistical inference and serves
-as a basis for many other types of analyses when comparing two groups.
+as a basis for many other types of analyses when comparing the
+relationship between two variables.
 
 Our null hypothesis will be of the form \(H_0: \mu_1 = \mu_2\), which
 can also be written as \(H_0: \mu_1 - \mu_2 = 0\). Our alternative
@@ -5560,11 +5559,11 @@ \subsection{Comparing Action and Romance
 
 The \texttt{movies} data set in the \texttt{ggplot2movies} package
 contains information on a large number of movies that have been rated by
-users of IMDB.com. We are interested in the question here of whether
-\texttt{Action} movies are rated higher on IMDB than \texttt{Romance}
-movies. We will first need to do a little bit of data manipulation using
-the ideas from Chapter \ref{manip} to get the data in the form that we
-would like:
+users of IMDB.com \citep{R-ggplot2movies}. We are interested in the
+question here of whether \texttt{Action} movies are rated higher on IMDB
+than \texttt{Romance} movies. We will first need to do a little bit of
+data manipulation using the ideas from Chapter \ref{manip} to get the
+data in the form that we would like:
 
 \begin{Shaded}
 \begin{Highlighting}[]
@@ -5631,11 +5630,11 @@ \subsection{Comparing Action and Romance
 \textbf{\emph{Learning check}}
 \end{learncheck}
 
-\textbf{(LC7.9)} Why are the different genre variables stored as binary
+\textbf{(LC7.10)} Why are the different genre variables stored as binary
 variables (1s and 0s) instead of just listing the \texttt{genre} as a
 column of values like ``Action'', ``Comedy'', etc.?
 
-\textbf{(LC7.10)} What complications could come above with us excluding
+\textbf{(LC7.11)} What complications could come about with us excluding
 action romance movies? Should we question the results of our hypothesis
 test? Explain.
 
@@ -5655,11 +5654,11 @@ \subsection{Comparing Action and Romance
 
 \begin{figure}
 
-{\centering \includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-84-1} 
+{\centering \includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-87-1} 
 
 }
 
-\caption[Rating vs genre in the population]{Rating vs genre in the population}\label{fig:unnamed-chunk-84}
+\caption[Rating vs genre in the population]{Rating vs genre in the population}\label{fig:unnamed-chunk-87}
 \end{figure}
 
 We can see that the middle 50\% of ratings for \texttt{"Action"} movies
@@ -5703,7 +5702,7 @@ \subsection{\texorpdfstring{Sampling \(\rightarrow\)
 example, whether a \textbf{treatment} has an effect over a
 \textbf{control} and other ways to statistically analyze if one group
 performs better than, worse than, or different than another. We will
-also use confidence intervals to determine the size of the effect if it
+also use confidence intervals to determine the size of the effect, if it
 exists. You'll see more on this in Chapter \ref{ci}.
 
 We are interested here in seeing how we can use a random sample of
@@ -5717,7 +5716,7 @@ \subsection{\texorpdfstring{Sampling \(\rightarrow\)
 \textbf{\emph{Learning check}}
 \end{learncheck}
 
-\textbf{(LC7.11)} Define the relevant parameters here in terms of the
+\textbf{(LC7.12)} Define the relevant parameters here in terms of the
 populations of movies.
 
 \begin{center}\rule{0.5\linewidth}{\linethickness}\end{center}
@@ -5746,18 +5745,18 @@ \subsection{Data}\label{data-1}
 
 \begin{Shaded}
 \begin{Highlighting}[]
- \KeywordTok{ggplot}\NormalTok{(}\DataTypeTok{data =} \NormalTok{movies_genre_sample, }\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =} \NormalTok{genre, }\DataTypeTok{y =} \NormalTok{rating)) +}
+\KeywordTok{ggplot}\NormalTok{(}\DataTypeTok{data =} \NormalTok{movies_genre_sample, }\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =} \NormalTok{genre, }\DataTypeTok{y =} \NormalTok{rating)) +}
 \StringTok{  }\KeywordTok{geom_boxplot}\NormalTok{()}
 \end{Highlighting}
 \end{Shaded}
 
 \begin{figure}
 
-{\centering \includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-86-1} 
+{\centering \includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-89-1} 
 
 }
 
-\caption[Genre vs rating for our sample]{Genre vs rating for our sample}\label{fig:unnamed-chunk-86}
+\caption[Genre vs rating for our sample]{Genre vs rating for our sample}\label{fig:unnamed-chunk-89}
 \end{figure}
 
 \begin{Shaded}
@@ -5770,11 +5769,11 @@ \subsection{Data}\label{data-1}
 
 \begin{figure}
 
-{\centering \includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-87-1} 
+{\centering \includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-90-1} 
 
 }
 
-\caption[Genre vs rating for our sample as faceted histogram]{Genre vs rating for our sample as faceted histogram}\label{fig:unnamed-chunk-87}
+\caption[Genre vs rating for our sample as faceted histogram]{Genre vs rating for our sample as faceted histogram}\label{fig:unnamed-chunk-90}
 \end{figure}
 
 \begin{center}\rule{0.5\linewidth}{\linethickness}\end{center}
@@ -5783,7 +5782,7 @@ \subsection{Data}\label{data-1}
 \textbf{\emph{Learning check}}
 \end{learncheck}
 
-\textbf{(LC7.12)} What single value could we change to improve the
+\textbf{(LC7.13)} What single value could we change to improve the
 approximation using the sample distribution on the population
 distribution?
 
@@ -5806,17 +5805,19 @@ \subsection{Data}\label{data-1}
 \StringTok{  }\KeywordTok{summarize}\NormalTok{(}\DataTypeTok{mean =} \KeywordTok{mean}\NormalTok{(rating),}
             \DataTypeTok{std_dev =} \KeywordTok{sd}\NormalTok{(rating),}
             \DataTypeTok{n =} \KeywordTok{n}\NormalTok{())}
-\NormalTok{summary_ratings}
+\NormalTok{summary_ratings %>%}\StringTok{ }\KeywordTok{kable}\NormalTok{()}
 \end{Highlighting}
 \end{Shaded}
 
-\begin{verbatim}
-## # A tibble: 2 × 4
-##     genre     mean  std_dev     n
-##     <chr>    <dbl>    <dbl> <int>
-## 1  Action 5.197059 1.464837    34
-## 2 Romance 6.026471 1.202096    34
-\end{verbatim}
+\begin{tabular}{l|r|r|r}
+\hline
+genre & mean & std\_dev & n\\
+\hline
+Action & 5.197059 & 1.464837 & 34\\
+\hline
+Romance & 6.026471 & 1.202096 & 34\\
+\hline
+\end{tabular}
 
 \begin{center}\rule{0.5\linewidth}{\linethickness}\end{center}
 
@@ -5824,7 +5825,7 @@ \subsection{Data}\label{data-1}
 \textbf{\emph{Learning check}}
 \end{learncheck}
 
-\textbf{(LC7.13)} Why did we not specify \texttt{na.rm\ =\ TRUE} here as
+\textbf{(LC7.14)} Why did we not specify \texttt{na.rm\ =\ TRUE} here as
 we did in Chapter \ref{manip}?
 
 \begin{center}\rule{0.5\linewidth}{\linethickness}\end{center}
@@ -5842,7 +5843,7 @@ \subsection{Data}\label{data-1}
 \textbf{\emph{Learning check}}
 \end{learncheck}
 
-\textbf{(LC7.14)} Why might the standard deviation provide some insight
+\textbf{(LC7.15)} Why might the standard deviation provide some insight
 about the means being statistically different or not?
 
 \begin{center}\rule{0.5\linewidth}{\linethickness}\end{center}
@@ -5917,26 +5918,11 @@ \subsection{Simulated Data}\label{simulated-data-1}
 doing the calculation using index cards, we can use R as we have before
 to simulate this process.
 
-\begin{center}\rule{0.5\linewidth}{\linethickness}\end{center}
-
-\begin{learncheck}
-\textbf{\emph{Learning check}}
-\end{learncheck}
-
-\textbf{(LC7.15)} How would the tactile shuffling of index cards change
-if we had different samples of say 20 action movies and 60 romance
-movies? Describe each step that would change.
-
-\textbf{(LC7.16)} Why are we taking the difference in the means of the
-cards in the new shuffled decks?
-
-\begin{center}\rule{0.5\linewidth}{\linethickness}\end{center}
-
 \begin{Shaded}
 \begin{Highlighting}[]
 \KeywordTok{library}\NormalTok{(mosaic)}
 \NormalTok{shuffled_ratings <-}\StringTok{ }\NormalTok{movies_trimmed %>%}
-\StringTok{     }\KeywordTok{mutate}\NormalTok{(}\DataTypeTok{genre =} \KeywordTok{shuffle}\NormalTok{(genre)) %>%}\StringTok{ }
+\StringTok{     }\KeywordTok{mutate}\NormalTok{(}\DataTypeTok{rating =} \KeywordTok{shuffle}\NormalTok{(rating)) %>%}\StringTok{ }
 \StringTok{     }\KeywordTok{group_by}\NormalTok{(genre) %>%}
 \StringTok{     }\KeywordTok{summarize}\NormalTok{(}\DataTypeTok{mean =} \KeywordTok{mean}\NormalTok{(rating))}
 \KeywordTok{diff}\NormalTok{(shuffled_ratings$mean)}
@@ -5944,9 +5930,24 @@ \subsection{Simulated Data}\label{simulated-data-1}
 \end{Shaded}
 
 \begin{verbatim}
-## [1] -0.0170207
+## [1] -0.02287811
 \end{verbatim}
 
+\begin{center}\rule{0.5\linewidth}{\linethickness}\end{center}
+
+\begin{learncheck}
+\textbf{\emph{Learning check}}
+\end{learncheck}
+
+\textbf{(LC7.16)} How would the tactile shuffling of index cards change
+if we had different samples of, say, 20 action movies and 60 romance
+movies? Describe each step that would change.
+
+\textbf{(LC7.17)} Why are we taking the difference in the means of the
+cards in the new shuffled decks?
+
+\begin{center}\rule{0.5\linewidth}{\linethickness}\end{center}
+
 \subsection{\texorpdfstring{Distribution of \(\delta\) under
 \(H_0\)}{Distribution of \textbackslash{}delta under H\_0}}\label{distribution-of-delta-under-h_0-1}
 
@@ -5996,11 +5997,11 @@ \subsection{\texorpdfstring{Distribution of \(\delta\) under
 
 \begin{figure}
 
-{\centering \includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-93-1} 
+{\centering \includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-96-1} 
 
 }
 
-\caption[Simulated differences in means histogram]{Simulated differences in means histogram}\label{fig:unnamed-chunk-93}
+\caption[Simulated differences in means histogram]{Simulated differences in means histogram}\label{fig:unnamed-chunk-96}
 \end{figure}
 
 \subsection{The p-value}\label{the-p-value-1}
@@ -6020,11 +6021,11 @@ \subsection{The p-value}\label{the-p-value-1}
 
 \begin{figure}
 
-{\centering \includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-94-1} 
+{\centering \includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-97-1} 
 
 }
 
-\caption[Shaded histogram to show p-value]{Shaded histogram to show p-value}\label{fig:unnamed-chunk-94}
+\caption[Shaded histogram to show p-value]{Shaded histogram to show p-value}\label{fig:unnamed-chunk-97}
 \end{figure}
 
 You may initially think there is an error here, but remember that the
@@ -6043,11 +6044,11 @@ \subsection{The p-value}\label{the-p-value-1}
 
 \begin{figure}
 
-{\centering \includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-95-1} 
+{\centering \includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-98-1} 
 
 }
 
-\caption[Histogram with vertical lines corresponding to observed statistic]{Histogram with vertical lines corresponding to observed statistic}\label{fig:unnamed-chunk-95}
+\caption[Histogram with vertical lines corresponding to observed statistic]{Histogram with vertical lines corresponding to observed statistic}\label{fig:unnamed-chunk-98}
 \end{figure}
 
 Based on this plot, we have no values as extreme or more extreme than
@@ -6066,31 +6067,31 @@ \subsection{The p-value}\label{the-p-value-1}
 \textbf{\emph{Learning check}}
 \end{learncheck}
 
-\textbf{(LC7.17)} Conduct the same analysis comparing action movies
+\textbf{(LC7.18)} Conduct the same analysis comparing action movies
 versus romantic movies using the median rating instead of the mean
 rating? Make sure to use the \texttt{\%\textgreater{}\%} as much as
 possible. What was different and what was the same?
 
-\textbf{(LC7.18)} What conclusions can you make from viewing the faceted
+\textbf{(LC7.19)} What conclusions can you make from viewing the faceted
 histogram looking at \texttt{rating} versus \texttt{genre} that you
 couldn't see when looking at the boxplot?
 
-\textbf{(LC7.19)} Describe in a paragraph how we used Allen Downey's
+\textbf{(LC7.20)} Describe in a paragraph how we used Allen Downey's
 diagram to conclude if a statistical difference existed between mean
 movie ratings for action and romance movies.
 
-\textbf{(LC7.20)} Why are we relatively confident that the distributions
+\textbf{(LC7.21)} Why are we relatively confident that the distributions
 of the sample ratings will be good approximations of the population
 distributions of ratings for the two genres?
 
-\textbf{(LC7.21)} Using the definition of ``\(p\)-value'', write in
+\textbf{(LC7.22)} Using the definition of ``\(p\)-value'', write in
 words what the \(p\)-value represents for the hypothesis test above
 comparing the mean rating of romance to action movies.
 
-\textbf{(LC7.22)} What is the value of the \(p\)-value for the
+\textbf{(LC7.23)} What is the value of the \(p\)-value for the
 hypothesis test comparing the mean rating of romance to action movies?
 
-\textbf{(LC7.23)} Do the results of the hypothesis test match up with
+\textbf{(LC7.24)} Do the results of the hypothesis test match up with
 the original plots we made looking at the population of movies? Why or
 why not?
 
@@ -6150,13 +6151,15 @@ \subsection{\texorpdfstring{EXAMPLE: \(t\)-test for two independent
 variable. Then you subtract the mean from each value of your variable
 and divide by the standard deviation. The most common normalization is
 known as the \(z\)-score. The formula for a \(z\)-score is
-\[Z = \frac{x - \mu}{\sigma}\], where \(x\) represent the value of a
+\[Z = \frac{x - \mu}{\sigma},\] where \(x\) represents the value of a
 variable, \(\mu\) represents the mean of the variable, and \(\sigma\)
-represents the standard deviation of the variable. \(z\)-scores are
-normally distributed with mean 0 and standard deviation 1. They have the
-common, bell-shaped pattern.
+represents the standard deviation of the variable. Thus, if your
+variable has 10 elements, each one has a corresponding \(z\)-score that
+gives how many standard deviations away that value is from its mean.
+\(z\)-scores always have mean 0 and standard deviation 1, and they are normally distributed when the original
+variable is. In that case, they have the common, bell-shaped pattern seen below.
 
-\begin{center}\includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-96-1} \end{center}
+\begin{center}\includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-99-1} \end{center}
 
 Recall, that we hardly ever know the mean and standard deviation of the
 population of interest. This is almost always the case when considering
@@ -6169,7 +6172,7 @@ \subsection{\texorpdfstring{EXAMPLE: \(t\)-test for two independent
 deviations. This normalization is often called the \(t\)-score. For the
 two independent samples case like what we have for comparing action
 movies to romance movies, the formula is
-\[T =\dfrac{ (\bar{x}_1 - \bar{x}_2) - (\mu_1 - \mu_2)}{ \sqrt{\dfrac{s_1^2}{n_1} + \dfrac{s_2^2}{n_2}}  }\]
+\[T =\dfrac{ (\bar{x}_1 - \bar{x}_2) - (\mu_1 - \mu_2)}{ \sqrt{\dfrac{{s_1}^2}{n_1} + \dfrac{{s_2}^2}{n_2}}  }\]
 
 There is a lot to try to unpack here.
 
@@ -6208,7 +6211,7 @@ \subsection{\texorpdfstring{EXAMPLE: \(t\)-test for two independent
 
 So, assuming \(H_0\) is true, our formula simplifies a bit:
 
-\[T =\dfrac{ \bar{x}_1 - \bar{x}_2}{ \sqrt{\dfrac{s_1^2}{n_1} + \dfrac{s_2^2}{n_2}}  }\]
+\[T =\dfrac{ \bar{x}_1 - \bar{x}_2}{ \sqrt{\dfrac{{s_1}^2}{n_1} + \dfrac{{s_2}^2}{n_2}}  }.\]
 
 We have already built an approximation for what we think the
 distribution of \(\delta = \bar{x}_1 - \bar{x}_2\) looks like using
@@ -6223,21 +6226,21 @@ \subsection{\texorpdfstring{EXAMPLE: \(t\)-test for two independent
 
 \begin{figure}
 
-{\centering \includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-97-1} 
+{\centering \includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-100-1} 
 
 }
 
-\caption[Simulated differences in means histogram]{Simulated differences in means histogram}\label{fig:unnamed-chunk-97}
+\caption[Simulated differences in means histogram]{Simulated differences in means histogram}\label{fig:unnamed-chunk-100}
 \end{figure}
 
 If we'd like to have a guess as to what the distribution of \(T\) might
 look like instead, we need only to divide every value in
 \texttt{rand\_distn} by
-\(\sqrt{\dfrac{s_1^2}{n_1} + \dfrac{s_2^2}{n_2}}\). As we did before, we
-will assign Romance to be group 1 and Action to be group 2. (This was
-done since Romance comes second alphabetically and the reason why we
-have a number mismatch below with 1 and 2.) Remember that we've already
-calculated these values:
+\[\sqrt{\dfrac{{s_1}^2}{n_1} + \dfrac{{s_2}^2}{n_2}}.\] As we did
+before, we will assign Romance to be group 1 and Action to be group 2.
+(This was done since Romance comes second alphabetically, and it is the
+reason why we have a number mismatch below with 1 and 2.) Remember that we've
+already calculated these values:
 
 \begin{Shaded}
 \begin{Highlighting}[]
@@ -6274,10 +6277,14 @@ \subsection{\texorpdfstring{EXAMPLE: \(t\)-test for two independent
 
 \begin{Shaded}
 \begin{Highlighting}[]
-\NormalTok{denom_T <-}\StringTok{ }\KeywordTok{sqrt}\NormalTok{( (s1^}\DecValTok{2} \NormalTok{/}\StringTok{ }\NormalTok{n1) +}\StringTok{ }\NormalTok{(s2^}\DecValTok{2} \NormalTok{/}\StringTok{ }\NormalTok{n2) )}
+\NormalTok{(denom_T <-}\StringTok{ }\KeywordTok{sqrt}\NormalTok{( (s1^}\DecValTok{2} \NormalTok{/}\StringTok{ }\NormalTok{n1) +}\StringTok{ }\NormalTok{(s2^}\DecValTok{2} \NormalTok{/}\StringTok{ }\NormalTok{n2) ))}
 \end{Highlighting}
 \end{Shaded}
 
+\begin{verbatim}
+## [1] 0.3249789
+\end{verbatim}
+
 Now if we divide all of the values of \texttt{diffmean} in
 \texttt{rand\_distn} by \texttt{denom\_T} we can have a simulated
 distribution of \(T\) test statistics instead:
@@ -6285,7 +6292,7 @@ \subsection{\texorpdfstring{EXAMPLE: \(t\)-test for two independent
 \begin{Shaded}
 \begin{Highlighting}[]
 \NormalTok{rand_distn <-}\StringTok{ }\NormalTok{rand_distn %>%}\StringTok{ }
-\StringTok{  }\KeywordTok{mutate}\NormalTok{(}\DataTypeTok{t_stat =} \NormalTok{diffmean /}\StringTok{ }\NormalTok{denom_T)}
+\StringTok{  }\KeywordTok{mutate}\NormalTok{(}\DataTypeTok{t_stat =} \NormalTok{diffmean /}\StringTok{ }\NormalTok{denom_T)}
 \KeywordTok{ggplot}\NormalTok{(}\DataTypeTok{data =} \NormalTok{rand_distn, }\KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =} \NormalTok{t_stat)) +}
 \StringTok{  }\KeywordTok{geom_histogram}\NormalTok{(}\DataTypeTok{color =} \StringTok{"white"}\NormalTok{, }\DataTypeTok{bins =} \DecValTok{20}\NormalTok{)}
 \end{Highlighting}
@@ -6293,11 +6300,11 @@ \subsection{\texorpdfstring{EXAMPLE: \(t\)-test for two independent
 
 \begin{figure}
 
-{\centering \includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-101-1} 
+{\centering \includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-104-1} 
 
 }
 
-\caption[Simulated T statistics histogram]{Simulated T statistics histogram}\label{fig:unnamed-chunk-101}
+\caption[Simulated T statistics histogram]{Simulated T statistics histogram}\label{fig:unnamed-chunk-104}
 \end{figure}
 
 We see that the shape of this distribution is the same as that of
@@ -6306,21 +6313,23 @@ \subsection{\texorpdfstring{EXAMPLE: \(t\)-test for two independent
 
 A traditional \(t\)-test doesn't look at this simulated distribution,
 but instead it looks at the \(t\)-curve with degrees of freedom equal to
-33 (the minimum of \(n_1 = 34 - 1 = 33\) and \(n_2 = 34 - 1 = 33\)). We
-now overlay what this \(t\)-curve looks like on top of the histogram
-showing the simulated \(T\) statistics.
+33 (the minimum of \(n_1 - 1 = 34 - 1 = 33\) and \(n_2 - 1 = 34 - 1 = 33\)).
+This curve is frequently called a \emph{density} curve and this is the
+reason why we specify the use of \texttt{y\ =\ ..density..} here in the
+\texttt{geom\_histogram}. We now overlay what this \(t\)-curve looks
+like on top of the histogram showing the simulated \(T\) statistics.
 
 \begin{Shaded}
 \begin{Highlighting}[]
 \KeywordTok{ggplot}\NormalTok{(}\DataTypeTok{data =} \NormalTok{rand_distn, }\DataTypeTok{mapping =} \KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =} \NormalTok{t_stat)) +}
-\StringTok{  }\KeywordTok{geom_histogram}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{y =} \NormalTok{..density..), }\DataTypeTok{color =} \StringTok{"white"}\NormalTok{, }\DataTypeTok{binwidth =} \FloatTok{0.1}\NormalTok{) +}
+\StringTok{  }\KeywordTok{geom_histogram}\NormalTok{(}\KeywordTok{aes}\NormalTok{(}\DataTypeTok{y =} \NormalTok{..density..), }\DataTypeTok{color =} \StringTok{"white"}\NormalTok{, }\DataTypeTok{binwidth =} \FloatTok{0.3}\NormalTok{) +}
 \StringTok{  }\KeywordTok{stat_function}\NormalTok{(}\DataTypeTok{fun =} \NormalTok{dt,}
     \DataTypeTok{args =} \KeywordTok{list}\NormalTok{(}\DataTypeTok{df =} \KeywordTok{min}\NormalTok{(n1 -}\StringTok{ }\DecValTok{1}\NormalTok{, n2 -}\StringTok{ }\DecValTok{1}\NormalTok{)), }
     \DataTypeTok{color =} \StringTok{"royalblue"}\NormalTok{, }\DataTypeTok{size =} \DecValTok{2}\NormalTok{)}
 \end{Highlighting}
 \end{Shaded}
 
-\begin{center}\includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-102-1} \end{center}
+\begin{center}\includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-105-1} \end{center}
 
 We can see that the curve does a good job of approximating the
 randomization distribution here. (More on when to expect for this to be
@@ -6358,7 +6367,7 @@ \subsection{\texorpdfstring{EXAMPLE: \(t\)-test for two independent
 \end{Highlighting}
 \end{Shaded}
 
-\begin{center}\includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-104-1} \end{center}
+\begin{center}\includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-107-1} \end{center}
 
 At this point, you should make a guess as to what a reasonable value may
 be for the \(p\)-value. Let's say the \(p\)-value is 0.01 or so. To
@@ -6416,10 +6425,14 @@ \subsection{Conditions for t-test}\label{conditions-for-t-test}
 researchers are using these techniques that utilize the power of
 computers.
 
+\section{Conclusion}\label{conclusion-3}
+
+\subsection{Script of R code}\label{script-of-r-code-3}
+
 An R script file of all R code used in this chapter is available
 \href{http://ismayc.github.io/moderndiver-book/07-hypo.R}{here}.
 
-\section{What's to come?}\label{whats-to-come-4}
+\subsection{What's to come?}\label{whats-to-come-4}
 
 This chapter examined the basics of hypothesis testing with terminology
 and also an example of how to apply the ``There is Only One Test''
@@ -6459,8 +6472,8 @@ \chapter{Confidence Intervals}\label{ci}
 (use a range of possible values based around our statistic to make a
 plausible guess as to the location of the parameter).
 
-\section*{Needed packages}\label{needed-packages-4}
-\addcontentsline{toc}{section}{Needed packages}
+\subsection*{Needed packages}\label{needed-packages-5}
+\addcontentsline{toc}{subsection}{Needed packages}
 
 \begin{Shaded}
 \begin{Highlighting}[]
@@ -6475,7 +6488,7 @@ \section{Bootstrapping}\label{bootstrapping}
 
 Just as we did in Chapter \ref{hypo} with the Lady Tasting Tea when
 making hypotheses about a population total with which we would like to
-test which one is more plausible, we can also use simulation to infer
+test which one is more plausible, we can also use computation to infer
 conclusions about a population quantitative statistic such as the mean.
 In this case, we will focus on constructing confidence intervals to
 produce plausible values for a population mean. (We can do a similar
@@ -6496,9 +6509,8 @@ \section{Bootstrapping}\label{bootstrapping}
 estimate the variability of our statistic from sample to sample. One
 neat feature of bootstrapping is that it enables us to approximate the
 sampling distribution and estimate the distribution's standard deviation
-using ONLY the information in the one selected (original) sample.
-
-It sounds just as plagued with the magical type qualities of traditional
+using ONLY the information in the one selected (original) sample. It
+sounds just as plagued with the magical type qualities of traditional
 theory-based inference on initial glance but we will see that it
 provides an intuitive and useful way to make inferences, especially when
 the samples are of medium to large size.
@@ -6518,11 +6530,10 @@ \section{Bootstrapping}\label{bootstrapping}
 
 Recall that you can also glance at this data frame using the
 \texttt{View} function and look at the help documentation for
-\texttt{movies} using the \texttt{?} function.
-
-We will explore many other features of this data set in the chapters to
-come, but here we will be focusing on the \texttt{rating} variable
-corresponding to the average IMDB user rating.
+\texttt{movies} using the \texttt{?} function. We will explore many
+other features of this data set in the chapters to come, but here we
+will be focusing on the \texttt{rating} variable corresponding to the
+average IMDB user rating.
 
 You may notice that this data set is quite large: 58,788 movies have
 data collected about them here. This will correspond to our population
@@ -6530,9 +6541,9 @@ \section{Bootstrapping}\label{bootstrapping}
 rarely known. We use this data set as our population here to show you
 the power of bootstrapping in estimating population parameters. We'll
 see how \textbf{confidence intervals} built using the bootstrap
-distribution do at including our population parameter of interest. Here
-we can actually calculate these values since our population is known,
-but remember that in general this isn't the case.
+distribution perform at including our population parameter of interest.
+Here we can actually calculate these values since our population is
+known, but remember that in general this isn't the case.
 
 Let's take a look at what the distribution of our population
 \texttt{ratings} looks like. We'll see that we will use the distribution
@@ -6547,11 +6558,11 @@ \section{Bootstrapping}\label{bootstrapping}
 
 \begin{figure}
 
-{\centering \includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-109-1} 
+{\centering \includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-112-1} 
 
 }
 
-\caption[Population ratings histogram]{Population ratings histogram}\label{fig:unnamed-chunk-109}
+\caption[Population ratings histogram]{Population ratings histogram}\label{fig:unnamed-chunk-112}
 \end{figure}
 
 \begin{center}\rule{0.5\linewidth}{\linethickness}\end{center}
@@ -6563,7 +6574,7 @@ \section{Bootstrapping}\label{bootstrapping}
 \textbf{(LC8.1)} Why was a histogram chosen as the plot to make for the
 \texttt{rating} variable above?
 
-\textbf{(LC8.2)} Why does the shape of the \texttt{rating} histogram
+\textbf{(LC8.2)} What does the shape of the \texttt{rating} histogram
 tell us about how IMDB users rate movies? What stands out about the
 plot?
 
@@ -6582,6 +6593,7 @@ \section{Bootstrapping}\label{bootstrapping}
 \begin{Highlighting}[]
 \KeywordTok{set.seed}\NormalTok{(}\DecValTok{2017}\NormalTok{)}
 \KeywordTok{library}\NormalTok{(mosaic)}
+\KeywordTok{library}\NormalTok{(dplyr)}
 \NormalTok{movies_sample <-}\StringTok{ }\NormalTok{movies %>%}\StringTok{ }\KeywordTok{resample}\NormalTok{(}\DataTypeTok{size =} \DecValTok{50}\NormalTok{, }\DataTypeTok{replace =} \OtherTok{FALSE}\NormalTok{)}
 \end{Highlighting}
 \end{Shaded}
@@ -6603,11 +6615,11 @@ \section{Bootstrapping}\label{bootstrapping}
 
 \begin{figure}
 
-{\centering \includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-111-1} 
+{\centering \includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-114-1} 
 
 }
 
-\caption[Sample ratings histogram]{Sample ratings histogram}\label{fig:unnamed-chunk-111}
+\caption[Sample ratings histogram]{Sample ratings histogram}\label{fig:unnamed-chunk-114}
 \end{figure}
 
 Remember that we can think of this histogram as an estimate of our
@@ -6647,7 +6659,6 @@ \section{Bootstrapping}\label{bootstrapping}
 
 \begin{Shaded}
 \begin{Highlighting}[]
-\KeywordTok{library}\NormalTok{(mosaic)}
 \NormalTok{boot1 <-}\StringTok{ }\KeywordTok{resample}\NormalTok{(movies_sample) %>%}
 \StringTok{  }\KeywordTok{arrange}\NormalTok{(orig.id)}
 \end{Highlighting}
@@ -6744,7 +6755,7 @@ \section{Bootstrapping}\label{bootstrapping}
 
 So what's the next step now? Just as we repeated the repetitions
 thousands of times with the ``Lady Tasting Tea'' example, we can do a
-similar thing here.
+similar thing here:
 
 \begin{Shaded}
 \begin{Highlighting}[]
@@ -6757,11 +6768,11 @@ \section{Bootstrapping}\label{bootstrapping}
 
 \begin{figure}
 
-{\centering \includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-116-1} 
+{\centering \includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-119-1} 
 
 }
 
-\caption[Bootstrapped means histogram]{Bootstrapped means histogram}\label{fig:unnamed-chunk-116}
+\caption[Bootstrapped means histogram]{Bootstrapped means histogram}\label{fig:unnamed-chunk-119}
 \end{figure}
 
 The shape of this resulting distribution may look familiar to you. It
@@ -6789,8 +6800,8 @@ \section{Bootstrapping}\label{bootstrapping}
 
 \begin{quote}
 Based on the sample data and bootstrapping techniques, we can be 95\%
-confident that the true mean rating of ALL IMDB ratings is between 5.456
-and 6.296.
+confident that the true mean rating of \textbf{ALL} IMDB ratings is
+between 5.456 and 6.296.
 \end{quote}
 
 This statement may seem a little confusing to you. Another way to think
@@ -6826,18 +6837,16 @@ \section{Bootstrapping}\label{bootstrapping}
 
 \begin{Shaded}
 \begin{Highlighting}[]
-\NormalTok{movies %>%}\StringTok{ }\KeywordTok{summarize}\NormalTok{(}\DataTypeTok{mean_rating =} \KeywordTok{mean}\NormalTok{(rating)) %>%}\StringTok{ }
-\StringTok{  }\KeywordTok{kable}\NormalTok{()}
+\NormalTok{movies %>%}\StringTok{ }\KeywordTok{summarize}\NormalTok{(}\DataTypeTok{mean_rating =} \KeywordTok{mean}\NormalTok{(rating))}
 \end{Highlighting}
 \end{Shaded}
 
-\begin{tabular}{r}
-\hline
-mean\_rating\\
-\hline
-5.93285\\
-\hline
-\end{tabular}
+\begin{verbatim}
+## # A tibble: 1 × 1
+##   mean_rating
+##         <dbl>
+## 1     5.93285
+\end{verbatim}
 
 We see here that the population mean does fall in our range of plausible
 values generated from the bootstrapped samples.
@@ -6847,10 +6856,12 @@ \section{Bootstrapping}\label{bootstrapping}
 \[\bar{x} \pm (2 * SE),\] where \(\bar{x}\) is our original sample mean
 and \(SE\) stands for \textbf{standard error} and corresponds to the
 standard deviation of the bootstrap distribution. The value of 2 here
-corresponds to it being a 95\% confidence interval. This formula assumes
-that the bootstrap distribution is symmetric and bell-shaped. This is
-often the case with bootstrap distributions, especially those in which
-the original distribution of the sample is not highly skewed.
+corresponds to it being a 95\% confidence interval. (About 95\% of the values
+in a normal distribution fall within 2 standard deviations of the mean.)
+This formula assumes that the bootstrap distribution is symmetric and
+bell-shaped. This is often the case with bootstrap distributions,
+especially those in which the original distribution of the sample is not
+highly skewed.
 
 \begin{center}\rule{0.5\linewidth}{\linethickness}\end{center}
 
@@ -6860,7 +6871,7 @@ \section{Bootstrapping}\label{bootstrapping}
 distribution. The sampling distribution may be approximated by the
 bootstrap distribution or the null distribution depending on the
 context. Traditional theory-based methodologies for inference also have
-formulas for standard errors assuming some conditions are met.
+formulas for standard errors, assuming some conditions are met.
 
 \begin{center}\rule{0.5\linewidth}{\linethickness}\end{center}
 
@@ -6898,7 +6909,7 @@ \section{Bootstrapping}\label{bootstrapping}
 \textbf{(LC8.7)} Reproduce the bootstrapping above using a sample of
 size 5 instead of 25. What changes do you see?
 
-\textbf{(LC8.8)} How does the sample size affect the analysis?
+\textbf{(LC8.8)} How does the sample size affect the analysis above?
 
 \textbf{(LC8.9)} Why must bootstrap samples be the same size as the
 original sample?
@@ -6940,13 +6951,13 @@ \section{Relation to hypothesis
 
 Recall that we found a statistically significant difference in the
 sample mean of romance movie ratings compared to the sample mean of
-action movie ratings. We concluded Chapter \ref{hypo} by attempted to
+action movie ratings. We concluded Chapter \ref{hypo} by attempting to
 understand just how much greater we could expect the \emph{population}
-mean romance movie rating to be as compared to the \emph{population}
-mean action movie rating. In order to do so, we will calculate a
-confidence interval for the difference \(\mu_r - \mu_a\). We'll then go
-back to our population parameter values and see if our confidence
-interval contains our parameter value.
+mean romance movie rating to be compared to the \emph{population} mean
+action movie rating. In order to do so, we will calculate a confidence
+interval for the difference \(\mu_r - \mu_a\). We'll then go back to our
+population parameter values and see if our confidence interval contains
+our parameter value.
 
 We could use bootstrapping in a way similar to that done above, except
 now on a difference in sample means, to create a distribution and then
@@ -6969,11 +6980,11 @@ \section{Relation to hypothesis
 
 \begin{figure}
 
-{\centering \includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-120-1} 
+{\centering \includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-123-1} 
 
 }
 
-\caption[Simulated shuffled sample means histogram]{Simulated shuffled sample means histogram}\label{fig:unnamed-chunk-120}
+\caption[Simulated shuffled sample means histogram]{Simulated shuffled sample means histogram}\label{fig:unnamed-chunk-123}
 \end{figure}
 
 With this null distribution being quite symmetric and bell-shaped, the
@@ -6995,9 +7006,9 @@ \section{Relation to hypothesis
 ## 1 0.03182225
 \end{verbatim}
 
-Remembering that we can use the general formula of
-\(statistic \pm (2 * SE)\) we get the following result for plausible
-values of the difference in population means at the 95\% level.
+We can use the general formula of \(statistic \pm (2 * SE)\) for a
+confidence interval to obtain the following result for plausible values
+of the difference in population means at the 95\% level.
 
 \begin{Shaded}
 \begin{Highlighting}[]
@@ -7030,7 +7041,7 @@ \section{Relation to hypothesis
 the two population means between the two groups is 0. This means that
 the null hypothesis is plausible. The results of the hypothesis test and
 the confidence interval should match as they do here. We rejected the
-null hypothesis with hypothesis testing and we have evidence here than
+null hypothesis with hypothesis testing and we have evidence here that
 the mean rating for romance movies is higher than for action movies.
 
 \section{Effect size}\label{effect-size}
@@ -7080,9 +7091,9 @@ \section{Effect size}\label{effect-size}
 \textbf{(LC8.16)} The moment the phrase ``standard error'' is mentioned,
 there seems to be someone that says ``The standard error is \(s\)
 divided by the square root of \(n\).'' This standard error formula is
-correct and used in the theory-based procedure for an inference on one
-mean. But\ldots{} does it always work? For \texttt{samp1},
-\texttt{samp2}, and \texttt{samp3} below, do the following:
+used in the theory-based procedure for an inference on one mean.
+But\ldots{} does it always work? For \texttt{samp1}, \texttt{samp2}, and
+\texttt{samp3} below, do the following:
 
 \begin{enumerate}
 \def\labelenumi{\arabic{enumi}.}
@@ -7110,12 +7121,14 @@ \section{Effect size}\label{effect-size}
 
 \begin{center}\rule{0.5\linewidth}{\linethickness}\end{center}
 
-\section{Script of R code}\label{script-of-r-code-3}
+\section{Conclusion}\label{conclusion-4}
+
+\subsection{Script of R code}\label{script-of-r-code-4}
 
 An R script file of all R code used in this chapter is available
 \href{http://ismayc.github.io/moderndiver-book/08-ci.R}{here}.
 
-\section{What's to come?}\label{whats-to-come-5}
+\subsection{What's to come?}\label{whats-to-come-5}
 
 We will see in Chapter \ref{regress} many of the same ideas we have seen
 with hypothesis testing and confidence intervals in the last two
@@ -7123,8 +7136,7 @@ \section{What's to come?}\label{whats-to-come-5}
 incorrectly with statistics and data analysis, so you'll need to make
 sure you understand when it is appropriate and when it is not.
 
-\chapter{\texorpdfstring{Regression via
-\texttt{broom}}{Regression via broom}}\label{regression-via-broom}
+\chapter{Regression via broom}\label{regress}
 
 One of the most commonly used statistical procedures is
 \emph{regression}. Regression, in its simplest form, focuses on trying
@@ -7142,8 +7154,8 @@ \chapter{\texorpdfstring{Regression via
 see examples of the \texttt{tidy}, \texttt{glance}, and \texttt{augment}
 functions with linear regression.
 
-\section*{Needed packages}\label{needed-packages-5}
-\addcontentsline{toc}{section}{Needed packages}
+\subsection*{Needed packages}\label{needed-packages-6}
+\addcontentsline{toc}{subsection}{Needed packages}
 
 \begin{Shaded}
 \begin{Highlighting}[]
@@ -7176,8 +7188,7 @@ \section{EXAMPLE: Alaskan Airlines
 \StringTok{  }\KeywordTok{filter}\NormalTok{(!}\KeywordTok{is.na}\NormalTok{(dep_delay) &}\StringTok{ }\NormalTok{!}\KeywordTok{is.na}\NormalTok{(arr_delay)) %>%}\StringTok{ }
 \StringTok{  }\KeywordTok{resample}\NormalTok{(}\DataTypeTok{size =} \DecValTok{50}\NormalTok{, }\DataTypeTok{replace =} \OtherTok{FALSE}\NormalTok{)}
 
-\KeywordTok{ggplot}\NormalTok{(}\DataTypeTok{data =} \NormalTok{alaska_flights, }
-       \DataTypeTok{mapping =} \KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =} \NormalTok{dep_delay, }\DataTypeTok{y =} \NormalTok{arr_delay)) +}\StringTok{ }
+\KeywordTok{ggplot}\NormalTok{(}\DataTypeTok{data =} \NormalTok{alaska_flights, }\DataTypeTok{mapping =} \KeywordTok{aes}\NormalTok{(}\DataTypeTok{x =} \NormalTok{dep_delay, }\DataTypeTok{y =} \NormalTok{arr_delay)) +}\StringTok{ }
 \StringTok{   }\KeywordTok{geom_point}\NormalTok{()}
 \end{Highlighting}
 \end{Shaded}
@@ -7199,8 +7210,8 @@ \section{EXAMPLE: Alaskan Airlines
 
 \textbf{(LC9.1)} Does there appear to be a linear relationship with
 arrival delay and departure delay? In other words, could you fit a line
-to the data and have explain how \texttt{arr\_delay} increases as
-\texttt{dep\_delay} increases?
+to the data and have it explain well how \texttt{arr\_delay} increases
+as \texttt{dep\_delay} increases?
 
 \textbf{(LC9.2)} Is there only one possible line that fits the data
 ``well''? How could you decide on which one is best if there are
@@ -7278,13 +7289,13 @@ \section{Correlation}\label{correlation}
 
 \begin{verbatim}
 ## # A tibble: 1 × 1
-##      correl
-##       <dbl>
-## 1 0.7907993
+##   correl
+##    <dbl>
+## 1 0.7908
 \end{verbatim}
 
 The sample correlation coefficient is denoted by \(r\). In this case,
-\(r = 0.7907993\).
+\(r = 0.7908\).
 
 \begin{center}\rule{0.5\linewidth}{\linethickness}\end{center}
 
@@ -7331,7 +7342,7 @@ \subsection{Correlation does not imply
 Causation is a tricky problem and frequently takes carefully designed
 experiments. These experiments remove confounding variables and only
 focus on the behavior of one variable in the presence of the levels of
-the other variable.
+the other variable(s).
 
 Be careful as you read studies to make sure that the writers aren't
 falling into this fallacy of correlation implying causation. If you spot
@@ -7360,7 +7371,7 @@ \section{Linear regression}\label{linear-regression}
 based on the plot in Figure \ref{fig:regplot1}?
 
 It may be hard to pick a particular value here, especially after just
-going on confidence intervals in Chapter \ref{ci}. One way to do this
+going over confidence intervals in Chapter \ref{ci}. One way to do this
 would be to fit a line that fits the data best and then use the
 predicted \texttt{arr\_delay} value from that line for
 \texttt{dep\_delay\ =\ 25} as our prediction. But what is meant by
@@ -7378,7 +7389,7 @@ \section{Linear regression}\label{linear-regression}
 \caption[Regression line fit on delays]{Regression line fit on delays}\label{fig:with-reg}
 \end{figure}
 
-Here \texttt{lm} corresponds to ``linear model'' and we'll see it's use
+Here \texttt{lm} corresponds to ``linear model'' and we'll see its use
 again in a bit when we find the values that define this line.
 
 \subsection{Understanding linear regression
@@ -7387,19 +7398,19 @@ \subsection{Understanding linear regression
 Let's choose an arbitrary point on the graph and label it the color
 blue.
 
-\begin{center}\includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-127-1} \end{center}
+\begin{center}\includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-130-1} \end{center}
 
 Now consider this point's \emph{deviation} from the regression line.
 
-\begin{center}\includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-128-1} \end{center}
+\begin{center}\includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-131-1} \end{center}
 
 Do this for another point.
 
-\begin{center}\includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-129-1} \end{center}
+\begin{center}\includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-132-1} \end{center}
 
 And for another point.
 
-\begin{center}\includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-130-1} \end{center}
+\begin{center}\includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-133-1} \end{center}
 
 We could repeat this process for each of the points in our sample. The
 pattern that emerges here is that the regression line minimizes the sum
@@ -7430,17 +7441,17 @@ \subsection{The equation of the line}\label{the-equation-of-the-line}
 \hline
 term & estimate & std.error & statistic & p.value\\
 \hline
-(Intercept) & -14.155017 & 2.8094813 & -5.038302 & 0.0000071\\
+(Intercept) & -14.155 & 2.809 & -5.038 & 0\\
 \hline
-dep\_delay & 1.217666 & 0.1360336 & 8.951212 & 0.0000000\\
+dep\_delay & 1.218 & 0.136 & 8.951 & 0\\
 \hline
 \end{tabular}
 
 In general, the equation of the line of best fit for a sample is
-\[\hat{y} = b_0 + b_1 x\]. Thus, our equation is
-\(\hat{y} = -14.1550165 + 1.2176658 \, x\). It is usually preferred to
-actually write the names of the variables instead of \(x\) and \(y\):
-\[\widehat{arr\_delay} = -14.1550165 + 1.2176658 \, dep\_delay\].
+\[\hat{y} = b_0 + b_1 x.\] Thus, our equation is
+\(\hat{y} = -14.155 + 1.2177 \, x.\) It is usually preferred to actually
+write the names of the variables instead of \(x\) and \(y\):
+\[\widehat{arr\_delay} = -14.155 + 1.2177 \, dep\_delay.\]
 
 We can also extract the coefficients by using the \texttt{coef}
 function:
@@ -7453,10 +7464,10 @@ \subsection{The equation of the line}\label{the-equation-of-the-line}
 
 \begin{verbatim}
 ## (Intercept)   dep_delay 
-##  -14.155016    1.217666
+##     -14.155       1.218
 \end{verbatim}
 
-\subsection{Interpretting the slope}\label{interpretting-the-slope}
+\subsection{Interpreting the slope}\label{interpreting-the-slope}
 
 After you have determined your line of best fit, it is good practice to
 interpret the results to see if they make sense. Slope is defined as
@@ -7488,13 +7499,13 @@ \subsection{Predicting values}\label{predicting-values}
 \end{Shaded}
 
 \begin{verbatim}
-##   dep_delay  .fitted  .se.fit
-## 1        25 16.28663 3.967287
+##   dep_delay .fitted .se.fit
+## 1        25   16.29   3.967
 \end{verbatim}
 
 Note the use of the \texttt{data\_frame} function here, which must be
-used since \texttt{newdata} is expected a data frame as its argument. We
-must also specify that we are plugging in 25 for the value of
+used since \texttt{newdata} is expecting a data frame as its argument.
+We must also specify that we are plugging in 25 for the value of
 \texttt{dep\_delay} here. We can see that the line predicted an arrival
 delay of 16.29 minutes based on our 25 minute departure delay. This also
 does make some sense since flights that aren't delayed greatly from the
@@ -7508,23 +7519,23 @@ \subsection{Predicting values}\label{predicting-values}
 
 For example, say we have 3 groups of points:
 
-\begin{center}\includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-133-1} \end{center}
+\begin{center}\includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-136-1} \end{center}
 
 Their regression lines have different slopes, but \(r = 1\) for all 3.
 In other words, all three groups of points have a perfect (positive)
 linear relationship.
 
-\begin{center}\includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-134-1} \end{center}
+\begin{center}\includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-137-1} \end{center}
 
 \section{Inference for regression}\label{inference-for-regression}
 
 The population least squares line is defined by the formula
-\(y = \beta_0 + \beta_1 x + \epsilon\). Here \(\epsilon\) corresponds to
-the error term. It corresponds to the part of the response variable
-\(y\) that remains unexplained after considering the predictor variable
-\(x\). Often it is standard practice to assume that this error term
-follows a normal distribution. We will focus on checking whether that
-assumption is valid in Section \ref{resid}.
+\(y = \beta_0 + \beta_1 x + \epsilon\). Here \(\epsilon\) represents the
+error term. It corresponds to the part of the response variable \(y\)
+that remains unexplained after considering the predictor variable \(x\).
+Often it is standard practice to assume that this error term follows a
+normal distribution. We will focus on checking whether that assumption
+is valid in Section \ref{resid}.
 
 In the population least squares line
 \(y = \beta_0 + \beta_1 x + \epsilon\), we can see that if
@@ -7534,14 +7545,14 @@ \section{Inference for regression}\label{inference-for-regression}
 conducted to check whether a relationship exists between two numerical
 variables \(x\) and \(y\).
 
-We can also use the concept of shuffling to determine standard error and
-conduct a hypothesis test for a population slope. Let's go back to our
-example on Alaskan flights that represent a sample of all Alaskan
-flights departing NYC in 2013. Let's test to see if we have evidence
-that a \emph{positive} relationship exists between the departure delay
-and arrival delay for Alaskan flights. We will set up this hypothesis
-testing process as we have each before via the ``There is Only One
-Test'' diagram in Figure \ref{fig:htdowney}.
+We can also use the concept of shuffling to determine the standard error
+of our null distribution and conduct a hypothesis test for a population
+slope. Let's go back to our example on Alaskan flights that represent a
+sample of all Alaskan flights departing NYC in 2013. Let's test to see
+if we have evidence that a \emph{positive} relationship exists between
+the departure delay and arrival delay for Alaskan flights. We will set
+up this hypothesis testing process as we have each time before via the
+``There is Only One Test'' diagram in Figure \ref{fig:htdowney}.
 
 \subsection{Data}\label{data-2}
 
@@ -7564,31 +7575,28 @@ \subsection{\texorpdfstring{Observed effect
 \end{Shaded}
 
 \begin{verbatim}
-## [1] 1.217666
+## [1] 1.218
 \end{verbatim}
 
-The calculated slope value from our observed sample is
-\(b_1 = 1.2176658\).
+The calculated slope value from our observed sample is \(b_1 = 1.2177\).
 
 \subsection{\texorpdfstring{Model of
 \(H_0\)}{Model of H\_0}}\label{model-of-h_0-2}
 
 We are looking to see if a positive relationship exists so
-\(H_A: \beta_1 > 0\). Our null hypothesis is always in terms of equality
-so we have \(\beta_1 = 0\).
+\(H_a: \beta_1 > 0\). Our null hypothesis is always in terms of equality
+so we have \(H_0: \beta_1 = 0\).
 
 \subsection{Simulated Data}\label{simulated-data-2}
 
 Now to simulate the null hypothesis being true and recreate how our
 sample was created, we need to think about what it means for \(\beta_1\)
-to be zero.
-
-If \(\beta_1 = 0\), we said above that there is no relationship between
-the departure delay and arrival delay. If there is no relationship, then
-any one of the arrival delay values could have just as likely occurred
-with any of the other departure delay values instead of the one that it
-actually did fall with. We, therefore, have another example of shuffling
-in our simulating data.
+to be zero. If \(\beta_1 = 0\), we said above that there is no
+relationship between the departure delay and arrival delay. If there is
+no relationship, then any one of the arrival delay values could have
+just as likely occurred with any of the other departure delay values
+instead of the one that it actually did fall with. We, therefore, have
+another example of shuffling in our simulating of data.
 
 \textbf{Tactile simulation}
 
@@ -7637,15 +7645,15 @@ \subsection{\texorpdfstring{Distribution of \(\delta\) under
 \end{Highlighting}
 \end{Shaded}
 
-\begin{center}\includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-136-1} \end{center}
+\begin{center}\includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-139-1} \end{center}
 
 \subsection{The p-value}\label{the-p-value-2}
 
 Recall that we want to see where our observed sample slope
-\(\delta^* = 1.2176658\) falls on this distribution and then count all
-of the values to the right of it corresponding to \(H_A: \beta_0 > 0\).
-To get a sense for where our values falls, we can shade all values at
-least as big as \(\delta^*\).
+\(\delta^* = 1.2177\) falls on this distribution and then count all of
+the values to the right of it corresponding to \(H_a: \beta_1 > 0\). To
+get a sense for where our value falls, we can shade all values at least
+as big as \(\delta^*\).
 
 \begin{Shaded}
 \begin{Highlighting}[]
@@ -7656,14 +7664,14 @@ \subsection{The p-value}\label{the-p-value-2}
 
 \begin{figure}
 
-{\centering \includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-137-1} 
+{\centering \includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-140-1} 
 
 }
 
-\caption[Shaded histogram to show p-value]{Shaded histogram to show p-value}\label{fig:unnamed-chunk-137}
+\caption[Shaded histogram to show p-value]{Shaded histogram to show p-value}\label{fig:unnamed-chunk-140}
 \end{figure}
 
-Since 1.2176658 falls far to the right of this plot, we can say that we
+Since 1.2177 falls far to the right of this plot, we can say that we
 have a \(p\)-value of 0. We, thus, have evidence to reject the null
 hypothesis in support of there being a positive association between the
 departure delay and arrival delay of all Alaskan flights from NYC in
@@ -7678,7 +7686,7 @@ \subsection{The p-value}\label{the-p-value-2}
 \textbf{(LC9.7)} Repeat the inference above but this time for the
 correlation coefficient instead of the slope.
 
-\textbf{(LC9.8)} Use bootstrapping with points to determine a range of
+\textbf{(LC9.8)} Use bootstrapping (of points) to determine a range of
 possible values for the population slope comparing departure delays to
 arrival delays for Alaskan flights in 2013 from NYC.
 
@@ -7689,12 +7697,13 @@ \section{Residual analysis}\label{resid}
 The following diagram will help you to keep track of what is meant by a
 residual.
 
-\begin{center}\includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-138-1} \end{center}
+\begin{center}\includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-141-1} \end{center}
 
 Here, \(y_i\) is an observed value of the \texttt{arr\_delay} variable.
-\(i\) ranges from 1 to 50. \(\hat{y}_i\) is the fitted value--the
+\(i\) ranges from 1 to 50. For this example, \(y_i\) is the vertical
+component of the blue dot. \(\hat{y}_i\) is the fitted value--the
 \texttt{arr\_delay} value that is being pointed to on the red line. The
-residual is \[\hat{\epsilon}_i = y_i - \hat{y}_i\]. \textbf{Note the
+residual is \[\hat{\epsilon}_i = y_i - \hat{y}_i.\] \textbf{Note the
 order here!} You start at the non-pointy end of the arrow (\(y_i\)) and
 then subtract away what comes at the point (\(\hat{y}_i\)).
 
@@ -7774,16 +7783,22 @@ \section{Conditions for regression}\label{conditions-for-regression}
 
 We have reason to doubt whether a linear regression is valid here.
 Unfortunately, all too frequently regressions are run without checking
-these assumptions carefully. While small deviations in the assumptions
+these assumptions carefully. While small deviations from the assumptions
 can be OK, larger violations can completely invalidate the results and
 make any inferences improbable and questionable.
 
-\section{Script of R code}\label{script-of-r-code-4}
+\section{Conclusion}\label{conclusion-5}
+
+\subsection{Script of R code}\label{script-of-r-code-5}
 
 An R script file of all R code used in this chapter is available
 \href{http://ismayc.github.io/moderndiver-book/09-regress.R}{here}.
 
-\section{What's to come?}\label{whats-to-come-6}
+\subsection{What's to come?}\label{whats-to-come-6}
+
+In the last chapter of the textbook, we'll summarize the purpose of this
+book as well as present an excellent example of what goes into making an
+effective story via data.
 
 \part{Conclusion}\label{part-conclusion}
 
@@ -7797,7 +7812,7 @@ \chapter{Effective Data Storytelling}\label{effective-data-storytelling}
 Further, you've seen the value of inference as a process to come to
 conclusions about a population by using a random sample. Lastly, you've
 explored how to use linear regression and the importance of checking the
-conditions required to make it a valid procedure. Throughout, you've
+conditions required to make it a valid procedure. All throughout, you've
 learned many computational techniques and focused on reproducible
 research in writing R code and keeping track of your work in R Markdown.
 All of these steps go into making a great story using data.
@@ -7811,9 +7826,13 @@ \chapter{Effective Data Storytelling}\label{effective-data-storytelling}
 read over it, think carefully about how Walt is using his data, his
 graphics, and his analyses to paint the picture for the reader of what
 the story is he wants to tell. In the spirit of reproducibility, the
-members of 538 have also shared the data that they used to create this
-story and some R code
+members of FiveThirtyEight have also shared the data that they used to
+create this story and some R code
 \href{https://github.com/fivethirtyeight/data/tree/master/bechdel}{here}.
+A vignette showing how to reproduce one of the plots at the end of the
+article using \texttt{dplyr}, \texttt{ggplot2}, and other packages in
+Hadley Wickham's \texttt{tidyverse} is available
+\href{https://cran.r-project.org/web/packages/fivethirtyeight/vignettes/bechdel.html}{here}.
 Great data stories don't mislead the reader, but rather engulf them in
 understanding the importance that data plays in our lives through the
 captivation of storytelling.
@@ -7824,14 +7843,16 @@ \section*{Concluding Remarks}\label{concluding-remarks}
 If you've come to this point in the book, I'd suspect that you know a
 thing or two about how to work with data in R. You've also gained a lot
 of knowledge about how to use simulation techniques to determine
-statistical significance. The hope is that you've come to appreciate
-data manipulation, tidy data sets, and the power of statistical
-visualization. Actually, the data visualization part may be the most
-important thing here. If you can create truly beautiful graphics that
-display information in ways that the reader can clearly decipher, you've
-picked up a great skill. Let's hope that that skill keeps you creating
-great stories with data into the near and far distant future. Thanks for
-coming along for the ride as we dove into modern data analysis using R!
+statistical significance and how these techniques build an intuition
+about traditional inferential methods like the \(t\)-test. The hope is
+that you've come to appreciate data manipulation, tidy data sets, and
+the power of data visualization. Actually, the data visualization part
+may be the most important thing here. If you can create truly beautiful
+graphics that display information in ways that the reader can clearly
+decipher, you've picked up a great skill. Let's hope that that skill
+keeps you creating great stories with data into the near and far distant
+future. Thanks for coming along for the ride as we dove into modern data
+analysis using R!
 
 \appendix
 
@@ -7913,7 +7934,8 @@ \chapter{Inference Examples}\label{appendixB}
 Traditional theory-based methods as well as computational-based methods
 are presented.
 
-\section{Needed packages}\label{needed-packages-6}
+\section*{Needed packages}\label{needed-packages-7}
+\addcontentsline{toc}{section}{Needed packages}
 
 \begin{Shaded}
 \begin{Highlighting}[]
@@ -7929,6 +7951,21 @@ \section{Needed packages}\label{needed-packages-6}
 
 \begin{center}\rule{0.5\linewidth}{\linethickness}\end{center}
 
+\section{Inference Mind Map}\label{inference-mind-map}
+
+To help you better navigate and choose the appropriate analysis, we've
+created a mind map on \url{http://coggle.it} available
+\href{https://coggle.it/diagram/Vxlydu1akQFeqo6-}{here} and below.
+
+\begin{figure}
+
+{\centering \includegraphics[width=\linewidth]{images/coggle} 
+
+}
+
+\caption[Mind map for Inference]{Mind map for Inference}\label{fig:infer-map}
+\end{figure}
+
 \section{One Mean}\label{one-mean}
 
 \subsection{Problem Statement}\label{problem-statement}
@@ -7985,13 +8022,6 @@ \subsection{Exploring the sample data}\label{exploring-the-sample-data}
 \end{Highlighting}
 \end{Shaded}
 
-\begin{verbatim}
-## Parsed with column specification:
-## cols(
-##   age = col_integer()
-## )
-\end{verbatim}
-
 \begin{Shaded}
 \begin{Highlighting}[]
 \NormalTok{age_summ <-}\StringTok{ }\NormalTok{ageAtMar %>%}
@@ -8011,7 +8041,7 @@ \subsection{Exploring the sample data}\label{exploring-the-sample-data}
 \hline
 sample\_size & mean & sd & minimum & lower\_quartile & median & upper\_quartile & max\\
 \hline
-5534 & 23.44019 & 4.721365 & 10 & 20 & 23 & 26 & 43\\
+5534 & 23.44 & 4.721 & 10 & 20 & 23 & 26 & 43\\
 \hline
 \end{tabular}
 
@@ -8029,7 +8059,7 @@ \subsection{Exploring the sample data}\label{exploring-the-sample-data}
 \subsubsection{Guess about statistical
 significance}\label{guess-about-statistical-significance}
 
-We are looking to see if the observed sample mean of 23.4401879 is
+We are looking to see if the observed sample mean of 23.4402 is
 statistically greater than \(\mu_0 = 23\). They seem to be quite close,
 but we have a large sample size here. Let's guess that the large sample
 size will lead us to reject this practically small difference.
@@ -8041,7 +8071,7 @@ \subsection{Non-traditional methods}\label{non-traditional-methods}
 \subsubsection{Bootstrapping for Hypothesis
 Test}\label{bootstrapping-for-hypothesis-test}
 
-In order to look to see if the observed sample mean of 23.4401879 is
+In order to look to see if the observed sample mean of 23.4402 is
 statistically greater than \(\mu_0 = 23\), we need to account for the
 sample size. We also need to determine a process that replicates how the
 original sample of size 5534 was selected.
@@ -8065,8 +8095,8 @@ \subsubsection{Bootstrapping for Hypothesis
   \texttt{boot\_distn} object, and
 \item
   shift the center of this distribution over to the null value of 23.
-  (This is needed since it will be centered at 23.4401879 via the
-  process of bootstrapping.)
+  (This is needed since it will be centered at 23.4402 via the process
+  of bootstrapping.)
 \end{enumerate}
 
 \begin{Shaded}
@@ -8087,7 +8117,7 @@ \subsubsection{Bootstrapping for Hypothesis
 
 We can next use this distribution to observe our \(p\)-value. Recall
 this is a right-tailed test so we will be looking for values that are
-greater than or equal to 23.4401879 for our \(p\)-value.
+greater than or equal to 23.4402 for our \(p\)-value.
 
 \begin{Shaded}
 \begin{Highlighting}[]
@@ -8098,7 +8128,7 @@ \subsubsection{Bootstrapping for Hypothesis
 \end{Highlighting}
 \end{Shaded}
 
-\begin{center}\includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-142-1} \end{center}
+\begin{center}\includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-146-1} \end{center}
 
 \paragraph{\texorpdfstring{Calculate
 \(p\)-value}{Calculate p-value}}\label{calculate-p-value}
@@ -8127,7 +8157,7 @@ \subsubsection{Bootstrapping for Confidence
 parameter \(\mu\) using our sample data using \emph{bootstrapping}. Note
 that we don't need to shift this distribution since we want the center
 of our confidence interval to be our point estimate
-\(\bar{x}_{obs} = 23.4401879\).
+\(\bar{x}_{obs} = 23.4402\).
 
 \begin{Shaded}
 \begin{Highlighting}[]
@@ -8144,7 +8174,7 @@ \subsubsection{Bootstrapping for Confidence
 \end{Highlighting}
 \end{Shaded}
 
-\begin{center}\includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-144-1} \end{center}
+\begin{center}\includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-148-1} \end{center}
 
 \begin{Shaded}
 \begin{Highlighting}[]
@@ -8154,8 +8184,8 @@ \subsubsection{Bootstrapping for Confidence
 \end{Shaded}
 
 \begin{verbatim}
-##      lower    upper
-## 1 23.31821 23.56361
+##   lower upper
+## 1 23.32 23.56
 \end{verbatim}
 
 We see that 23 is not contained in this confidence interval as a
@@ -8200,6 +8230,8 @@ \subsubsection{Check conditions}\label{check-conditions}
 \end{Highlighting}
 \end{Shaded}
 
+\begin{center}\includegraphics[width=\textwidth]{ismaykim_files/figure-latex/qqplotmean-1} \end{center}
+
 The sample size here is quite large though (\(n = 5534\)) so both
 conditions are met.
 
@@ -8210,12 +8242,12 @@ \subsubsection{Test statistic}\label{test-statistic}
 guess is the sample mean \(\bar{X}\). Recall that this sample mean is
 actually a random variable that will vary as different samples are
 (theoretically, would be) collected. We are looking to see how likely is
-it for us to have observed a sample mean of
-\(\bar{x}_{obs} = 23.4401879\) or larger assuming that the population
-mean is 23 (assuming the null hypothesis is true). If the conditions are
-met and assuming \(H_0\) is true, we can ``standardize'' this original
-test statistic of \(\bar{X}\) into a \(T\) statistic that follows a
-\(t\) distribution with degrees of freedom equal to \(df = n - 1\):
+it for us to have observed a sample mean of \(\bar{x}_{obs} = 23.4402\)
+or larger assuming that the population mean is 23 (assuming the null
+hypothesis is true). If the conditions are met and assuming \(H_0\) is
+true, we can ``standardize'' this original test statistic of \(\bar{X}\)
+into a \(T\) statistic that follows a \(t\) distribution with degrees of
+freedom equal to \(df = n - 1\):
 
 \[ T =\dfrac{ \bar{X} - \mu_0}{ S / \sqrt{n} } \sim t (df = n - 1) \]
 
@@ -8242,13 +8274,13 @@ \subsubsection{Test statistic}\label{test-statistic}
 ##  One Sample t-test
 ## 
 ## data:  ageAtMar$age
-## t = 6.9357, df = 5533, p-value = 0.000000000002252
+## t = 6.9, df = 5500, p-value = 0.000000000002
 ## alternative hypothesis: true mean is greater than 23
 ## 95 percent confidence interval:
-##  23.33578      Inf
+##  23.34   Inf
 ## sample estimates:
 ## mean of x 
-##  23.44019
+##     23.44
 \end{verbatim}
 
 We see here that the \(t_{obs}\) value is around 6.94. Recall that for
@@ -8270,7 +8302,7 @@ \subsubsection{\texorpdfstring{Compute
 \end{Shaded}
 
 \begin{verbatim}
-## [1] 0.000000000002247382
+## [1] 0.000000000002247
 \end{verbatim}
 
 We can also use the \(N(0, 1)\) distribution here:
@@ -8282,7 +8314,7 @@ \subsubsection{\texorpdfstring{Compute
 \end{Shaded}
 
 \begin{verbatim}
-## [1] 0.000000000002016788
+## [1] 0.000000000002017
 \end{verbatim}
 
 \subsubsection{State conclusion}\label{state-conclusion}
@@ -8310,7 +8342,7 @@ \subsubsection{Confidence interval}\label{confidence-interval}
 \end{Shaded}
 
 \begin{verbatim}
-## [1] 23.31577 23.56461
+## [1] 23.32 23.56
 ## attr(,"conf.level")
 ## [1] 0.95
 \end{verbatim}
@@ -8452,7 +8484,7 @@ \subsubsection{Simulation for Hypothesis
 \end{Highlighting}
 \end{Shaded}
 
-\begin{center}\includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-146-1} \end{center}
+\begin{center}\includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-150-1} \end{center}
 
 \paragraph{\texorpdfstring{Calculate
 \(p\)-value}{Calculate p-value}}\label{calculate-p-value-1}
@@ -8523,7 +8555,7 @@ \subsubsection{Bootstrapping for Confidence
 \end{Highlighting}
 \end{Shaded}
 
-\begin{center}\includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-148-1} \end{center}
+\begin{center}\includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-152-1} \end{center}
 
 \begin{Shaded}
 \begin{Highlighting}[]
@@ -8625,7 +8657,7 @@ \subsubsection{\texorpdfstring{Compute
 \end{Shaded}
 
 \begin{verbatim}
-## [1] 0.08011831
+## [1] 0.08012
 \end{verbatim}
 
 The \(p\)-value---the probability of observing a \(z_{obs}\) value of
@@ -8650,10 +8682,10 @@ \subsubsection{\texorpdfstring{Compute
 ##  1-sample proportions test without continuity correction
 ## 
 ## data:  table(elec$satisfy), null probability 0.8
-## X-squared = 3.0625, df = 1, p-value = 0.08012
+## X-squared = 3.1, df = 1, p-value = 0.08
 ## alternative hypothesis: true p is not equal to 0.8
 ## 95 percent confidence interval:
-##  0.6356788 0.8073042
+##  0.6357 0.8073
 ## sample estimates:
 ##    p 
 ## 0.73
@@ -8788,8 +8820,8 @@ \subsection{Exploring the sample
 ## # A tibble: 2 × 3
 ##   college_grad prop_no_opinion sample_size
 ##          <chr>           <dbl>       <int>
-## 1           no       0.3367609         389
-## 2          yes       0.2374429         438
+## 1           no          0.3368         389
+## 2          yes          0.2374         438
 \end{verbatim}
 
 \begin{Shaded}
@@ -8834,9 +8866,9 @@ \subsubsection{Randomization for Hypothesis
 Test}\label{randomization-for-hypothesis-test}
 
 In order to look to see if the observed sample proportion of no opinion
-for college graduates of 0.3367609 is statistically different than that
-for graduates of 0.2374429, we need to account for the sample sizes.
-Note that this is the same as looking to see if
+for non-college graduates of 0.3368 is statistically different than that
+for college graduates of 0.2374, we need to account for the sample sizes.
+Note that this is the same as looking to see if
 \(\hat{p}_{grad} - \hat{p}_{nograd}\) is statistically different than 0.
 We also need to determine a process that replicates how the original
 group sizes of 389 and 438 were selected.
@@ -8868,8 +8900,8 @@ \subsubsection{Randomization for Hypothesis
 
 We can next use this distribution to observe our \(p\)-value. Recall
 this is a two-tailed test so we will be looking for values that are
-greater than or equal to -0.099318 or less than or equal to 0.099318 for
-our \(p\)-value.
+greater than or equal to 0.0993 or less than or equal to -0.0993 for our
+\(p\)-value.
 
 \begin{Shaded}
 \begin{Highlighting}[]
@@ -8880,7 +8912,7 @@ \subsubsection{Randomization for Hypothesis
 \end{Highlighting}
 \end{Shaded}
 
-\begin{center}\includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-151-1} \end{center}
+\begin{center}\includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-155-1} \end{center}
 
 \paragraph{\texorpdfstring{Calculate
 \(p\)-value}{Calculate p-value}}\label{calculate-p-value-2}
@@ -8941,7 +8973,7 @@ \subsubsection{Bootstrapping for Confidence
 \end{Highlighting}
 \end{Shaded}
 
-\begin{center}\includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-154-1} \end{center}
+\begin{center}\includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-158-1} \end{center}
 
 \begin{Shaded}
 \begin{Highlighting}[]
@@ -8952,9 +8984,9 @@ \subsubsection{Bootstrapping for Confidence
 
 \begin{verbatim}
 ## # A tibble: 1 × 2
-##        lower       upper
-##        <dbl>       <dbl>
-## 1 -0.1595767 -0.03791979
+##     lower    upper
+##     <dbl>    <dbl>
+## 1 -0.1596 -0.03792
 \end{verbatim}
 
 We see that 0 is not contained in this confidence interval as a
@@ -9018,7 +9050,7 @@ \subsection{Test statistic}\label{test-statistic-2}
 The test statistic is a random variable based on the sample data. Here,
 we are interested in seeing if our observed difference in sample
 proportions corresponding to no opinion on drilling
-(\(\hat{p}_{college, obs} - \hat{p}_{no\_college, obs}\) = 0.0326481) is
+(\(\hat{p}_{college, obs} - \hat{p}_{no\_college, obs}\) = -0.0993) is
 statistically different than 0. Assuming that conditions are met and the
 null hypothesis is true, we can use the standard normal distribution to
 standardize the difference in sample proportions
@@ -9052,13 +9084,13 @@ \subsubsection{Observed test statistic}\label{observed-test-statistic-2}
 ##  correction
 ## 
 ## data:  table(offshore$college_grad, offshore$response)
-## X-squared = 9.9907, df = 1, p-value = 0.001573
+## X-squared = 10, df = 1, p-value = 0.002
 ## alternative hypothesis: two.sided
 ## 95 percent confidence interval:
-##  0.03772522 0.16091078
+##  0.03773 0.16091
 ## sample estimates:
-##    prop 1    prop 2 
-## 0.3367609 0.2374429
+## prop 1 prop 2 
+## 0.3368 0.2374
 \end{verbatim}
 
 \texttt{prop.test} does a \(\chi^2\) test here but this matches up
@@ -9079,7 +9111,7 @@ \subsubsection{Observed test statistic}\label{observed-test-statistic-2}
 \end{Shaded}
 
 \begin{verbatim}
-## [1] 0.001577691
+## [1] 0.001578
 \end{verbatim}
 
 The 95\% confidence interval is also stated above in the
@@ -9188,9 +9220,9 @@ \subsection{Exploring the sample
 \hline
 metro\_area & sample\_size & mean & sd & minimum & lower\_quartile & median & upper\_quartile & max\\
 \hline
-Cleveland\_ OH & 212 & 27467.07 & 27680.68 & 0 & 8475 & 21000 & 35275 & 152400\\
+Cleveland\_ OH & 212 & 27467 & 27681 & 0 & 8475 & 21000 & 35275 & 152400\\
 \hline
-Sacramento\_ CA & 175 & 32427.54 & 35773.63 & 0 & 8050 & 20000 & 49350 & 206900\\
+Sacramento\_ CA & 175 & 32428 & 35774 & 0 & 8050 & 20000 & 49350 & 206900\\
 \hline
 \end{tabular}
 
@@ -9237,9 +9269,9 @@ \subsubsection{Randomization for Hypothesis
 Test}\label{randomization-for-hypothesis-test-1}
 
 In order to look to see if the observed sample mean for Sacramento of
-27467.0660377 is statistically different than that for Cleveland of
-32427.5428571, we need to account for the sample sizes. Note that this
-is the same as looking to see if \(\bar{x}_{sac} - \bar{x}_{cle}\) is
+32427.5429 is statistically different than that for Cleveland of
+27467.066, we need to account for the sample sizes. Note that this is
+the same as looking to see if \(\bar{x}_{sac} - \bar{x}_{cle}\) is
 statistically different than 0. We also need to determine a process that
 replicates how the original group sizes of 212 and 175 were selected.
 
@@ -9270,8 +9302,8 @@ \subsubsection{Randomization for Hypothesis
 
 We can next use this distribution to observe our \(p\)-value. Recall
 this is a two-tailed test so we will be looking for values that are
-greater than or equal to 4960.4768194 or less than or equal to
--4960.4768194 for our \(p\)-value.
+greater than or equal to 4960.4768 or less than or equal to -4960.4768
+for our \(p\)-value.
 
 \begin{Shaded}
 \begin{Highlighting}[]
@@ -9282,7 +9314,7 @@ \subsubsection{Randomization for Hypothesis
 \end{Highlighting}
 \end{Shaded}
 
-\begin{center}\includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-156-1} \end{center}
+\begin{center}\includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-160-1} \end{center}
 
 \paragraph{\texorpdfstring{Calculate
 \(p\)-value}{Calculate p-value}}\label{calculate-p-value-3}
@@ -9343,7 +9375,7 @@ \subsubsection{Bootstrapping for Confidence
 \end{Highlighting}
 \end{Shaded}
 
-\begin{center}\includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-159-1} \end{center}
+\begin{center}\includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-163-1} \end{center}
 
 \begin{Shaded}
 \begin{Highlighting}[]
@@ -9354,9 +9386,9 @@ \subsubsection{Bootstrapping for Confidence
 
 \begin{verbatim}
 ## # A tibble: 1 × 2
-##      lower    upper
-##      <dbl>    <dbl>
-## 1 -1512.59 11458.85
+##   lower upper
+##   <dbl> <dbl>
+## 1 -1513 11459
 \end{verbatim}
 
 We see that 0 is contained in this confidence interval as a plausible
@@ -9423,7 +9455,7 @@ \subsection{Test statistic}\label{test-statistic-3}
 
 The test statistic is a random variable based on the sample data. Here,
 we are interested in seeing if our observed difference in sample means
-(\(\bar{x}_{sac, obs} - \bar{x}_{cle, obs}\) = 4960.4768194) is
+(\(\bar{x}_{sac, obs} - \bar{x}_{cle, obs}\) = 4960.4768) is
 statistically different than 0. Assuming that conditions are met and the
 null hypothesis is true, we can use the \(t\) distribution to
 standardize the difference in sample means
@@ -9458,13 +9490,13 @@ \subsubsection{Observed test statistic}\label{observed-test-statistic-3}
 ##  Welch Two Sample t-test
 ## 
 ## data:  sacramento$income and cleveland$income
-## t = 1.5006, df = 323.36, p-value = 0.1344
+## t = 1.5, df = 320, p-value = 0.1
 ## alternative hypothesis: true difference in means is not equal to 0
 ## 95 percent confidence interval:
-##  -1542.758 11463.712
+##  -1543 11464
 ## sample estimates:
 ## mean of x mean of y 
-##  32427.54  27467.07
+##     32428     27467
 \end{verbatim}
 
 Note that the degrees of freedom reported above are different than what
@@ -9497,7 +9529,7 @@ \subsection{\texorpdfstring{Compute
 \end{Shaded}
 
 \begin{verbatim}
-## [1] 0.135168
+## [1] 0.1352
 \end{verbatim}
 
 We can also approximate by using the standard normal curve:
@@ -9509,7 +9541,7 @@ \subsection{\texorpdfstring{Compute
 \end{Shaded}
 
 \begin{verbatim}
-## [1] 0.1333556
+## [1] 0.1334
 \end{verbatim}
 
 Note that the 95 percent confidence interval given above matches well
@@ -9625,7 +9657,7 @@ \subsection{Exploring the sample
 \hline
 sample\_size & mean & sd & minimum & lower\_quartile & median & upper\_quartile & max\\
 \hline
-10 & -0.0804 & 0.0522732 & -0.177 & -0.11 & -0.084 & -0.0355 & -0.015\\
+10 & -0.0804 & 0.0523 & -0.177 & -0.11 & -0.084 & -0.0355 & -0.015\\
 \hline
 \end{tabular}
 
@@ -9712,7 +9744,7 @@ \subsubsection{Randomization for Hypothesis
 \end{Highlighting}
 \end{Shaded}
 
-\begin{center}\includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-162-1} \end{center}
+\begin{center}\includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-166-1} \end{center}
 
 \paragraph{\texorpdfstring{Calculate
 \(p\)-value}{Calculate p-value}}\label{calculate-p-value-4}
@@ -9758,7 +9790,7 @@ \subsubsection{Bootstrapping for Confidence
 \end{Highlighting}
 \end{Shaded}
 
-\begin{center}\includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-164-1} \end{center}
+\begin{center}\includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-168-1} \end{center}
 
 \begin{Shaded}
 \begin{Highlighting}[]
@@ -9768,8 +9800,8 @@ \subsubsection{Bootstrapping for Confidence
 \end{Shaded}
 
 \begin{verbatim}
-##     lower      upper
-## 1 -0.1114 -0.0504975
+##     lower   upper
+## 1 -0.1114 -0.0505
 \end{verbatim}
 
 We see that 0 is not contained in this confidence interval as a
@@ -9863,10 +9895,10 @@ \subsubsection{Test statistic}\label{test-statistic-4}
 ##  One Sample t-test
 ## 
 ## data:  zinc_diff$pair_diff
-## t = -4.8638, df = 9, p-value = 0.0004456
+## t = -4.9, df = 9, p-value = 0.0004
 ## alternative hypothesis: true mean is less than 0
 ## 95 percent confidence interval:
-##        -Inf -0.0500982
+##     -Inf -0.0501
 ## sample estimates:
 ## mean of x 
 ##   -0.0804
@@ -9888,7 +9920,7 @@ \subsubsection{\texorpdfstring{Compute
 \end{Shaded}
 
 \begin{verbatim}
-## [1] 0.000369484
+## [1] 0.0003695
 \end{verbatim}
 
 \subsubsection{State conclusion}\label{state-conclusion-4}
@@ -9915,15 +9947,25 @@ \subsection{Comparing results}\label{comparing-results-4}
 
 \chapter{Reach for the Stars}\label{appendixC}
 
+\section*{Needed packages}\label{needed-packages-8}
+\addcontentsline{toc}{section}{Needed packages}
+
+\begin{Shaded}
+\begin{Highlighting}[]
+\KeywordTok{library}\NormalTok{(dplyr)}
+\KeywordTok{library}\NormalTok{(ggplot2)}
+\KeywordTok{library}\NormalTok{(knitr)}
+\KeywordTok{library}\NormalTok{(dygraphs)}
+\KeywordTok{library}\NormalTok{(nycflights13)}
+\end{Highlighting}
+\end{Shaded}
+
 \section{Sorted barplots}\label{sorted-barplots}
 
 Building upon the example in Section \ref{barplots}:
 
 \begin{Shaded}
 \begin{Highlighting}[]
-\KeywordTok{library}\NormalTok{(nycflights13)}
-\KeywordTok{library}\NormalTok{(ggplot2)}
-\KeywordTok{library}\NormalTok{(dplyr)}
 \NormalTok{flights_table <-}\StringTok{ }\KeywordTok{table}\NormalTok{(flights$carrier)}
 \NormalTok{flights_table}
 \end{Highlighting}
@@ -9972,11 +10014,11 @@ \section{Sorted barplots}\label{sorted-barplots}
 
 \begin{figure}
 
-{\centering \includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-170-1} 
+{\centering \includegraphics[width=\textwidth]{ismaykim_files/figure-latex/unnamed-chunk-175-1} 
 
 }
 
-\caption[Number of flights departing NYC in 2013 by airline - Descending numbers]{Number of flights departing NYC in 2013 by airline - Descending numbers}\label{fig:unnamed-chunk-170}
+\caption[Number of flights departing NYC in 2013 by airline - Descending numbers]{Number of flights departing NYC in 2013 by airline - Descending numbers}\label{fig:unnamed-chunk-175}
 \end{figure}
 
 The last addition here specifies the values of the horizontal \texttt{x}
@@ -10006,7 +10048,7 @@ \subsection{Interactive line-graphs}\label{interactive-line-graphs}
 \end{Highlighting}
 \end{Shaded}
 
-\begin{center}\includegraphics[width=1\linewidth]{ismaykim_files/figure-latex/unnamed-chunk-172-1} \end{center}
+\begin{center}\includegraphics[width=1\linewidth]{ismaykim_files/figure-latex/unnamed-chunk-177-1} \end{center}
 
 The syntax here is a little different than what we have covered so far.
 The \texttt{dygraph} function expects the dates to be given as
diff --git a/docs/ismaykim_files/figure-html/facethistogram-1.png b/docs/ismaykim_files/figure-html/facethistogram-1.png
index a0f93ed0d..6bab6dc81 100644
Binary files a/docs/ismaykim_files/figure-html/facethistogram-1.png and b/docs/ismaykim_files/figure-html/facethistogram-1.png differ
diff --git a/docs/ismaykim_files/figure-html/jitter-1.png b/docs/ismaykim_files/figure-html/jitter-1.png
index d37e3161d..5d083267a 100644
Binary files a/docs/ismaykim_files/figure-html/jitter-1.png and b/docs/ismaykim_files/figure-html/jitter-1.png differ
diff --git a/docs/ismaykim_files/figure-html/qqplotmean-1.png b/docs/ismaykim_files/figure-html/qqplotmean-1.png
new file mode 100644
index 000000000..81b06c6d2
Binary files /dev/null and b/docs/ismaykim_files/figure-html/qqplotmean-1.png differ
diff --git a/docs/ismaykim_files/figure-html/unnamed-chunk-120-1.png b/docs/ismaykim_files/figure-html/unnamed-chunk-100-1.png
similarity index 100%
rename from docs/ismaykim_files/figure-html/unnamed-chunk-120-1.png
rename to docs/ismaykim_files/figure-html/unnamed-chunk-100-1.png
diff --git a/docs/ismaykim_files/figure-html/unnamed-chunk-101-1.png b/docs/ismaykim_files/figure-html/unnamed-chunk-101-1.png
deleted file mode 100644
index a2cb58e1b..000000000
Binary files a/docs/ismaykim_files/figure-html/unnamed-chunk-101-1.png and /dev/null differ
diff --git a/docs/ismaykim_files/figure-html/unnamed-chunk-102-1.png b/docs/ismaykim_files/figure-html/unnamed-chunk-102-1.png
deleted file mode 100644
index 0fda34577..000000000
Binary files a/docs/ismaykim_files/figure-html/unnamed-chunk-102-1.png and /dev/null differ
diff --git a/docs/ismaykim_files/figure-html/unnamed-chunk-104-1.png b/docs/ismaykim_files/figure-html/unnamed-chunk-104-1.png
index dd13186af..5c49f073d 100644
Binary files a/docs/ismaykim_files/figure-html/unnamed-chunk-104-1.png and b/docs/ismaykim_files/figure-html/unnamed-chunk-104-1.png differ
diff --git a/docs/ismaykim_files/figure-html/unnamed-chunk-105-1.png b/docs/ismaykim_files/figure-html/unnamed-chunk-105-1.png
new file mode 100644
index 000000000..60f314376
Binary files /dev/null and b/docs/ismaykim_files/figure-html/unnamed-chunk-105-1.png differ
diff --git a/docs/ismaykim_files/figure-html/unnamed-chunk-107-1.png b/docs/ismaykim_files/figure-html/unnamed-chunk-107-1.png
new file mode 100644
index 000000000..3d7763f00
Binary files /dev/null and b/docs/ismaykim_files/figure-html/unnamed-chunk-107-1.png differ
diff --git a/docs/ismaykim_files/figure-html/unnamed-chunk-109-1.png b/docs/ismaykim_files/figure-html/unnamed-chunk-112-1.png
similarity index 100%
rename from docs/ismaykim_files/figure-html/unnamed-chunk-109-1.png
rename to docs/ismaykim_files/figure-html/unnamed-chunk-112-1.png
diff --git a/docs/ismaykim_files/figure-html/unnamed-chunk-111-1.png b/docs/ismaykim_files/figure-html/unnamed-chunk-114-1.png
similarity index 100%
rename from docs/ismaykim_files/figure-html/unnamed-chunk-111-1.png
rename to docs/ismaykim_files/figure-html/unnamed-chunk-114-1.png
diff --git a/docs/ismaykim_files/figure-html/unnamed-chunk-116-1.png b/docs/ismaykim_files/figure-html/unnamed-chunk-119-1.png
similarity index 100%
rename from docs/ismaykim_files/figure-html/unnamed-chunk-116-1.png
rename to docs/ismaykim_files/figure-html/unnamed-chunk-119-1.png
diff --git a/docs/ismaykim_files/figure-html/unnamed-chunk-93-1.png b/docs/ismaykim_files/figure-html/unnamed-chunk-123-1.png
similarity index 100%
rename from docs/ismaykim_files/figure-html/unnamed-chunk-93-1.png
rename to docs/ismaykim_files/figure-html/unnamed-chunk-123-1.png
diff --git a/docs/ismaykim_files/figure-html/unnamed-chunk-127-1.png b/docs/ismaykim_files/figure-html/unnamed-chunk-127-1.png
deleted file mode 100644
index bd4244c6b..000000000
Binary files a/docs/ismaykim_files/figure-html/unnamed-chunk-127-1.png and /dev/null differ
diff --git a/docs/ismaykim_files/figure-html/unnamed-chunk-130-1.png b/docs/ismaykim_files/figure-html/unnamed-chunk-130-1.png
index 5b4619e74..bd4244c6b 100644
Binary files a/docs/ismaykim_files/figure-html/unnamed-chunk-130-1.png and b/docs/ismaykim_files/figure-html/unnamed-chunk-130-1.png differ
diff --git a/docs/ismaykim_files/figure-html/unnamed-chunk-128-1.png b/docs/ismaykim_files/figure-html/unnamed-chunk-131-1.png
similarity index 100%
rename from docs/ismaykim_files/figure-html/unnamed-chunk-128-1.png
rename to docs/ismaykim_files/figure-html/unnamed-chunk-131-1.png
diff --git a/docs/ismaykim_files/figure-html/unnamed-chunk-129-1.png b/docs/ismaykim_files/figure-html/unnamed-chunk-132-1.png
similarity index 100%
rename from docs/ismaykim_files/figure-html/unnamed-chunk-129-1.png
rename to docs/ismaykim_files/figure-html/unnamed-chunk-132-1.png
diff --git a/docs/ismaykim_files/figure-html/unnamed-chunk-133-1.png b/docs/ismaykim_files/figure-html/unnamed-chunk-133-1.png
index 9b9f6e799..5b4619e74 100644
Binary files a/docs/ismaykim_files/figure-html/unnamed-chunk-133-1.png and b/docs/ismaykim_files/figure-html/unnamed-chunk-133-1.png differ
diff --git a/docs/ismaykim_files/figure-html/unnamed-chunk-134-1.png b/docs/ismaykim_files/figure-html/unnamed-chunk-134-1.png
deleted file mode 100644
index 5be897c52..000000000
Binary files a/docs/ismaykim_files/figure-html/unnamed-chunk-134-1.png and /dev/null differ
diff --git a/docs/ismaykim_files/figure-html/unnamed-chunk-136-1.png b/docs/ismaykim_files/figure-html/unnamed-chunk-136-1.png
index 1ca3c0753..9b9f6e799 100644
Binary files a/docs/ismaykim_files/figure-html/unnamed-chunk-136-1.png and b/docs/ismaykim_files/figure-html/unnamed-chunk-136-1.png differ
diff --git a/docs/ismaykim_files/figure-html/unnamed-chunk-137-1.png b/docs/ismaykim_files/figure-html/unnamed-chunk-137-1.png
index 1b14f9ef4..5be897c52 100644
Binary files a/docs/ismaykim_files/figure-html/unnamed-chunk-137-1.png and b/docs/ismaykim_files/figure-html/unnamed-chunk-137-1.png differ
diff --git a/docs/ismaykim_files/figure-html/unnamed-chunk-139-1.png b/docs/ismaykim_files/figure-html/unnamed-chunk-139-1.png
new file mode 100644
index 000000000..1ca3c0753
Binary files /dev/null and b/docs/ismaykim_files/figure-html/unnamed-chunk-139-1.png differ
diff --git a/docs/ismaykim_files/figure-html/unnamed-chunk-140-1.png b/docs/ismaykim_files/figure-html/unnamed-chunk-140-1.png
new file mode 100644
index 000000000..1b14f9ef4
Binary files /dev/null and b/docs/ismaykim_files/figure-html/unnamed-chunk-140-1.png differ
diff --git a/docs/ismaykim_files/figure-html/unnamed-chunk-138-1.png b/docs/ismaykim_files/figure-html/unnamed-chunk-141-1.png
similarity index 100%
rename from docs/ismaykim_files/figure-html/unnamed-chunk-138-1.png
rename to docs/ismaykim_files/figure-html/unnamed-chunk-141-1.png
diff --git a/docs/ismaykim_files/figure-html/unnamed-chunk-142-1.png b/docs/ismaykim_files/figure-html/unnamed-chunk-142-1.png
deleted file mode 100644
index c25300dcd..000000000
Binary files a/docs/ismaykim_files/figure-html/unnamed-chunk-142-1.png and /dev/null differ
diff --git a/docs/ismaykim_files/figure-html/unnamed-chunk-144-1.png b/docs/ismaykim_files/figure-html/unnamed-chunk-144-1.png
deleted file mode 100644
index 1363b6a20..000000000
Binary files a/docs/ismaykim_files/figure-html/unnamed-chunk-144-1.png and /dev/null differ
diff --git a/docs/ismaykim_files/figure-html/unnamed-chunk-146-1.png b/docs/ismaykim_files/figure-html/unnamed-chunk-146-1.png
index ff284aaec..c25300dcd 100644
Binary files a/docs/ismaykim_files/figure-html/unnamed-chunk-146-1.png and b/docs/ismaykim_files/figure-html/unnamed-chunk-146-1.png differ
diff --git a/docs/ismaykim_files/figure-html/unnamed-chunk-148-1.png b/docs/ismaykim_files/figure-html/unnamed-chunk-148-1.png
index f23f19d8b..1363b6a20 100644
Binary files a/docs/ismaykim_files/figure-html/unnamed-chunk-148-1.png and b/docs/ismaykim_files/figure-html/unnamed-chunk-148-1.png differ
diff --git a/docs/ismaykim_files/figure-html/unnamed-chunk-150-1.png b/docs/ismaykim_files/figure-html/unnamed-chunk-150-1.png
new file mode 100644
index 000000000..ff284aaec
Binary files /dev/null and b/docs/ismaykim_files/figure-html/unnamed-chunk-150-1.png differ
diff --git a/docs/ismaykim_files/figure-html/unnamed-chunk-152-1.png b/docs/ismaykim_files/figure-html/unnamed-chunk-152-1.png
new file mode 100644
index 000000000..f23f19d8b
Binary files /dev/null and b/docs/ismaykim_files/figure-html/unnamed-chunk-152-1.png differ
diff --git a/docs/ismaykim_files/figure-html/unnamed-chunk-151-1.png b/docs/ismaykim_files/figure-html/unnamed-chunk-155-1.png
similarity index 100%
rename from docs/ismaykim_files/figure-html/unnamed-chunk-151-1.png
rename to docs/ismaykim_files/figure-html/unnamed-chunk-155-1.png
diff --git a/docs/ismaykim_files/figure-html/unnamed-chunk-154-1.png b/docs/ismaykim_files/figure-html/unnamed-chunk-158-1.png
similarity index 100%
rename from docs/ismaykim_files/figure-html/unnamed-chunk-154-1.png
rename to docs/ismaykim_files/figure-html/unnamed-chunk-158-1.png
diff --git a/docs/ismaykim_files/figure-html/unnamed-chunk-16-1.png b/docs/ismaykim_files/figure-html/unnamed-chunk-16-1.png
deleted file mode 100644
index 40b2ae6af..000000000
Binary files a/docs/ismaykim_files/figure-html/unnamed-chunk-16-1.png and /dev/null differ
diff --git a/docs/ismaykim_files/figure-html/unnamed-chunk-156-1.png b/docs/ismaykim_files/figure-html/unnamed-chunk-160-1.png
similarity index 100%
rename from docs/ismaykim_files/figure-html/unnamed-chunk-156-1.png
rename to docs/ismaykim_files/figure-html/unnamed-chunk-160-1.png
diff --git a/docs/ismaykim_files/figure-html/unnamed-chunk-159-1.png b/docs/ismaykim_files/figure-html/unnamed-chunk-163-1.png
similarity index 100%
rename from docs/ismaykim_files/figure-html/unnamed-chunk-159-1.png
rename to docs/ismaykim_files/figure-html/unnamed-chunk-163-1.png
diff --git a/docs/ismaykim_files/figure-html/unnamed-chunk-162-1.png b/docs/ismaykim_files/figure-html/unnamed-chunk-166-1.png
similarity index 100%
rename from docs/ismaykim_files/figure-html/unnamed-chunk-162-1.png
rename to docs/ismaykim_files/figure-html/unnamed-chunk-166-1.png
diff --git a/docs/ismaykim_files/figure-html/unnamed-chunk-164-1.png b/docs/ismaykim_files/figure-html/unnamed-chunk-168-1.png
similarity index 100%
rename from docs/ismaykim_files/figure-html/unnamed-chunk-164-1.png
rename to docs/ismaykim_files/figure-html/unnamed-chunk-168-1.png
diff --git a/docs/ismaykim_files/figure-html/unnamed-chunk-170-1.png b/docs/ismaykim_files/figure-html/unnamed-chunk-175-1.png
similarity index 100%
rename from docs/ismaykim_files/figure-html/unnamed-chunk-170-1.png
rename to docs/ismaykim_files/figure-html/unnamed-chunk-175-1.png
diff --git a/docs/ismaykim_files/figure-html/unnamed-chunk-18-1.png b/docs/ismaykim_files/figure-html/unnamed-chunk-18-1.png
deleted file mode 100644
index 22e1d307b..000000000
Binary files a/docs/ismaykim_files/figure-html/unnamed-chunk-18-1.png and /dev/null differ
diff --git a/docs/ismaykim_files/figure-html/unnamed-chunk-19-1.png b/docs/ismaykim_files/figure-html/unnamed-chunk-19-1.png
index c2cbddca9..40b2ae6af 100644
Binary files a/docs/ismaykim_files/figure-html/unnamed-chunk-19-1.png and b/docs/ismaykim_files/figure-html/unnamed-chunk-19-1.png differ
diff --git a/docs/ismaykim_files/figure-html/unnamed-chunk-17-1.png b/docs/ismaykim_files/figure-html/unnamed-chunk-20-1.png
similarity index 100%
rename from docs/ismaykim_files/figure-html/unnamed-chunk-17-1.png
rename to docs/ismaykim_files/figure-html/unnamed-chunk-20-1.png
diff --git a/docs/ismaykim_files/figure-html/unnamed-chunk-21-1.png b/docs/ismaykim_files/figure-html/unnamed-chunk-21-1.png
new file mode 100644
index 000000000..4e2091c19
Binary files /dev/null and b/docs/ismaykim_files/figure-html/unnamed-chunk-21-1.png differ
diff --git a/docs/ismaykim_files/figure-html/unnamed-chunk-22-1.png b/docs/ismaykim_files/figure-html/unnamed-chunk-22-1.png
new file mode 100644
index 000000000..950bb9970
Binary files /dev/null and b/docs/ismaykim_files/figure-html/unnamed-chunk-22-1.png differ
diff --git a/docs/ismaykim_files/figure-html/unnamed-chunk-23-1.png b/docs/ismaykim_files/figure-html/unnamed-chunk-27-1.png
similarity index 100%
rename from docs/ismaykim_files/figure-html/unnamed-chunk-23-1.png
rename to docs/ismaykim_files/figure-html/unnamed-chunk-27-1.png
diff --git a/docs/ismaykim_files/figure-html/unnamed-chunk-24-1.png b/docs/ismaykim_files/figure-html/unnamed-chunk-28-1.png
similarity index 100%
rename from docs/ismaykim_files/figure-html/unnamed-chunk-24-1.png
rename to docs/ismaykim_files/figure-html/unnamed-chunk-28-1.png
diff --git a/docs/ismaykim_files/figure-html/unnamed-chunk-25-1.png b/docs/ismaykim_files/figure-html/unnamed-chunk-29-1.png
similarity index 100%
rename from docs/ismaykim_files/figure-html/unnamed-chunk-25-1.png
rename to docs/ismaykim_files/figure-html/unnamed-chunk-29-1.png
diff --git a/docs/ismaykim_files/figure-html/unnamed-chunk-41-1.png b/docs/ismaykim_files/figure-html/unnamed-chunk-45-1.png
similarity index 100%
rename from docs/ismaykim_files/figure-html/unnamed-chunk-41-1.png
rename to docs/ismaykim_files/figure-html/unnamed-chunk-45-1.png
diff --git a/docs/ismaykim_files/figure-html/unnamed-chunk-66-1.png b/docs/ismaykim_files/figure-html/unnamed-chunk-71-1.png
similarity index 100%
rename from docs/ismaykim_files/figure-html/unnamed-chunk-66-1.png
rename to docs/ismaykim_files/figure-html/unnamed-chunk-71-1.png
diff --git a/docs/ismaykim_files/figure-html/unnamed-chunk-72-1.png b/docs/ismaykim_files/figure-html/unnamed-chunk-72-1.png
index ee85ea5d4..5286bef64 100644
Binary files a/docs/ismaykim_files/figure-html/unnamed-chunk-72-1.png and b/docs/ismaykim_files/figure-html/unnamed-chunk-72-1.png differ
diff --git a/docs/ismaykim_files/figure-html/unnamed-chunk-77-1.png b/docs/ismaykim_files/figure-html/unnamed-chunk-77-1.png
index 5286bef64..ee85ea5d4 100644
Binary files a/docs/ismaykim_files/figure-html/unnamed-chunk-77-1.png and b/docs/ismaykim_files/figure-html/unnamed-chunk-77-1.png differ
diff --git a/docs/ismaykim_files/figure-html/unnamed-chunk-67-1.png b/docs/ismaykim_files/figure-html/unnamed-chunk-81-1.png
similarity index 100%
rename from docs/ismaykim_files/figure-html/unnamed-chunk-67-1.png
rename to docs/ismaykim_files/figure-html/unnamed-chunk-81-1.png
diff --git a/docs/ismaykim_files/figure-html/unnamed-chunk-80-1.png b/docs/ismaykim_files/figure-html/unnamed-chunk-83-1.png
similarity index 100%
rename from docs/ismaykim_files/figure-html/unnamed-chunk-80-1.png
rename to docs/ismaykim_files/figure-html/unnamed-chunk-83-1.png
diff --git a/docs/ismaykim_files/figure-html/unnamed-chunk-84-1.png b/docs/ismaykim_files/figure-html/unnamed-chunk-84-1.png
deleted file mode 100644
index f3bc1cc3b..000000000
Binary files a/docs/ismaykim_files/figure-html/unnamed-chunk-84-1.png and /dev/null differ
diff --git a/docs/ismaykim_files/figure-html/unnamed-chunk-87-1.png b/docs/ismaykim_files/figure-html/unnamed-chunk-87-1.png
index a588c4569..f3bc1cc3b 100644
Binary files a/docs/ismaykim_files/figure-html/unnamed-chunk-87-1.png and b/docs/ismaykim_files/figure-html/unnamed-chunk-87-1.png differ
diff --git a/docs/ismaykim_files/figure-html/unnamed-chunk-86-1.png b/docs/ismaykim_files/figure-html/unnamed-chunk-89-1.png
similarity index 100%
rename from docs/ismaykim_files/figure-html/unnamed-chunk-86-1.png
rename to docs/ismaykim_files/figure-html/unnamed-chunk-89-1.png
diff --git a/docs/ismaykim_files/figure-html/unnamed-chunk-90-1.png b/docs/ismaykim_files/figure-html/unnamed-chunk-90-1.png
new file mode 100644
index 000000000..a588c4569
Binary files /dev/null and b/docs/ismaykim_files/figure-html/unnamed-chunk-90-1.png differ
diff --git a/docs/ismaykim_files/figure-html/unnamed-chunk-94-1.png b/docs/ismaykim_files/figure-html/unnamed-chunk-94-1.png
deleted file mode 100644
index c061a53f0..000000000
Binary files a/docs/ismaykim_files/figure-html/unnamed-chunk-94-1.png and /dev/null differ
diff --git a/docs/ismaykim_files/figure-html/unnamed-chunk-96-1.png b/docs/ismaykim_files/figure-html/unnamed-chunk-96-1.png
index 732516c96..31f9de5fa 100644
Binary files a/docs/ismaykim_files/figure-html/unnamed-chunk-96-1.png and b/docs/ismaykim_files/figure-html/unnamed-chunk-96-1.png differ
diff --git a/docs/ismaykim_files/figure-html/unnamed-chunk-97-1.png b/docs/ismaykim_files/figure-html/unnamed-chunk-97-1.png
index 31f9de5fa..c061a53f0 100644
Binary files a/docs/ismaykim_files/figure-html/unnamed-chunk-97-1.png and b/docs/ismaykim_files/figure-html/unnamed-chunk-97-1.png differ
diff --git a/docs/ismaykim_files/figure-html/unnamed-chunk-95-1.png b/docs/ismaykim_files/figure-html/unnamed-chunk-98-1.png
similarity index 100%
rename from docs/ismaykim_files/figure-html/unnamed-chunk-95-1.png
rename to docs/ismaykim_files/figure-html/unnamed-chunk-98-1.png
diff --git a/docs/ismaykim_files/figure-html/unnamed-chunk-99-1.png b/docs/ismaykim_files/figure-html/unnamed-chunk-99-1.png
new file mode 100644
index 000000000..732516c96
Binary files /dev/null and b/docs/ismaykim_files/figure-html/unnamed-chunk-99-1.png differ
diff --git a/docs/ismaykim_files/figure-html/with-reg-1.png b/docs/ismaykim_files/figure-html/with-reg-1.png
index 995ee223f..6d9c5ebc2 100644
Binary files a/docs/ismaykim_files/figure-html/with-reg-1.png and b/docs/ismaykim_files/figure-html/with-reg-1.png differ
diff --git a/docs/references.html b/docs/references.html
index a7e29d44c..b5fc09d83 100644
--- a/docs/references.html
+++ b/docs/references.html
@@ -26,7 +26,7 @@
 <meta name="author" content="Chester Ismay and Albert Y. Kim">
 
 
-<meta name="date" content="2017-01-07">
+<meta name="date" content="2017-01-10">
 
   <meta name="viewport" content="width=device-width, initial-scale=1">
   <meta name="apple-mobile-web-app-capable" content="yes">
@@ -119,9 +119,9 @@
 
 <ul class="summary">
 <li class="chapter" data-level="1" data-path="index.html"><a href="index.html"><i class="fa fa-check"></i><b>1</b> Preamble</a><ul>
-<li class="chapter" data-level="1.1" data-path="index.html"><a href="index.html#principles-of-this-book"><i class="fa fa-check"></i><b>1.1</b> Principles of this Book</a></li>
+<li class="chapter" data-level="1.1" data-path="index.html"><a href="index.html#principles-of-this-book---for-instructors"><i class="fa fa-check"></i><b>1.1</b> Principles of this Book - For Instructors</a></li>
 <li class="chapter" data-level="1.2" data-path="index.html"><a href="index.html#contribute"><i class="fa fa-check"></i><b>1.2</b> Contribute</a></li>
-<li class="chapter" data-level="1.3" data-path="index.html"><a href="index.html#getting-started"><i class="fa fa-check"></i><b>1.3</b> Getting Started</a></li>
+<li class="chapter" data-level="1.3" data-path="index.html"><a href="index.html#getting-started---for-students"><i class="fa fa-check"></i><b>1.3</b> Getting Started - For Students</a></li>
 <li class="chapter" data-level="" data-path="index.html"><a href="index.html#colophon"><i class="fa fa-check"></i>Colophon</a></li>
 </ul></li>
 <li class="chapter" data-level="2" data-path="2-intro.html"><a href="2-intro.html"><i class="fa fa-check"></i><b>2</b> Introduction</a><ul>
@@ -133,87 +133,100 @@
 </ul></li>
 <li class="part"><span><b>I Data Exploration</b></span></li>
 <li class="chapter" data-level="3" data-path="3-tidy.html"><a href="3-tidy.html"><i class="fa fa-check"></i><b>3</b> Tidy Data</a><ul>
+<li class="chapter" data-level="" data-path="3-tidy.html"><a href="3-tidy.html#needed-packages"><i class="fa fa-check"></i>Needed packages</a></li>
 <li class="chapter" data-level="3.1" data-path="3-tidy.html"><a href="3-tidy.html#what-is-tidy-data"><i class="fa fa-check"></i><b>3.1</b> What is tidy data?</a></li>
-<li class="chapter" data-level="3.2" data-path="3-tidy.html"><a href="3-tidy.html#the-nycflights13-datasets"><i class="fa fa-check"></i><b>3.2</b> The <code>nycflights13</code> datasets</a></li>
-<li class="chapter" data-level="3.3" data-path="3-tidy.html"><a href="3-tidy.html#how-is-flights-tidy"><i class="fa fa-check"></i><b>3.3</b> How is <code>flights</code> tidy?</a></li>
+<li class="chapter" data-level="3.2" data-path="3-tidy.html"><a href="3-tidy.html#datasets-in-the-nycflights13-package"><i class="fa fa-check"></i><b>3.2</b> Datasets in the <code>nycflights13</code> package</a></li>
+<li class="chapter" data-level="3.3" data-path="3-tidy.html"><a href="3-tidy.html#how-is-flights-tidy"><i class="fa fa-check"></i><b>3.3</b> How is <code>flights</code> tidy?</a><ul>
+<li class="chapter" data-level="3.3.1" data-path="3-tidy.html"><a href="3-tidy.html#identification-variables"><i class="fa fa-check"></i><b>3.3.1</b> Identification variables</a></li>
+</ul></li>
 <li class="chapter" data-level="3.4" data-path="3-tidy.html"><a href="3-tidy.html#normal-forms-of-data"><i class="fa fa-check"></i><b>3.4</b> Normal forms of data</a></li>
 <li class="chapter" data-level="3.5" data-path="3-tidy.html"><a href="3-tidy.html#whats-to-come"><i class="fa fa-check"></i><b>3.5</b> What’s to come?</a></li>
 </ul></li>
-<li class="chapter" data-level="4" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html"><i class="fa fa-check"></i><b>4</b> Data Visualization via <code id="viz">ggplot2</code></a><ul>
-<li class="chapter" data-level="" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#needed-packages"><i class="fa fa-check"></i>Needed packages</a></li>
-<li class="chapter" data-level="4.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#grammarofgraphics"><i class="fa fa-check"></i><b>4.1</b> The Grammar of Graphics</a><ul>
-<li class="chapter" data-level="4.1.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#components-of-grammar"><i class="fa fa-check"></i><b>4.1.1</b> Components of Grammar</a></li>
-<li class="chapter" data-level="4.1.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#napoleans-march-on-moscow"><i class="fa fa-check"></i><b>4.1.2</b> Napolean’s March on Moscow</a></li>
-<li class="chapter" data-level="4.1.3" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#other-components-of-the-grammar"><i class="fa fa-check"></i><b>4.1.3</b> Other Components of the Grammar</a></li>
-<li class="chapter" data-level="4.1.4" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#the-ggplot2-package"><i class="fa fa-check"></i><b>4.1.4</b> The <code>ggplot2</code> Package</a></li>
-</ul></li>
-<li class="chapter" data-level="4.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#five-named-graphs---the-5ng"><i class="fa fa-check"></i><b>4.2</b> Five Named Graphs - The 5NG</a></li>
-<li class="chapter" data-level="4.3" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#scatterplots"><i class="fa fa-check"></i><b>4.3</b> 5NG#1: Scatter-plots</a><ul>
-<li class="chapter" data-level="4.3.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#scatter-plots-via-geom_point"><i class="fa fa-check"></i><b>4.3.1</b> Scatter-plots via <code id="geompoint">geom_point</code></a></li>
-<li class="chapter" data-level="4.3.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#over-plotting"><i class="fa fa-check"></i><b>4.3.2</b> Over-Plotting</a></li>
-<li class="chapter" data-level="4.3.3" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#summary"><i class="fa fa-check"></i><b>4.3.3</b> Summary</a></li>
-</ul></li>
-<li class="chapter" data-level="4.4" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#linegraphs"><i class="fa fa-check"></i><b>4.4</b> 5NG#2: Line-graphs</a><ul>
-<li class="chapter" data-level="4.4.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#line-graphs-via-geom_line"><i class="fa fa-check"></i><b>4.4.1</b> Line-graphs via <code id="geomline">geom_line</code></a></li>
-<li class="chapter" data-level="4.4.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#summary-1"><i class="fa fa-check"></i><b>4.4.2</b> Summary</a></li>
-</ul></li>
-<li class="chapter" data-level="4.5" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#histograms"><i class="fa fa-check"></i><b>4.5</b> 5NG#3: Histograms</a><ul>
-<li class="chapter" data-level="4.5.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#histograms-via-geom_histogram"><i class="fa fa-check"></i><b>4.5.1</b> Histograms via <code id="geomhistogram">geom_histogram</code></a></li>
-<li class="chapter" data-level="4.5.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#adjustbins"><i class="fa fa-check"></i><b>4.5.2</b> Adjusting the Bins</a></li>
-<li class="chapter" data-level="4.5.3" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#summary-2"><i class="fa fa-check"></i><b>4.5.3</b> Summary</a></li>
-</ul></li>
-<li class="chapter" data-level="4.6" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#facets"><i class="fa fa-check"></i><b>4.6</b> Facets</a></li>
-<li class="chapter" data-level="4.7" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#ng4-boxplots"><i class="fa fa-check"></i><b>4.7</b> 5NG#4: Boxplots</a><ul>
-<li class="chapter" data-level="4.7.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#boxplots-via-geom_boxplot"><i class="fa fa-check"></i><b>4.7.1</b> Boxplots via <code id="geomboxplot">geom_boxplot</code></a></li>
-<li class="chapter" data-level="4.7.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#summary-3"><i class="fa fa-check"></i><b>4.7.2</b> Summary</a></li>
-</ul></li>
-<li class="chapter" data-level="4.8" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#geombar"><i class="fa fa-check"></i><b>4.8</b> 5NG#5: Barplots</a><ul>
-<li class="chapter" data-level="4.8.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#barplots-via-geom_bar"><i class="fa fa-check"></i><b>4.8.1</b> Barplots via <code>geom_bar</code></a></li>
-<li class="chapter" data-level="4.8.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#must-avoid-pie-charts"><i class="fa fa-check"></i><b>4.8.2</b> Must avoid pie charts!</a></li>
-<li class="chapter" data-level="4.8.3" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#using-barplots-to-compare-two-variables"><i class="fa fa-check"></i><b>4.8.3</b> Using barplots to compare two variables</a></li>
-<li class="chapter" data-level="4.8.4" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#summary-4"><i class="fa fa-check"></i><b>4.8.4</b> Summary</a></li>
-</ul></li>
-<li class="chapter" data-level="4.9" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#conclusion"><i class="fa fa-check"></i><b>4.9</b> Conclusion</a><ul>
-<li class="chapter" data-level="4.9.1" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#whats-to-come-1"><i class="fa fa-check"></i><b>4.9.1</b> What’s to come?</a></li>
-<li class="chapter" data-level="4.9.2" data-path="4-data-visualization-via-ggplot2.html"><a href="4-data-visualization-via-ggplot2.html#script-of-r-code"><i class="fa fa-check"></i><b>4.9.2</b> Script of R code</a></li>
-</ul></li>
-</ul></li>
-<li class="chapter" data-level="5" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html"><i class="fa fa-check"></i><b>5</b> Data Manipulation via <code id="manip">dplyr</code></a><ul>
-<li class="chapter" data-level="" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#needed-packages-1"><i class="fa fa-check"></i>Needed packages</a></li>
-<li class="chapter" data-level="5.1" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#the-pipe"><i class="fa fa-check"></i><b>5.1</b> The pipe <code>%&gt;%</code></a></li>
-<li class="chapter" data-level="5.2" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#four-main-verbs---the-4mv"><i class="fa fa-check"></i><b>5.2</b> Four Main Verbs - The 4MV</a><ul>
-<li class="chapter" data-level="5.2.1" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#filter-observations-using-filter"><i class="fa fa-check"></i><b>5.2.1</b> Filter observations using <code id="filter">filter</code></a></li>
-<li class="chapter" data-level="5.2.2" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#summarize-variables-using-summarize"><i class="fa fa-check"></i><b>5.2.2</b> Summarize variables using <code>summarize</code></a></li>
-<li class="chapter" data-level="5.2.3" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#create-new-variableschange-old-variables-using-mutate"><i class="fa fa-check"></i><b>5.2.3</b> Create new variables/change old variables using <code>mutate</code></a></li>
-<li class="chapter" data-level="5.2.4" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#reorder-the-data-frame-using-arrange"><i class="fa fa-check"></i><b>5.2.4</b> Reorder the data frame using <code id="arrange">arrange</code></a></li>
-</ul></li>
-<li class="chapter" data-level="5.3" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#other-verbs"><i class="fa fa-check"></i><b>5.3</b> Other verbs</a><ul>
-<li class="chapter" data-level="5.3.1" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#select-variables-using-select"><i class="fa fa-check"></i><b>5.3.1</b> Select variables using <code id="select">select</code></a></li>
-<li class="chapter" data-level="5.3.2" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#rename-variables-using-rename"><i class="fa fa-check"></i><b>5.3.2</b> Rename variables using <code id="rename">rename</code></a></li>
-<li class="chapter" data-level="5.3.3" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#find-the-top-number-of-values-using-top_n"><i class="fa fa-check"></i><b>5.3.3</b> Find the top number of values using <code>top_n</code></a></li>
-</ul></li>
-<li class="chapter" data-level="5.4" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#joiningmerging-data-frames"><i class="fa fa-check"></i><b>5.4</b> Joining/merging data frames</a></li>
-<li class="chapter" data-level="5.5" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#script-of-r-code-1"><i class="fa fa-check"></i><b>5.5</b> Script of R code</a></li>
-<li class="chapter" data-level="5.6" data-path="5-data-manipulation-via-dplyr.html"><a href="5-data-manipulation-via-dplyr.html#whats-to-come-2"><i class="fa fa-check"></i><b>5.6</b> What’s to come?</a></li>
+<li class="chapter" data-level="4" data-path="4-viz.html"><a href="4-viz.html"><i class="fa fa-check"></i><b>4</b> Data Visualization via ggplot2</a><ul>
+<li class="chapter" data-level="" data-path="4-viz.html"><a href="4-viz.html#needed-packages-1"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="4.1" data-path="4-viz.html"><a href="4-viz.html#grammarofgraphics"><i class="fa fa-check"></i><b>4.1</b> The Grammar of Graphics</a><ul>
+<li class="chapter" data-level="4.1.1" data-path="4-viz.html"><a href="4-viz.html#components-of-grammar"><i class="fa fa-check"></i><b>4.1.1</b> Components of Grammar</a></li>
+<li class="chapter" data-level="4.1.2" data-path="4-viz.html"><a href="4-viz.html#napoleans-march-on-moscow"><i class="fa fa-check"></i><b>4.1.2</b> Napolean’s March on Moscow</a></li>
+<li class="chapter" data-level="4.1.3" data-path="4-viz.html"><a href="4-viz.html#other-components-of-the-grammar"><i class="fa fa-check"></i><b>4.1.3</b> Other Components of the Grammar</a></li>
+<li class="chapter" data-level="4.1.4" data-path="4-viz.html"><a href="4-viz.html#the-ggplot2-package"><i class="fa fa-check"></i><b>4.1.4</b> The ggplot2 Package</a></li>
+</ul></li>
+<li class="chapter" data-level="4.2" data-path="4-viz.html"><a href="4-viz.html#five-named-graphs---the-5ng"><i class="fa fa-check"></i><b>4.2</b> Five Named Graphs - The 5NG</a></li>
+<li class="chapter" data-level="4.3" data-path="4-viz.html"><a href="4-viz.html#scatterplots"><i class="fa fa-check"></i><b>4.3</b> 5NG#1: Scatter-plots</a><ul>
+<li class="chapter" data-level="4.3.1" data-path="4-viz.html"><a href="4-viz.html#geompoint"><i class="fa fa-check"></i><b>4.3.1</b> Scatter-plots via geom_point</a></li>
+<li class="chapter" data-level="4.3.2" data-path="4-viz.html"><a href="4-viz.html#over-plotting"><i class="fa fa-check"></i><b>4.3.2</b> Over-Plotting</a></li>
+<li class="chapter" data-level="4.3.3" data-path="4-viz.html"><a href="4-viz.html#summary"><i class="fa fa-check"></i><b>4.3.3</b> Summary</a></li>
+</ul></li>
+<li class="chapter" data-level="4.4" data-path="4-viz.html"><a href="4-viz.html#linegraphs"><i class="fa fa-check"></i><b>4.4</b> 5NG#2: Line-graphs</a><ul>
+<li class="chapter" data-level="4.4.1" data-path="4-viz.html"><a href="4-viz.html#geomline"><i class="fa fa-check"></i><b>4.4.1</b> Line-graphs via geom_line</a></li>
+<li class="chapter" data-level="4.4.2" data-path="4-viz.html"><a href="4-viz.html#summary-1"><i class="fa fa-check"></i><b>4.4.2</b> Summary</a></li>
+</ul></li>
+<li class="chapter" data-level="4.5" data-path="4-viz.html"><a href="4-viz.html#histograms"><i class="fa fa-check"></i><b>4.5</b> 5NG#3: Histograms</a><ul>
+<li class="chapter" data-level="4.5.1" data-path="4-viz.html"><a href="4-viz.html#geomhistogram"><i class="fa fa-check"></i><b>4.5.1</b> Histograms via geom_histogram</a></li>
+<li class="chapter" data-level="4.5.2" data-path="4-viz.html"><a href="4-viz.html#adjustbins"><i class="fa fa-check"></i><b>4.5.2</b> Adjusting the Bins</a></li>
+<li class="chapter" data-level="4.5.3" data-path="4-viz.html"><a href="4-viz.html#summary-2"><i class="fa fa-check"></i><b>4.5.3</b> Summary</a></li>
+</ul></li>
+<li class="chapter" data-level="4.6" data-path="4-viz.html"><a href="4-viz.html#facets"><i class="fa fa-check"></i><b>4.6</b> Facets</a></li>
+<li class="chapter" data-level="4.7" data-path="4-viz.html"><a href="4-viz.html#ng4-boxplots"><i class="fa fa-check"></i><b>4.7</b> 5NG#4: Boxplots</a><ul>
+<li class="chapter" data-level="4.7.1" data-path="4-viz.html"><a href="4-viz.html#geomboxplot"><i class="fa fa-check"></i><b>4.7.1</b> Boxplots via geom_boxplot</a></li>
+<li class="chapter" data-level="4.7.2" data-path="4-viz.html"><a href="4-viz.html#summary-3"><i class="fa fa-check"></i><b>4.7.2</b> Summary</a></li>
+</ul></li>
+<li class="chapter" data-level="4.8" data-path="4-viz.html"><a href="4-viz.html#geombar"><i class="fa fa-check"></i><b>4.8</b> 5NG#5: Barplots</a><ul>
+<li class="chapter" data-level="4.8.1" data-path="4-viz.html"><a href="4-viz.html#barplots-via-geom_bar"><i class="fa fa-check"></i><b>4.8.1</b> Barplots via geom_bar</a></li>
+<li class="chapter" data-level="4.8.2" data-path="4-viz.html"><a href="4-viz.html#must-avoid-pie-charts"><i class="fa fa-check"></i><b>4.8.2</b> Must avoid pie charts!</a></li>
+<li class="chapter" data-level="4.8.3" data-path="4-viz.html"><a href="4-viz.html#using-barplots-to-compare-two-variables"><i class="fa fa-check"></i><b>4.8.3</b> Using barplots to compare two variables</a></li>
+<li class="chapter" data-level="4.8.4" data-path="4-viz.html"><a href="4-viz.html#summary-4"><i class="fa fa-check"></i><b>4.8.4</b> Summary</a></li>
+</ul></li>
+<li class="chapter" data-level="4.9" data-path="4-viz.html"><a href="4-viz.html#conclusion"><i class="fa fa-check"></i><b>4.9</b> Conclusion</a><ul>
+<li class="chapter" data-level="4.9.1" data-path="4-viz.html"><a href="4-viz.html#resources"><i class="fa fa-check"></i><b>4.9.1</b> Resources</a></li>
+<li class="chapter" data-level="4.9.2" data-path="4-viz.html"><a href="4-viz.html#script-of-r-code"><i class="fa fa-check"></i><b>4.9.2</b> Script of R code</a></li>
+<li class="chapter" data-level="4.9.3" data-path="4-viz.html"><a href="4-viz.html#whats-to-come-1"><i class="fa fa-check"></i><b>4.9.3</b> What’s to come?</a></li>
+</ul></li>
+</ul></li>
+<li class="chapter" data-level="5" data-path="5-manip.html"><a href="5-manip.html"><i class="fa fa-check"></i><b>5</b> Data Manipulation via dplyr</a><ul>
+<li class="chapter" data-level="" data-path="5-manip.html"><a href="5-manip.html#needed-packages-2"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="5.1" data-path="5-manip.html"><a href="5-manip.html#the-pipe"><i class="fa fa-check"></i><b>5.1</b> The pipe <code>%&gt;%</code></a></li>
+<li class="chapter" data-level="5.2" data-path="5-manip.html"><a href="5-manip.html#five-main-verbs---the-5mv"><i class="fa fa-check"></i><b>5.2</b> Five Main Verbs - The 5MV</a><ul>
+<li class="chapter" data-level="5.2.1" data-path="5-manip.html"><a href="5-manip.html#filter"><i class="fa fa-check"></i><b>5.2.1</b> 5MV#1: Filter observations using filter</a></li>
+<li class="chapter" data-level="5.2.2" data-path="5-manip.html"><a href="5-manip.html#mv2-summarize-variables-using-summarize"><i class="fa fa-check"></i><b>5.2.2</b> 5MV#2: Summarize variables using summarize</a></li>
+<li class="chapter" data-level="5.2.3" data-path="5-manip.html"><a href="5-manip.html#mv3-group-rows-using-group_by"><i class="fa fa-check"></i><b>5.2.3</b> 5MV#3: Group rows using group_by</a></li>
+<li class="chapter" data-level="5.2.4" data-path="5-manip.html"><a href="5-manip.html#mv4-create-new-variableschange-old-variables-using-mutate"><i class="fa fa-check"></i><b>5.2.4</b> 5MV#4: Create new variables/change old variables using mutate</a></li>
+<li class="chapter" data-level="5.2.5" data-path="5-manip.html"><a href="5-manip.html#arrange"><i class="fa fa-check"></i><b>5.2.5</b> 5MV#5: Reorder the data frame using arrange</a></li>
+</ul></li>
+<li class="chapter" data-level="5.3" data-path="5-manip.html"><a href="5-manip.html#joining-data-frames"><i class="fa fa-check"></i><b>5.3</b> Joining data frames</a><ul>
+<li class="chapter" data-level="5.3.1" data-path="5-manip.html"><a href="5-manip.html#joining-by-key-variables"><i class="fa fa-check"></i><b>5.3.1</b> Joining by Key Variables</a></li>
+<li class="chapter" data-level="5.3.2" data-path="5-manip.html"><a href="5-manip.html#joining-by-key-variables-with-different-names"><i class="fa fa-check"></i><b>5.3.2</b> Joining by Key Variables with Different Names</a></li>
+</ul></li>
+<li class="chapter" data-level="5.4" data-path="5-manip.html"><a href="5-manip.html#optional-other-verbs"><i class="fa fa-check"></i><b>5.4</b> Optional: Other verbs</a><ul>
+<li class="chapter" data-level="5.4.1" data-path="5-manip.html"><a href="5-manip.html#select"><i class="fa fa-check"></i><b>5.4.1</b> Select variables using select</a></li>
+<li class="chapter" data-level="5.4.2" data-path="5-manip.html"><a href="5-manip.html#rename"><i class="fa fa-check"></i><b>5.4.2</b> Rename variables using rename</a></li>
+<li class="chapter" data-level="5.4.3" data-path="5-manip.html"><a href="5-manip.html#find-the-top-number-of-values-using-top_n"><i class="fa fa-check"></i><b>5.4.3</b> Find the top number of values using top_n</a></li>
+</ul></li>
+<li class="chapter" data-level="5.5" data-path="5-manip.html"><a href="5-manip.html#conclusion-1"><i class="fa fa-check"></i><b>5.5</b> Conclusion</a><ul>
+<li class="chapter" data-level="5.5.1" data-path="5-manip.html"><a href="5-manip.html#resources-1"><i class="fa fa-check"></i><b>5.5.1</b> Resources</a></li>
+<li class="chapter" data-level="5.5.2" data-path="5-manip.html"><a href="5-manip.html#script-of-r-code-1"><i class="fa fa-check"></i><b>5.5.2</b> Script of R code</a></li>
+<li class="chapter" data-level="5.5.3" data-path="5-manip.html"><a href="5-manip.html#whats-to-come-2"><i class="fa fa-check"></i><b>5.5.3</b> What’s to come?</a></li>
+</ul></li>
 </ul></li>
 <li class="part"><span><b>II Inference</b></span></li>
-<li class="chapter" data-level="6" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html"><i class="fa fa-check"></i><b>6</b> Simulating Randomness via <code id="sim">mosaic</code></a><ul>
-<li class="chapter" data-level="" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#needed-packages-2"><i class="fa fa-check"></i>Needed packages</a></li>
-<li class="chapter" data-level="6.1" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#random-sampling"><i class="fa fa-check"></i><b>6.1</b> Random sampling</a><ul>
-<li class="chapter" data-level="6.1.1" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#tasting-soup"><i class="fa fa-check"></i><b>6.1.1</b> Tasting soup</a></li>
-<li class="chapter" data-level="6.1.2" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#common-terms"><i class="fa fa-check"></i><b>6.1.2</b> Common terms</a></li>
+<li class="chapter" data-level="6" data-path="6-sim.html"><a href="6-sim.html"><i class="fa fa-check"></i><b>6</b> Simulating Randomness via mosaic</a><ul>
+<li class="chapter" data-level="" data-path="6-sim.html"><a href="6-sim.html#needed-packages-3"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="6.1" data-path="6-sim.html"><a href="6-sim.html#random-sampling"><i class="fa fa-check"></i><b>6.1</b> Random sampling</a><ul>
+<li class="chapter" data-level="6.1.1" data-path="6-sim.html"><a href="6-sim.html#tasting-soup"><i class="fa fa-check"></i><b>6.1.1</b> Tasting soup</a></li>
+<li class="chapter" data-level="6.1.2" data-path="6-sim.html"><a href="6-sim.html#common-terms"><i class="fa fa-check"></i><b>6.1.2</b> Common terms</a></li>
 </ul></li>
-<li class="chapter" data-level="6.2" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#visualizing-sampling"><i class="fa fa-check"></i><b>6.2</b> Visualizing sampling</a><ul>
-<li class="chapter" data-level="6.2.1" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#sampling-distribution"><i class="fa fa-check"></i><b>6.2.1</b> Sampling distribution</a></li>
-<li class="chapter" data-level="6.2.2" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#repeated-sampling-via-do"><i class="fa fa-check"></i><b>6.2.2</b> Repeated sampling via <code>do</code></a></li>
+<li class="chapter" data-level="6.2" data-path="6-sim.html"><a href="6-sim.html#visualizing-sampling"><i class="fa fa-check"></i><b>6.2</b> Visualizing sampling</a><ul>
+<li class="chapter" data-level="6.2.1" data-path="6-sim.html"><a href="6-sim.html#sampling-distribution"><i class="fa fa-check"></i><b>6.2.1</b> Sampling distribution</a></li>
+<li class="chapter" data-level="6.2.2" data-path="6-sim.html"><a href="6-sim.html#repeated-sampling-via-do"><i class="fa fa-check"></i><b>6.2.2</b> Repeated sampling via <code>do</code></a></li>
+</ul></li>
+<li class="chapter" data-level="6.3" data-path="6-sim.html"><a href="6-sim.html#simulation"><i class="fa fa-check"></i><b>6.3</b> Simulation</a></li>
+<li class="chapter" data-level="6.4" data-path="6-sim.html"><a href="6-sim.html#review-of-mosaic-simulation-functions"><i class="fa fa-check"></i><b>6.4</b> Review of <code>mosaic</code> simulation functions</a></li>
+<li class="chapter" data-level="6.5" data-path="6-sim.html"><a href="6-sim.html#conclusion-2"><i class="fa fa-check"></i><b>6.5</b> Conclusion</a><ul>
+<li class="chapter" data-level="6.5.1" data-path="6-sim.html"><a href="6-sim.html#script-of-r-code-2"><i class="fa fa-check"></i><b>6.5.1</b> Script of R code</a></li>
+<li class="chapter" data-level="6.5.2" data-path="6-sim.html"><a href="6-sim.html#whats-to-come-3"><i class="fa fa-check"></i><b>6.5.2</b> What’s to come?</a></li>
 </ul></li>
-<li class="chapter" data-level="6.3" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#simulation"><i class="fa fa-check"></i><b>6.3</b> Simulation</a></li>
-<li class="chapter" data-level="6.4" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#review-of-mosaic-simulation-functions"><i class="fa fa-check"></i><b>6.4</b> Review of <code>mosaic</code> simulation functions</a></li>
-<li class="chapter" data-level="6.5" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#script-of-r-code-2"><i class="fa fa-check"></i><b>6.5</b> Script of R code</a></li>
-<li class="chapter" data-level="6.6" data-path="6-simulating-randomness-via-mosaic.html"><a href="6-simulating-randomness-via-mosaic.html#whats-to-come-3"><i class="fa fa-check"></i><b>6.6</b> What’s to come?</a></li>
 </ul></li>
 <li class="chapter" data-level="7" data-path="7-hypo.html"><a href="7-hypo.html"><i class="fa fa-check"></i><b>7</b> Hypothesis Testing</a><ul>
-<li class="chapter" data-level="" data-path="7-hypo.html"><a href="7-hypo.html#needed-packages-3"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="" data-path="7-hypo.html"><a href="7-hypo.html#needed-packages-4"><i class="fa fa-check"></i>Needed packages</a></li>
 <li class="chapter" data-level="7.1" data-path="7-hypo.html"><a href="7-hypo.html#when-inference-is-not-needed"><i class="fa fa-check"></i><b>7.1</b> When Inference Is Not Needed</a></li>
 <li class="chapter" data-level="7.2" data-path="7-hypo.html"><a href="7-hypo.html#basics-of-hypothesis-testing"><i class="fa fa-check"></i><b>7.2</b> Basics of Hypothesis Testing</a></li>
 <li class="chapter" data-level="7.3" data-path="7-hypo.html"><a href="7-hypo.html#trial"><i class="fa fa-check"></i><b>7.3</b> Criminal trial analogy</a><ul>
@@ -249,44 +262,50 @@
 <li class="chapter" data-level="7.8.1" data-path="7-hypo.html"><a href="7-hypo.html#example-t-test-for-two-independent-samples"><i class="fa fa-check"></i><b>7.8.1</b> EXAMPLE: <span class="math inline">\(t\)</span>-test for two independent samples</a></li>
 <li class="chapter" data-level="7.8.2" data-path="7-hypo.html"><a href="7-hypo.html#conditions-for-t-test"><i class="fa fa-check"></i><b>7.8.2</b> Conditions for t-test</a></li>
 </ul></li>
-<li class="chapter" data-level="7.9" data-path="7-hypo.html"><a href="7-hypo.html#whats-to-come-4"><i class="fa fa-check"></i><b>7.9</b> What’s to come?</a></li>
+<li class="chapter" data-level="7.9" data-path="7-hypo.html"><a href="7-hypo.html#conclusion-3"><i class="fa fa-check"></i><b>7.9</b> Conclusion</a><ul>
+<li class="chapter" data-level="7.9.1" data-path="7-hypo.html"><a href="7-hypo.html#script-of-r-code-3"><i class="fa fa-check"></i><b>7.9.1</b> Script of R code</a></li>
+<li class="chapter" data-level="7.9.2" data-path="7-hypo.html"><a href="7-hypo.html#whats-to-come-4"><i class="fa fa-check"></i><b>7.9.2</b> What’s to come?</a></li>
+</ul></li>
 </ul></li>
 <li class="chapter" data-level="8" data-path="8-ci.html"><a href="8-ci.html"><i class="fa fa-check"></i><b>8</b> Confidence Intervals</a><ul>
-<li class="chapter" data-level="" data-path="8-ci.html"><a href="8-ci.html#needed-packages-4"><i class="fa fa-check"></i>Needed packages</a></li>
-<li class="chapter" data-level="8.1" data-path="8-ci.html"><a href="8-ci.html#bootstrapping"><i class="fa fa-check"></i><b>8.1</b> Bootstrapping</a></li>
-<li class="chapter" data-level="8.2" data-path="8-ci.html"><a href="8-ci.html#mean_rating"><i class="fa fa-check"></i><b>8.2</b> mean_rating</a><ul>
-<li class="chapter" data-level="8.2.1" data-path="8-ci.html"><a href="8-ci.html#review-of-bootstrapping"><i class="fa fa-check"></i><b>8.2.1</b> Review of Bootstrapping</a></li>
-</ul></li>
-<li class="chapter" data-level="8.3" data-path="8-ci.html"><a href="8-ci.html#relation-to-hypothesis-testing"><i class="fa fa-check"></i><b>8.3</b> Relation to hypothesis testing</a></li>
-<li class="chapter" data-level="8.4" data-path="8-ci.html"><a href="8-ci.html#effect-size"><i class="fa fa-check"></i><b>8.4</b> Effect size</a></li>
-<li class="chapter" data-level="8.5" data-path="8-ci.html"><a href="8-ci.html#script-of-r-code-3"><i class="fa fa-check"></i><b>8.5</b> Script of R code</a></li>
-<li class="chapter" data-level="8.6" data-path="8-ci.html"><a href="8-ci.html#whats-to-come-5"><i class="fa fa-check"></i><b>8.6</b> What’s to come?</a></li>
-</ul></li>
-<li class="chapter" data-level="9" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html"><i class="fa fa-check"></i><b>9</b> Regression via <code id="regress">broom</code></a><ul>
-<li class="chapter" data-level="" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#needed-packages-5"><i class="fa fa-check"></i>Needed packages</a></li>
-<li class="chapter" data-level="9.1" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#example-alaskan-airlines-delays"><i class="fa fa-check"></i><b>9.1</b> EXAMPLE: Alaskan Airlines delays</a></li>
-<li class="chapter" data-level="9.2" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#correlation"><i class="fa fa-check"></i><b>9.2</b> Correlation</a><ul>
-<li class="chapter" data-level="9.2.1" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#correlation-does-not-imply-causation"><i class="fa fa-check"></i><b>9.2.1</b> Correlation does not imply causation</a></li>
-</ul></li>
-<li class="chapter" data-level="9.3" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#linear-regression"><i class="fa fa-check"></i><b>9.3</b> Linear regression</a><ul>
-<li class="chapter" data-level="9.3.1" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#understanding-linear-regression-basics"><i class="fa fa-check"></i><b>9.3.1</b> Understanding linear regression basics</a></li>
-<li class="chapter" data-level="9.3.2" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#the-equation-of-the-line"><i class="fa fa-check"></i><b>9.3.2</b> The equation of the line</a></li>
-<li class="chapter" data-level="9.3.3" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#interpretting-the-slope"><i class="fa fa-check"></i><b>9.3.3</b> Interpretting the slope</a></li>
-<li class="chapter" data-level="9.3.4" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#predicting-values"><i class="fa fa-check"></i><b>9.3.4</b> Predicting values</a></li>
-</ul></li>
-<li class="chapter" data-level="9.4" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#inference-for-regression"><i class="fa fa-check"></i><b>9.4</b> Inference for regression</a><ul>
-<li class="chapter" data-level="9.4.1" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#data-2"><i class="fa fa-check"></i><b>9.4.1</b> Data</a></li>
-<li class="chapter" data-level="9.4.2" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#test-statistic-delta-2"><i class="fa fa-check"></i><b>9.4.2</b> Test Statistic <span class="math inline">\(\delta\)</span></a></li>
-<li class="chapter" data-level="9.4.3" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#observed-effect-delta-2"><i class="fa fa-check"></i><b>9.4.3</b> Observed effect <span class="math inline">\(\delta^*\)</span></a></li>
-<li class="chapter" data-level="9.4.4" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#model-of-h_0-2"><i class="fa fa-check"></i><b>9.4.4</b> Model of <span class="math inline">\(H_0\)</span></a></li>
-<li class="chapter" data-level="9.4.5" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#simulated-data-2"><i class="fa fa-check"></i><b>9.4.5</b> Simulated Data</a></li>
-<li class="chapter" data-level="9.4.6" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#distribution-of-delta-under-h_0-2"><i class="fa fa-check"></i><b>9.4.6</b> Distribution of <span class="math inline">\(\delta\)</span> under <span class="math inline">\(H_0\)</span></a></li>
-<li class="chapter" data-level="9.4.7" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#the-p-value-2"><i class="fa fa-check"></i><b>9.4.7</b> The p-value</a></li>
-</ul></li>
-<li class="chapter" data-level="9.5" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#resid"><i class="fa fa-check"></i><b>9.5</b> Residual analysis</a></li>
-<li class="chapter" data-level="9.6" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#conditions-for-regression"><i class="fa fa-check"></i><b>9.6</b> Conditions for regression</a></li>
-<li class="chapter" data-level="9.7" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#script-of-r-code-4"><i class="fa fa-check"></i><b>9.7</b> Script of R code</a></li>
-<li class="chapter" data-level="9.8" data-path="9-regression-via-broom.html"><a href="9-regression-via-broom.html#whats-to-come-6"><i class="fa fa-check"></i><b>9.8</b> What’s to come?</a></li>
+<li class="chapter" data-level="" data-path="8-ci.html"><a href="8-ci.html#needed-packages-5"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="8.1" data-path="8-ci.html"><a href="8-ci.html#bootstrapping"><i class="fa fa-check"></i><b>8.1</b> Bootstrapping</a><ul>
+<li class="chapter" data-level="8.1.1" data-path="8-ci.html"><a href="8-ci.html#review-of-bootstrapping"><i class="fa fa-check"></i><b>8.1.1</b> Review of Bootstrapping</a></li>
+</ul></li>
+<li class="chapter" data-level="8.2" data-path="8-ci.html"><a href="8-ci.html#relation-to-hypothesis-testing"><i class="fa fa-check"></i><b>8.2</b> Relation to hypothesis testing</a></li>
+<li class="chapter" data-level="8.3" data-path="8-ci.html"><a href="8-ci.html#effect-size"><i class="fa fa-check"></i><b>8.3</b> Effect size</a></li>
+<li class="chapter" data-level="8.4" data-path="8-ci.html"><a href="8-ci.html#conclusion-4"><i class="fa fa-check"></i><b>8.4</b> Conclusion</a><ul>
+<li class="chapter" data-level="8.4.1" data-path="8-ci.html"><a href="8-ci.html#script-of-r-code-4"><i class="fa fa-check"></i><b>8.4.1</b> Script of R code</a></li>
+<li class="chapter" data-level="8.4.2" data-path="8-ci.html"><a href="8-ci.html#whats-to-come-5"><i class="fa fa-check"></i><b>8.4.2</b> What’s to come?</a></li>
+</ul></li>
+</ul></li>
+<li class="chapter" data-level="9" data-path="9-regress.html"><a href="9-regress.html"><i class="fa fa-check"></i><b>9</b> Regression via broom</a><ul>
+<li class="chapter" data-level="" data-path="9-regress.html"><a href="9-regress.html#needed-packages-6"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="9.1" data-path="9-regress.html"><a href="9-regress.html#example-alaskan-airlines-delays"><i class="fa fa-check"></i><b>9.1</b> EXAMPLE: Alaskan Airlines delays</a></li>
+<li class="chapter" data-level="9.2" data-path="9-regress.html"><a href="9-regress.html#correlation"><i class="fa fa-check"></i><b>9.2</b> Correlation</a><ul>
+<li class="chapter" data-level="9.2.1" data-path="9-regress.html"><a href="9-regress.html#correlation-does-not-imply-causation"><i class="fa fa-check"></i><b>9.2.1</b> Correlation does not imply causation</a></li>
+</ul></li>
+<li class="chapter" data-level="9.3" data-path="9-regress.html"><a href="9-regress.html#linear-regression"><i class="fa fa-check"></i><b>9.3</b> Linear regression</a><ul>
+<li class="chapter" data-level="9.3.1" data-path="9-regress.html"><a href="9-regress.html#understanding-linear-regression-basics"><i class="fa fa-check"></i><b>9.3.1</b> Understanding linear regression basics</a></li>
+<li class="chapter" data-level="9.3.2" data-path="9-regress.html"><a href="9-regress.html#the-equation-of-the-line"><i class="fa fa-check"></i><b>9.3.2</b> The equation of the line</a></li>
+<li class="chapter" data-level="9.3.3" data-path="9-regress.html"><a href="9-regress.html#interpreting-the-slope"><i class="fa fa-check"></i><b>9.3.3</b> Interpreting the slope</a></li>
+<li class="chapter" data-level="9.3.4" data-path="9-regress.html"><a href="9-regress.html#predicting-values"><i class="fa fa-check"></i><b>9.3.4</b> Predicting values</a></li>
+</ul></li>
+<li class="chapter" data-level="9.4" data-path="9-regress.html"><a href="9-regress.html#inference-for-regression"><i class="fa fa-check"></i><b>9.4</b> Inference for regression</a><ul>
+<li class="chapter" data-level="9.4.1" data-path="9-regress.html"><a href="9-regress.html#data-2"><i class="fa fa-check"></i><b>9.4.1</b> Data</a></li>
+<li class="chapter" data-level="9.4.2" data-path="9-regress.html"><a href="9-regress.html#test-statistic-delta-2"><i class="fa fa-check"></i><b>9.4.2</b> Test Statistic <span class="math inline">\(\delta\)</span></a></li>
+<li class="chapter" data-level="9.4.3" data-path="9-regress.html"><a href="9-regress.html#observed-effect-delta-2"><i class="fa fa-check"></i><b>9.4.3</b> Observed effect <span class="math inline">\(\delta^*\)</span></a></li>
+<li class="chapter" data-level="9.4.4" data-path="9-regress.html"><a href="9-regress.html#model-of-h_0-2"><i class="fa fa-check"></i><b>9.4.4</b> Model of <span class="math inline">\(H_0\)</span></a></li>
+<li class="chapter" data-level="9.4.5" data-path="9-regress.html"><a href="9-regress.html#simulated-data-2"><i class="fa fa-check"></i><b>9.4.5</b> Simulated Data</a></li>
+<li class="chapter" data-level="9.4.6" data-path="9-regress.html"><a href="9-regress.html#distribution-of-delta-under-h_0-2"><i class="fa fa-check"></i><b>9.4.6</b> Distribution of <span class="math inline">\(\delta\)</span> under <span class="math inline">\(H_0\)</span></a></li>
+<li class="chapter" data-level="9.4.7" data-path="9-regress.html"><a href="9-regress.html#the-p-value-2"><i class="fa fa-check"></i><b>9.4.7</b> The p-value</a></li>
+</ul></li>
+<li class="chapter" data-level="9.5" data-path="9-regress.html"><a href="9-regress.html#resid"><i class="fa fa-check"></i><b>9.5</b> Residual analysis</a></li>
+<li class="chapter" data-level="9.6" data-path="9-regress.html"><a href="9-regress.html#conditions-for-regression"><i class="fa fa-check"></i><b>9.6</b> Conditions for regression</a></li>
+<li class="chapter" data-level="9.7" data-path="9-regress.html"><a href="9-regress.html#conclusion-5"><i class="fa fa-check"></i><b>9.7</b> Conclusion</a><ul>
+<li class="chapter" data-level="9.7.1" data-path="9-regress.html"><a href="9-regress.html#script-of-r-code-5"><i class="fa fa-check"></i><b>9.7.1</b> Script of R code</a></li>
+<li class="chapter" data-level="9.7.2" data-path="9-regress.html"><a href="9-regress.html#whats-to-come-6"><i class="fa fa-check"></i><b>9.7.2</b> What’s to come?</a></li>
+</ul></li>
 </ul></li>
 <li class="part"><span><b>III Conclusion</b></span></li>
 <li class="chapter" data-level="10" data-path="10-effective-data-storytelling.html"><a href="10-effective-data-storytelling.html"><i class="fa fa-check"></i><b>10</b> Effective Data Storytelling</a><ul>
@@ -304,7 +323,8 @@
 </ul></li>
 </ul></li>
 <li class="chapter" data-level="B" data-path="B-appendixB.html"><a href="B-appendixB.html"><i class="fa fa-check"></i><b>B</b> Inference Examples</a><ul>
-<li class="chapter" data-level="B.1" data-path="B-appendixB.html"><a href="B-appendixB.html#needed-packages-6"><i class="fa fa-check"></i><b>B.1</b> Needed packages</a></li>
+<li class="chapter" data-level="" data-path="B-appendixB.html"><a href="B-appendixB.html#needed-packages-7"><i class="fa fa-check"></i>Needed packages</a></li>
+<li class="chapter" data-level="B.1" data-path="B-appendixB.html"><a href="B-appendixB.html#inference-mind-map"><i class="fa fa-check"></i><b>B.1</b> Inference Mind Map</a></li>
 <li class="chapter" data-level="B.2" data-path="B-appendixB.html"><a href="B-appendixB.html#one-mean"><i class="fa fa-check"></i><b>B.2</b> One Mean</a><ul>
 <li class="chapter" data-level="B.2.1" data-path="B-appendixB.html"><a href="B-appendixB.html#problem-statement"><i class="fa fa-check"></i><b>B.2.1</b> Problem Statement</a></li>
 <li class="chapter" data-level="B.2.2" data-path="B-appendixB.html"><a href="B-appendixB.html#competing-hypotheses"><i class="fa fa-check"></i><b>B.2.2</b> Competing Hypotheses</a></li>
@@ -352,6 +372,7 @@
 </ul></li>
 </ul></li>
 <li class="chapter" data-level="C" data-path="C-appendixC.html"><a href="C-appendixC.html"><i class="fa fa-check"></i><b>C</b> Reach for the Starts</a><ul>
+<li class="chapter" data-level="" data-path="C-appendixC.html"><a href="C-appendixC.html#needed-packages-8"><i class="fa fa-check"></i>Needed packages</a></li>
 <li class="chapter" data-level="C.1" data-path="C-appendixC.html"><a href="C-appendixC.html#sorted-barplots"><i class="fa fa-check"></i><b>C.1</b> Sorted barplots</a></li>
 <li class="chapter" data-level="C.2" data-path="C-appendixC.html"><a href="C-appendixC.html#interactive-graphics"><i class="fa fa-check"></i><b>C.2</b> Interactive graphics</a><ul>
 <li class="chapter" data-level="C.2.1" data-path="C-appendixC.html"><a href="C-appendixC.html#interactive-line-graphs"><i class="fa fa-check"></i><b>C.2.1</b> Interactive line-graphs</a></li>
@@ -392,9 +413,15 @@ <h1>References</h1>
 <p>Ismay, Chester. 2016. <em>Getting Used to R, RStudio, and R Markdown</em>. <a href="http://ismayc.github.io/rbasics-book" class="uri">http://ismayc.github.io/rbasics-book</a>.</p>
 </div>
 <div>
+<p>Kim, Albert Y., and Adriana Escobedo-Land. 2016. <em>Okcupiddata: OkCupid Profile Data for Introductory Statistics and Data Science Courses</em>. <a href="https://CRAN.R-project.org/package=okcupiddata" class="uri">https://CRAN.R-project.org/package=okcupiddata</a>.</p>
+</div>
+<div>
 <p>Lock, Robin, Patti Frazer Lock, Kari Lock Morgan, Eric F. Lock, and Dennis F. Lock. 2012. <em>Statistics: UnLOCKing the Power of Data</em>. Wiley.</p>
 </div>
 <div>
+<p>Pruim, Randall, Daniel T. Kaplan, and Nicholas J. Horton. 2016. <em>Mosaic: Project Mosaic Statistics and Mathematics Teaching Utilities</em>. <a href="https://CRAN.R-project.org/package=mosaic" class="uri">https://CRAN.R-project.org/package=mosaic</a>.</p>
+</div>
+<div>
 <p>Robbins, Naomi. 2013. <em>Creating More Effective Graphs</em>. Chart House.</p>
 </div>
 <div>
@@ -404,6 +431,9 @@ <h1>References</h1>
 <p>Wickham, Hadley. 2014. “Tidy Data.” <em>Journal of Statistical Software</em> Volume 59 (Issue 10). <a href="https://www.jstatsoft.org/index.php/jss/article/view/v059i10/v59i10.pdf" class="uri">https://www.jstatsoft.org/index.php/jss/article/view/v059i10/v59i10.pdf</a>.</p>
 </div>
 <div>
+<p>———. 2015. <em>Ggplot2movies: Movies Data</em>. <a href="https://CRAN.R-project.org/package=ggplot2movies" class="uri">https://CRAN.R-project.org/package=ggplot2movies</a>.</p>
+</div>
+<div>
 <p>———. 2016. <em>Nycflights13: Flights That Departed Nyc in 2013</em>. <a href="https://CRAN.R-project.org/package=nycflights13" class="uri">https://CRAN.R-project.org/package=nycflights13</a>.</p>
 </div>
 <div>
@@ -415,6 +445,9 @@ <h1>References</h1>
 <div>
 <p>Wilkinson, Leland. 2005. <em>The Grammar of Graphics (Statistics and Computing)</em>. Secaucus, NJ, USA: Springer-Verlag New York, Inc.</p>
 </div>
+<div>
+<p>Xie, Yihui. 2016. <em>Bookdown: Authoring Books and Technical Documents with R Markdown</em>. <a href="https://CRAN.R-project.org/package=bookdown" class="uri">https://CRAN.R-project.org/package=bookdown</a>.</p>
+</div>
 </div>
 </div>
             </section>
diff --git a/docs/search_index.json b/docs/search_index.json
index f92fd0343..01f2692bc 100644
--- a/docs/search_index.json
+++ b/docs/search_index.json
@@ -1,16 +1,16 @@
 [
-["index.html", "ModernDive 1 Preamble 1.1 Principles of this Book 1.2 Contribute 1.3 Getting Started Colophon", " ModernDive An Introduction to Statistical and Data Sciences via R Chester Ismay and Albert Y. Kim 2017-01-07 1 Preamble 1.1 Principles of this Book These are some principles we keep in mind. If you agree with them, this might be the book for you. Blur the lines between lecture and lab Laptops and open source software are rendering the lab/lecture dichotomy ever more archaic. It’s much harder for students to understand the importance of using the software if they only use it once a week or less. They forget the syntax in much the same way someone learning a foreign language forgets the rules. Focus on the entire data/science research pipeline Grolemund and Wickham’s graphic George Cobb argued for “Minimizing prerequisites to research” It’s all about data, data, data We leverage R packages for rich/complex yet easy-to-load data sets. We’ve heard it before: “You can’t teach ggplot2 for data visualization in intro stats!” We, like David Robinson, are more optimistic and we’ve had success doing so. dplyr is a game changer for data manipulation: the verb describing your desired data action is the command name! Use simulation/resampling for intro stats, not probability/large sample approximation Reinforce concepts, not equations, formulas, and probability tables. To this end, we’re big fans of the mosaic package’s shuffle(), resample(), and do() functions for sampling and simulation. Don’t fence off students from the computation pool, throw them in! Don’t teach them coding/programming per se, but computation and algorithmic thinking. Drawing Venn diagrams delineating statistics, computer science, and data science is also ever more archaic; embrace computation! Complete reproducibility We find it frustrating when textbooks give examples but not the source code and the data itself. 
We not only give you the source code for all examples, but also the source code for the whole book! We encourage use of R Markdown to foster notions of reproducible research. Ultimately the best textbook is one you’ve written yourself You best know your audience, their background, and their priorities and you know best your own style and types of examples and problems you like best. Customizability is the ultimate end. A new paradigm for textbooks? Versions, not editions? Pull requests, crowd-sourcing, and development versions? 1.2 Contribute This book is in beta testing and is currently at Version 0.1.0. If you would like to receive periodic updates on this book and other similar projects, please fill out this Google Form. The source code for this book is available for download/forking on GitHub. If you find typos or other errors or have suggestions on how to better word something in the book, please create a pull request too! Please feel free to modify the book as you wish for your own needs! All we ask is that you list the authors field above as “Chester Ismay, Albert Y. Kim, and YOU!” We’d also appreciate if you let us now what changes you’ve made and how you’ve used the textbook. We’d love some data on what’s working well and what’s not working so well. 1.3 Getting Started This book was written using the bookdown R package from Yihui Xie. In order to follow along and run the code in this book on your own, you’ll need to have access to R and RStudio. You can find more information on both of these with a simple Google search for “R” and for “RStudio.” An introduction to using R, RStudio, and R Markdown is also available in a free book here (Ismay 2016). It is recommended that you refer back to this book frequently as it has GIF screen recordings that you can follow along with as you learn. We will keep a running list of R packages you will need to have installed to complete the analysis as well here in the needed_pkgs character vector. 
You can check if you have all of the needed packages installed by running all of the lines below. The last lines including the if will install them as needed (i.e., download their needed files from the internet to your hard drive). You can run the library function on them to load them into your current analysis. Prior to each analysis where a package is needed, you will see the corresponding library function in the text. Make sure to check the top of the chapter to see if a package was loaded there. needed_pkgs &lt;- c(&quot;nycflights13&quot;, &quot;dplyr&quot;, &quot;ggplot2&quot;, &quot;knitr&quot;, &quot;okcupiddata&quot;, &quot;dygraphs&quot;, &quot;rmarkdown&quot;, &quot;mosaic&quot;, &quot;ggplot2movies&quot;) new.pkgs &lt;- needed_pkgs[!(needed_pkgs %in% installed.packages())] if(length(new.pkgs)) { install.packages(new.pkgs, repos = &quot;http://cran.rstudio.com&quot;) } Colophon The source of the book is available here and was built with versions of R packages (and their dependent packages) given below. This may not be of importance for initial readers of this book, but the hope is you can reproduce a duplicate of this book by installing these versions of the packages. 
package * version date source assertthat 0.1 2013-12-06 CRAN (R 3.3.0) backports 1.0.4 2016-10-24 CRAN (R 3.3.0) base64enc 0.1-3 2015-07-28 CRAN (R 3.3.0) BH 1.62.0-1 2016-11-19 CRAN (R 3.3.2) bitops 1.0-6 2013-08-17 CRAN (R 3.3.0) caTools 1.17.1 2014-09-10 CRAN (R 3.3.0) colorspace 1.3-2 2016-12-14 CRAN (R 3.3.2) curl 2.3 2016-11-24 CRAN (R 3.3.2) DBI 0.5-1 2016-09-10 CRAN (R 3.3.0) dichromat 2.0-0 2013-01-24 CRAN (R 3.3.0) digest 0.6.11 2017-01-03 CRAN (R 3.3.2) dplyr * 0.5.0 2016-06-24 CRAN (R 3.3.0) dygraphs * 1.1.1.4 2017-01-04 CRAN (R 3.3.2) evaluate 0.10 2016-10-11 CRAN (R 3.3.0) ggdendro 0.1-20 2016-04-27 CRAN (R 3.3.0) ggplot2 * 2.2.1 2016-12-30 CRAN (R 3.3.2) ggplot2movies * 0.0.1 2015-08-25 CRAN (R 3.3.0) gridExtra 2.2.1 2016-02-29 CRAN (R 3.3.0) gtable 0.2.0 2016-02-26 CRAN (R 3.3.0) highr 0.6 2016-05-09 CRAN (R 3.3.0) hms 0.3 2016-11-22 CRAN (R 3.3.2) htmltools 0.3.5 2016-03-21 CRAN (R 3.3.0) htmlwidgets 0.8 2016-11-09 CRAN (R 3.3.2) jsonlite 1.2 2016-12-31 CRAN (R 3.3.2) knitr * 1.15.1 2016-11-22 CRAN (R 3.3.2) labeling 0.3 2014-08-23 CRAN (R 3.3.0) lattice * 0.20-34 2016-09-06 CRAN (R 3.3.2) latticeExtra 0.6-28 2016-02-09 CRAN (R 3.3.0) lazyeval 0.2.0 2016-06-12 CRAN (R 3.3.0) magrittr 1.5 2014-11-22 CRAN (R 3.3.0) markdown 0.7.7 2015-04-22 CRAN (R 3.3.0) MASS 7.3-45 2016-04-21 CRAN (R 3.3.2) Matrix * 1.2-7.1 2016-09-01 CRAN (R 3.3.2) mime 0.5 2016-07-07 CRAN (R 3.3.0) mosaic * 0.14.4 2016-07-29 CRAN (R 3.3.0) mosaicData * 0.14.0 2016-06-17 CRAN (R 3.3.0) munsell 0.4.3 2016-02-13 CRAN (R 3.3.0) nycflights13 * 0.2.1 2016-12-30 CRAN (R 3.3.2) okcupiddata * 0.1.0 2016-08-19 CRAN (R 3.3.0) plyr 1.8.4 2016-06-08 CRAN (R 3.3.0) R6 2.2.0 2016-10-05 CRAN (R 3.3.0) RColorBrewer 1.1-2 2014-12-07 CRAN (R 3.3.0) Rcpp 0.12.8 2016-11-17 CRAN (R 3.3.2) readr * 1.0.0 2016-08-03 CRAN (R 3.3.0) reshape2 1.4.2 2016-10-22 CRAN (R 3.3.0) rmarkdown 1.3 2016-12-21 CRAN (R 3.3.2) rprojroot 1.1 2016-10-29 CRAN (R 3.3.0) scales 0.4.1 2016-11-09 CRAN (R 3.3.2) stringi 1.1.2 
2016-10-01 CRAN (R 3.3.0) stringr 1.1.0 2016-08-19 CRAN (R 3.3.0) tibble 1.2 2016-08-26 CRAN (R 3.3.0) tidyr 0.6.0 2016-08-12 CRAN (R 3.3.0) xts 0.9-7 2014-01-02 CRAN (R 3.3.0) yaml 2.1.14 2016-11-12 CRAN (R 3.3.2) zoo 1.7-14 2016-12-16 CRAN (R 3.3.2) Book was last updated: ## [1] &quot;By Chester on Saturday, January 07, 2017 11:29:21 EST&quot; References "],
-["2-intro.html", "2 Introduction 2.1 Preamble 2.2 Three driving data sources 2.3 Data/science pipeline 2.4 Reproducibility 2.5 Who is this book for?", " 2 Introduction 2.1 Preamble This book is inspired by three books: “Mathematical Statistics with Resampling and R” (Chihara and Hesterberg 2011), “Intro Stat with Randomization and Simulation” (Diez, Barr, and Çetinkaya-Rundel 2014), and “R for Data Science” (Grolemund and Wickham 2016). The first book, while designed for upper-level undergraduates and graduate students, provides an excellent resource on how to use resampling to build statistical concepts like normal distributions using computers instead of focusing on memorization of formulas. The last two books also provide a path towards free alternatives to the traditionally expensive introductory statistics textbook. When looking over the vast number of introductory statistics textbooks we found that there wasn’t one that incorporated many of the new R packages directly into the text. Additionally, there wasn’t an open-source, free textbook available that showed new learners all of the following how to use R to explore and visualize data how to use randomization and simulation to build inferential ideas how to effectively create stories using these ideas to convey information to a lay audience. We will introduce sometimes difficult statistics concepts through the medium of data visualization. In today’s world, we are bombarded with graphics that attempt to convey ideas. We will explore what makes a good graphic and what the standard ways are to convey relationships with data. You’ll also see the use of visualization to introduce concepts like mean, median, standard deviation, distributions, etc. In general, we’ll use visualization as a way of building almost all of the ideas in this book. Additionally, this book will focus on the triad of computational thinking, data thinking, and inferential thinking. 
We’ll see throughout the book how these three modes of thinking can build effective ways to work with, describe, and convey statistical knowledge. In order to do so, you’ll see the importance of literate programming to develop literate data science. In other words, you’ll see how to write code and descriptions that are useful not just for a computer to execute but also for readers to understand exactly what a statistical analysis is doing and how it works. Hal Abelson coined the phrase that we will follow throughout this book: “Programs must be written for people to read, and only incidentally for machines to execute.” 2.2 Three driving data sources Instead of hopping from one data set to the next, we’ve decided to focus throughout the book on three different data sources: flights leaving New York City in 2013 profiles of OKCupid users in San Francisco IMDB movie ratings By focusing on just three large data sources, it is our hope that you’ll be able to see how each of the chapters is interconnected. You’ll see how the data being tidy leads into data visualization and manipulation and how those concepts tie into inference and regression. 2.3 Data/science pipeline You may think of statistics as just being a bunch of numbers. We commonly hear the phrase “statistician” when listening to broadcasts of sporting events. Statistics (in particular, data analysis), in addition to describing numbers like with baseball batting averages, plays a vital role in all of the sciences. You’ll commonly hear the phrase “statistically significant” thrown around in the media. You’ll see things that say “Science now shows that chocolate is good for you.” Underpinning these claims is data analysis. By the end of this book, you’ll be able to better understand whether these claims should be trusted or whether we should be wary. 
Inside data analysis are many sub-fields that we will discuss throughout this book (not necessarily in this order): data collection data manipulation data visualization data modeling inference interpretation of results data storytelling This can be summarized in a graphic that is commonly used by Hadley Wickham: Figure 2.1: Hadley’s workflow graphic We will begin with a discussion on what is meant by tidy data and then dig into the gray Understand portion of the cycle and conclude by talking about interpreting and discussing the results of our models via Communication. These steps are vital to any statistical analysis. But why should you care about statistics? “Why did they make me take this class?” There’s a reason so many fields require a statistics course. Scientific knowledge grows through an understanding of statistical significance and data analysis. You needn’t be intimidated by statistics. It’s not the beast that it used to be and paired with computation you’ll see how reproducible research in the sciences particularly increases scientific knowledge. 2.4 Reproducibility “The most important tool is the mindset, when starting, that the end product will be reproducible.” – Keith Baggerly Another large goal of this book is to help readers understand the importance of reproducible analyses. The hope is to get readers into the habit of making their analyses reproducible from the very beginning. This means we’ll be trying to help you build new habits. This will take practice and be difficult at times. You’ll see just why it is so important for you to keep track of your code and well-document it to help yourself later and any potential collaborators as well. Copying and pasting is not the way that efficient and effective scientific research is conducted. It’s much more important for time to be spent on data collection and data analysis and not on copying and pasting plots back and forth across a variety of programs. 
In a traditional analysis, if an error was made with the original data, we’d need to step through the entire process again: recreate the plots and copy and paste all of the new plots and our statistical analysis into your document. This is error prone and a frustrating use of time. We’ll see how to use R Markdown to get away from this tedious activity so that we can spend more time doing science. “We are talking about computational reproducibility.” - Yihui Xie Reproducibility means a lot of things in terms of different scientific fields. Are experiments conducted in a way that another researcher could follow the steps and get similar results? In this book, we will focus on what is known as computational reproducibility. This refers to being able to pass all of one’s data analysis and conclusions to someone else and have them get exactly the same results on their machine. This allows for time to be spent doing actual science and interpreting of results and assumptions instead of the more error prone way of starting from scratch or following a list of steps that may be different from machine to machine. 2.5 Who is this book for? This book is targeted at students taking a traditional intro stats class in a small college environment using RStudio and preferably RStudio Server. We assume no prerequisites: no calculus and no prior programming experience. This is intended to be a gentle and nice introduction to the practice of statistics in terms of how data scientists, statisticians, and other scientists analyze data and write stories about data. We have intentionally avoided the use of throwing formulas at you and instead have focused on developing statistical concepts via data visualization and statistical computing. We hope this is a more intuitive experience than the way statistics has traditionally been taught in the past (and how it is commonly perceived from the outside). 
We additionally hope that you see the value of reproducible research via R as you continue in your studies. We understand that there will initially be growing pains in learning to program but we are here to help you and you should know that there is a huge community of R users that are always happy to help newbies along. Now let’s get into learning about how to create good stories about and with data! References "],
-["3-tidy.html", "3 Tidy Data 3.1 What is tidy data? 3.2 The nycflights13 datasets 3.3 How is flights tidy? 3.4 Normal forms of data 3.5 What’s to come?", " 3 Tidy Data In this chapter, we’ll discuss the importance of tidy data. You may think that this means just having your data in a spreadsheet, but you’ll see that it is actually more specific than that. Data actually comes to us in a variety of formats from pictures to text and to just numbers. We’ll focus on datasets that can be stored in a spreadsheet throughout this book as that is the most common way data is collected in the sciences. Having tidy data will allow us to more easily create data visualizations as we will see in Chapter ??. It will also help us with manipulating data in Chapter ?? and in all subsequent chapters when we discuss statistical inference. You may not necessarily understand the importance for tidy data but it will become more and more apparent as we proceed through the book. 3.1 What is tidy data? You have surely heard the word “tidy” in your life: “Tidy up your room!” “Please write your homework in a tidy way so that it is easier to grade and to provide feedback.” Marie Kondo’s best-selling book The Life-Changing Magic of Tidying Up: The Japanese Art of Decluttering and Organizing “I am not by any stretch of the imagination a tidy person, and the piles of unread books on the coffee table and by my bed have a plaintive, pleading quality to me - ‘Read me, please!’” - Linda Grant So what does it mean for your data to be tidy? Put simply: it means that your data is organized. But it’s more than just that. It means that your data follows the same standard format making it easy for others to find elements of your data, to manipulate and transform your data, and for our purposes continuing with the common theme: it makes it easier to visualize your data and the relationships between different variables in your data. 
We will follow Hadley Wickham’s definition of tidy data here (Wickham 2014): A dataset is a collection of values, usually either numbers (if quantitative) or strings (if qualitative). Values are organised in two ways. Every value belongs to a variable and an observation. A variable contains all values that measure the same underlying attribute (like height, temperature, duration) across units. An observation contains all values measured on the same unit (like a person, or a day, or a race) across attributes. Tidy data is a standard way of mapping the meaning of a dataset to its structure. A dataset is messy or tidy depending on how rows, columns and tables are matched up with observations, variables and types. In tidy data: Each variable forms a column. Each observation forms a row. Each type of observational unit forms a table. Figure 3.1: Tidy data graphic from http://r4ds.had.co.nz/tidy-data.html Reading over this definition, you can begin to think about datasets that won’t follow this nice format. Learning check (LC3.1) Give an example dataset that doesn’t follow this format. What features of this dataset might make it difficult to visualize? How could the dataset be tweaked to make it tidy? 3.2 The nycflights13 datasets We likely have all flown on airplanes or know someone that has. Air travel has become an ever-present aspect of our daily lives. If you live in or are visiting a relatively large city and you walk around that city’s airport, you see gates showing flight information from many different airlines. And you will frequently see that some flights are delayed because of a variety of conditions. Are there ways that we can avoid having to deal with these flight delays? We’d all like to arrive at our destinations on time whenever possible. (Unless you secretly love hanging out at airports. If you are one of these people, pretend for the moment that you are very much anticipating being at your final destination.) 
Hadley Wickham (herein just referred to as “Hadley”) created multiple datasets containing information about departing flights from the New York City area in 2013 (Wickham 2016). We will begin by loading in one of these datasets, the flights dataset, and getting an idea of its structure: library(nycflights13) data(flights) The library function here loads the R package nycflights13 into the current R environment in which you are working. The data(flights) loads in the flights dataset that is stored in the nycflights13 package. Note that you’ll get an error if you try to load this package in and it hasn’t been downloaded and installed. You can ensure it is installed by running the code below: if(!require(nycflights13)) install.packages(&quot;nycflights13&quot;, repos = &quot;http://cran.rstudio.org&quot;) This code checks to see if nycflights13 is installed and, if not, then goes to the specified repository of “http://cran.rstudio.org” and downloads the package from there and installs it. If it is already installed you can see it listed in the Packages tab in the bottom right portion of RStudio and the code will not install the package again since this is redundant and you won’t need to do it over and over again. This dataset and most others presented in this book will be in the data.frame format in R. Data frames are ways to look at collections of variables that are tightly coupled together. Frequently, the best way to get a feel for a data frame is to use the View function in RStudio. This command will be given throughout the book as a reminder, but the actual output will be hidden. View(flights) Learning check (LC3.2) What does any ONE row in this flights dataset refer to? A. Data on an airline B. Data on a flight C. Data on an airport D. Data on multiple flights By running View(flights), we see the different variables listed in the columns and we see that there are different types of variables. 
Some of the variables like distance, day, and arr_delay are what we will call quantitative variables. These variables vary in a numerical way. Other variables here are categorical. Note that if you look in the leftmost column of the View(flights) output, you will see a column of numbers. These are the row numbers of the dataset. If you glance across a row with the same number, say row 5, you can get an idea of what each row corresponds to. In other words, this will allow you to identify what object is being referred to in a given row. This is often called the observational unit. The observational unit in this example is an individual flight departing New York City in 2013. Note: Frequently the first thing you should do when given a dataset is to identify the observation unit, specify the variables, and give the types of variables you are presented with. str(flights) ## Classes &#39;tbl_df&#39;, &#39;tbl&#39; and &#39;data.frame&#39;: 336776 obs. of 19 variables: ## $ year : int 2013 2013 2013 2013 2013 2013 2013 2013 2013 2013 ... ## $ month : int 1 1 1 1 1 1 1 1 1 1 ... ## $ day : int 1 1 1 1 1 1 1 1 1 1 ... ## $ dep_time : int 517 533 542 544 554 554 555 557 557 558 ... ## $ sched_dep_time: int 515 529 540 545 600 558 600 600 600 600 ... ## $ dep_delay : num 2 4 2 -1 -6 -4 -5 -3 -3 -2 ... ## $ arr_time : int 830 850 923 1004 812 740 913 709 838 753 ... ## $ sched_arr_time: int 819 830 850 1022 837 728 854 723 846 745 ... ## $ arr_delay : num 11 20 33 -18 -25 12 19 -14 -8 8 ... ## $ carrier : chr &quot;UA&quot; &quot;UA&quot; &quot;AA&quot; &quot;B6&quot; ... ## $ flight : int 1545 1714 1141 725 461 1696 507 5708 79 301 ... ## $ tailnum : chr &quot;N14228&quot; &quot;N24211&quot; &quot;N619AA&quot; &quot;N804JB&quot; ... ## $ origin : chr &quot;EWR&quot; &quot;LGA&quot; &quot;JFK&quot; &quot;JFK&quot; ... ## $ dest : chr &quot;IAH&quot; &quot;IAH&quot; &quot;MIA&quot; &quot;BQN&quot; ... ## $ air_time : num 227 227 160 183 116 150 158 53 140 138 ... 
## $ distance : num 1400 1416 1089 1576 762 ... ## $ hour : num 5 5 5 5 6 5 6 6 6 6 ... ## $ minute : num 15 29 40 45 0 58 0 0 0 0 ... ## $ time_hour : POSIXct, format: &quot;2013-01-01 05:00:00&quot; ... Learning check (LC3.3) What are some examples in this dataset of categorical variables? What makes them different than quantitative variables? (LC3.4) What does int, num, and chr mean in the output above? (LC3.5) How many different columns are in this dataset? (LC3.6) How many different rows are in this dataset? Another way to view the properties of a dataset is to use the str function (“str” is short for “structure”). The str function is expecting an object for its argument. In this case, the object is a data frame named flights. You can use the str function on other objects and data frames using the syntax str(object) where object is the name of an object in R. This will give you the first few entries of each variable in a row after the variable. In addition, the type of the variable is given immediately after the : following each variable’s name. Here, int and num refer to quantitative variables. In contrast, chr refers to categorical variables. One more type of variable is given here with the time_hour variable: POSIXct. As you may suspect, this variable corresponds to a specific date and time of day. Another nice feature of R is the help system. You can get help in R by simply entering a question mark before the name of a function or an object and you will be presented with a page showing the documentation. Note that this output help file is omitted here but can be accessed here on page 3 of the PDF document. ?str ?flights Another aspect of tidy data is a description of what each variable in the dataset represents. This helps others to understand what your variable names mean and what they correspond to. If we look at the output of ?flights, we can see that a description of each variable by name is given. 
An important feature to ALWAYS include with your data is the appropriate units of measurement. We’ll see this further when we work with the dep_delay variable in Chapter ??. (It’s in minutes, but you’d get some really strange interpretations if you thought it was in hours or seconds. UNITS MATTER!) 3.3 How is flights tidy? We see that flights has a rectangular shape with each row corresponding to a different flight and each column corresponding to a characteristic of that flight. This matches exactly with how Hadley defined tidy data: Each variable forms a column. Each observation forms a row. But what about the third property? Each type of observational unit forms a table. We identified earlier that the observational unit in the flights dataset is an individual flight. And we have shown that this dataset consists of 336,776 flights with 19 variables. In other words, some rows of this dataset don’t refer to a measurement on an airline or on an airport. They specifically refer to characteristics/measurements on a given flight from New York City in 2013. By contrast, also included in the nycflights13 package are datasets with different observational units (Wickham 2016): weather: hourly meteorological data for each airport planes: construction information about each plane airports: airport names and locations airlines: translation between two letter carrier codes and names You may have been asking yourself what carrier refers to in the str(flights) output above. The airlines dataset provides a description of this with each airline being the observational unit: data(airlines) airlines ## # A tibble: 16 × 2 ## carrier name ## &lt;chr&gt; &lt;chr&gt; ## 1 9E Endeavor Air Inc. ## 2 AA American Airlines Inc. ## 3 AS Alaska Airlines Inc. ## 4 B6 JetBlue Airways ## 5 DL Delta Air Lines Inc. ## 6 EV ExpressJet Airlines Inc. ## 7 F9 Frontier Airlines Inc. ## 8 FL AirTran Airways Corporation ## 9 HA Hawaiian Airlines Inc. ## 10 MQ Envoy Air ## 11 OO SkyWest Airlines Inc. 
## 12 UA United Air Lines Inc. ## 13 US US Airways Inc. ## 14 VX Virgin America ## 15 WN Southwest Airlines Co. ## 16 YV Mesa Airlines Inc. As can be seen here when you just enter the name of an object in R, by default it will print the contents of that object to the screen. Be careful! It’s usually better to use the View() function in RStudio since larger objects may take a while to print to the screen and it likely won’t be helpful to you to have hundreds of lines outputted. 3.4 Normal forms of data The datasets included in the nycflights13 package are in a form that minimizes redundancy of data. We will see that there are ways to merge (or join) the different tables together easily. We are capable of doing so because each of the tables has keys in common to relate one to another. This is an important property of normal forms of data. The process of decomposing data frames into less redundant tables without losing information is called normalization. More information is available on Wikipedia. We saw an example of this above with the airlines dataset. While the flights data frame could also include a column with the names of the airlines instead of the carrier code, this would be repetitive since there is a unique mapping of the carrier code to the name of the airline/carrier. Below an example is given showing how to join the airlines data frame together with the flights data frame by linking together the two datasets via a common key of &quot;carrier&quot;. Note that this “joined” data frame is assigned to a new data frame called joined_flights. if(!require(nycflights13)) install.packages(&quot;nycflights13&quot;, repos = &quot;http://cran.rstudio.org&quot;) library(dplyr) joined_flights &lt;- inner_join(x = flights, y = airlines, by = &quot;carrier&quot;) View(joined_flights) If we View this dataset, we see a new variable has been created called name. (We will see in Subsection 5.1.1 ways to change name to a more descriptive variable name.) 
More discussion about joining data frames together will be given in Chapter ??. We will see there that the names of the columns to be linked need not match as they did here with &quot;carrier&quot;. Review questions (RQ3.1) What are common characteristics of “tidy” datasets? (RQ3.2) What makes “tidy” datasets useful for organizing data? (RQ3.3) How many variables are presented in the table below? What does each row correspond to? (Hint: You may not be able to answer both of these questions immediately but take your best guess.) students faculty 4 2 6 3 (RQ3.4) The confusion you may have encountered in Question 4 is a common one those that work with data are commonly presented with. This dataset is not tidy. Actually, the dataset in Question 4 has three variables not the two that were presented. Make a guess as to what these variables are and present a tidy dataset instead of this untidy one given in Question 4. (RQ3.5) The actual data presented in Question 4 is given below in tidy data format: role Sociology? Type of School student TRUE Public student TRUE Public student TRUE Public student TRUE Public student FALSE Public student FALSE Public student FALSE Private student FALSE Private student FALSE Private student FALSE Private faculty TRUE Public faculty TRUE Public faculty FALSE Public faculty FALSE Private faculty FALSE Private What does each row correspond to? What are the different variables in this data frame? The Sociology? variable is known as a logical variable. What types of values does a logical variable take on? (RQ3.6) What are some advantages of data in normal forms? What are some disadvantages? 3.5 What’s to come? In Chapter ??, we will further explore the distribution of a variable in a related dataset to flights: the temp variable in the weather dataset. We’ll be interested in understanding how this variable varies in relation to the values of other variables in the dataset. 
We will see that visualization is often a powerful tool in helping us see what is going on in a dataset. It will be a useful way to expand on the str function we have seen here for tidy data. References "],
-["4-data-visualization-via-ggplot2.html", "4 Data Visualization via ggplot2 Needed packages 4.1 The Grammar of Graphics 4.2 Five Named Graphs - The 5NG 4.3 5NG#1: Scatter-plots 4.4 5NG#2: Line-graphs 4.5 5NG#3: Histograms 4.6 Facets 4.7 5NG#4: Boxplots 4.8 5NG#5: Barplots 4.9 Conclusion", " 4 Data Visualization via ggplot2 In Chapter 3, we discussed the importance of datasets being tidy. You will see in examples here why having a tidy dataset helps us immensely when plotting our data. In plotting our data, we will be able to gain valuable insights from our data that we couldn’t initially see from just looking at the raw data. We will focus on using Hadley Wickham’s ggplot2 package in doing so, which was developed to work specifically on datasets that are tidy. It provides an easy way to customize your plots and is based on data visualization theory given in The Grammar of Graphics (Wilkinson 2005). At the most basic level, graphics/plots/charts provide a nice way for us to get a sense for how quantitative variables compare in terms of their center and their spread. The most important thing to know about graphics is that they should be created to make it obvious for your audience to see the findings you want to get across. This requires a balance of not including too much in your plots, but also including enough so that relationships and interesting findings can be easily seen. As we will see, plots/graphics also help us to identify patterns and outliers in our data. We will see that a common extension of these ideas is to compare the distribution of one quantitative variable (i.e., what the spread of a variable looks like) as we go across the levels of a different categorical variable. Needed packages Before we proceed with this chapter, let’s load all the necessary packages, in particular the nycflights13 package introduced in Chapter 3 containing various data sets. 
library(dplyr) library(ggplot2) library(nycflights13) 4.1 The Grammar of Graphics We begin with a discussion of a theoretical framework for data visualization known as “The Grammar of Graphics”, which serves as the basis for the ggplot2 package. Much like the way we construct sentences in any language using a linguistic grammar (nouns, verbs, subjects, objects, etc.), the theoretical framework given by Leland Wilkinson (Wilkinson 2005) allows us to specify the components of a statistical graphic. 4.1.1 Components of Grammar In short, the grammar tells us that: A statistical graphic is a mapping of data variables to aesthetic attributes of geometric objects. Specifically, we can break a graphic into the following three essential components: data: the data set comprised of variables that we map. geom: the geometric object in question. This refers to our type of objects we can observe in our plot. For example, points, lines, bars, etc. aes: aesthetic attributes of the geometric object that we can perceive on a graphic. For example, x/y position, color, shape, and size. Each assigned aesthetic attribute can be mapped to a variable in our data set. If not assigned, they are set to defaults. 4.1.2 Napoleon’s March on Moscow In 1812, Napoleon led a French invasion of Russia, marching on Moscow. It was one of the biggest military disasters due in large part to the Russian winter. In 1869, a French civil engineer named Charles Joseph Minard published arguably one of the greatest statistical visualizations of all time which summarized this march: Figure 4.1: Minard’s Visualization of Napoleon’s March This was considered a revolution in statistical graphics because between the map on top and the line graph on the bottom, there are 6 dimensions of information (i.e. variables) being displayed on a 2-dimensional page. 
Let’s view this graphic through the lens of the Grammar of Graphics: Table 4.1: Grammar of Map (Top) and Line-Graph (Bottom) in Minard’s Graphic of Napoleon’s March data aes geom longitude x point latitude y point army size size path army direction color path data aes geom date x line &amp; text temperature y line &amp; text For example, the data variable longitude gets mapped to x aesthetic of the points geometric objects on the map while the annotated line-graph displays date and temperature variable information via its mapping to the x and y aesthetic of the line geometric object. 4.1.3 Other Components of the Grammar There are other components of the Grammar of Graphics we can control: facet: how to break up a plot into subsets statistical transformations: this includes smoothing, binning values into a histogram, or just itself untransformed &quot;identity&quot;. scales both convert data units to physical units the computer can display draw a legend and/or axes, which provide an inverse mapping to make it possible to read the original data values from the graph. coordinate system for x/y values: typically cartesian, but can also be polar, map position adjustments In this text, we will only focus on the first two: faceting (introduced in Section 4.6) and statistical transformations (in a limited sense when considering Barplots in Section 4.8); the other components are left to a more advanced text. This is not a problem when producing a plot as each of these components has default settings. There are other extra attributes that can be tweaked as well including the plot title, axes labels, and over-arching themes for the plot. In general, the Grammar of Graphics allows for customization but also a consistent framework that allows the user to easily tweak their creations as needed in order to convey a message about their data. 
4.1.4 The ggplot2 Package We introduce Hadley Wickham’s ggplot2 package, which is an implementation of the Grammar of Graphics for R (Wickham and Chang 2016). You may have noticed that a lot of previous text in this chapter is written in computer font. This is because the various components of the Grammar of Graphics are specified using the ggplot function, which expects at a bare minimum as arguments the data frame where the variables exist (the data argument) and the names of the variables to be plotted (the mapping argument). The names of the variables will be entered into the aes function as arguments where aes stands for “aesthetics”. The plot given above is not a histogram, but the output does show us a bit of what is going on with ggplot(data = weather, mapping = aes(x = temp)). It is producing a backdrop onto which we will “paint” elements. We next proceed by adding a layer—hence, the use of the + symbol—to the plot to produce a histogram. (Note also here that we don’t have to specify the data = and mapping = text in our function calls. This is covered in more detail in Chapter 5 of the “Getting Used to R, RStudio, and R Markdown” book (Ismay 2016)). You are encouraged to enter Return on your keyboard after entering the +. As we add more and more elements, it will be nice to keep them indented as you see below. Note that this will not work if you begin the line with the +. An excellent resource as you begin to create plots using the ggplot2 package is a cheatsheet that RStudio has put together entitled “Data Visualization with ggplot2” available by clicking here or by clicking the RStudio Menu Bar -&gt; Help -&gt; Cheatsheets -&gt; “Data Visualization with ggplot2” This covers more than what we’ve discussed in this chapter but provides nice visual descriptions of what each function produces. 
Review questions **`paste0(\"(RQ\", chap, \".\", (rq 4.2 Five Named Graphs - The 5NG For our purposes, we will be limiting consideration to five different types of graphs (note that in this text we use the terms “graphs”, “plots”, and “charts” interchangeably). We term these five named graphs the 5NG: scatter-plots line-graphs boxplots histograms barplots With this repertoire of plots, you can visualize a wide array of data variables thrown at you. We will discuss some variations of these, but with the 5NG in your toolbox you can do big things! Something we will also stress here is that certain plots only work for categorical/logical variables and others only for quantitative variables. You’ll want to quiz yourself often as we go along on which plot makes sense a given a particular problem set-up. 4.3 5NG#1: Scatter-plots The simplest of the 5NG are scatter-plots (also called bivariate plots); they allow you to investigate the relationship between two continuous variables. While you may already be familiar with such plots, let’s view it through the lens of the Grammar of Graphics. Specifically, we will graphically investigate the relationship between the following two continuous variables in the flights data frame: dep_delay: departure delay on the horizontal “x” axis arr_delay: arrival delay on the vertical “y” axis for Alaska Airlines flights leaving NYC in 2013. This requires paring down the flights data frame to a smaller data frame alaska_flights consisting of only Alaska Airlines (carrier code “AS”) flights. data(flights) alaska_flights &lt;- flights %&gt;% filter(carrier == &quot;AS&quot;) This code snippet makes use of functions in the dplyr package for data manipulation to achieve our goal: it takes the flights data frame and filters it to only return the rows which meet the condition carrier == &quot;AS&quot; (recall equality is specified with == and not =). You will see many more examples using this function in Chapter ??. 
Learning check (LC3.1) Take a look at both the flights and alaska_flights data frames by running View(flights) and View(alaska_flights) in the console. In what respect do these data frames differ? 4.3.1 Scatter-plots via geom_point We proceed to create the scatter-plot using the ggplot() function: ggplot(data=alaska_flights, aes(x = dep_delay, y = arr_delay)) + geom_point() Figure 4.2: Arrival Delays vs Departure Delays for Alaska Airlines flights from NYC in 2013 Let’s break down this keeping in mind our discussion in Section 4.1: Within the ggplot() function call, we specify two of the components of the grammar: The data frame to be alaska_flights by setting data=alaska_flights The aesthetic mapping by setting aes(x = dep_delay, y = arr_delay). Specifically dep_delay maps to the x position arr_delay maps to the y position We add a layer to the ggplot() function call using the + sign The layer in question specifies the third component of the grammar: the geometric object in question. In this case the geometric object are points, set by specifying geom_point() In Figure 4.2 we see that a positive relationship exists between dep_delay and arr_delay: as departure delays increase, arrival delays tend to also increase. We also note that the majority of points fall near the point (0, 0). There is a large mass of points clustered there. Learning check (LC3.2) What are some practical reasons why dep_delay and arr_delay have a positive relationship? (LC3.3) What variables (not necessarily in the flights data frame) would you expect to have a negative correlation (i.e. a negative relationship) with dep_delay? Why? Remember that we are focusing on continuous variables here. (LC3.4) Why do you believe there is a cluster of points near (0, 0)? What does (0, 0) correspond to in terms of the Alaskan flights? (LC3.5) What are some other features of the plot that stand out to you? 
(LC3.6) Create a new scatter-plot using different variables in the alaska_flights data frame by modifying the example above. 4.3.2 Over-Plotting The large mass of points near (0, 0) can cause some confusion. This is the result of a phenomenon called over-plotting. As one may guess, this corresponds to values being plotted on top of each other over and over again. It is often difficult to know just how many values are plotted in this way when looking at a basic scatter-plot as we have here. There are two ways to address this issue: By adjusting the transparency of the points via the alpha argument By jittering the points via geom_jitter() The first way of relieving over-plotting is by changing the alpha argument to geom_point() which controls the transparency of the points. By default, this value is set to 1. We can change this value to a smaller fraction to change the transparency of the points in the plot: ggplot(data=alaska_flights, aes(x = dep_delay, y = arr_delay)) + geom_point(alpha = 0.2) Figure 4.3: Delay scatterplot with alpha=0.2 Note how this function call is identical to the one in Section 4.3, but with alpha = 0.2 added as an argument to geom_point(). The second way of relieving over-plotting is to jitter the points a bit. In other words, we are going to add just a bit of random noise to the points to better see them and remove some of the over-plotting. You can think of “jittering” as shaking the points a bit on the plot. Instead of using geom_point, we use geom_jitter to perform this shaking and specify around how much jitter to add with the width and height arguments. This corresponds to how hard you’d like to shake the plot in units corresponding to those for both the horizontal and vertical variables (in this case minutes). 
ggplot(data=alaska_flights, aes(x = dep_delay, y = arr_delay)) + geom_jitter(width = 30, height = 30) Figure 4.4: Jittered delay scatterplot Note how this function call is identical to the one in Section 4.3.1, but with geom_point() replaced with geom_jitter(). The plot in Figure 4.4 helps us a little bit in getting a sense for the over-plotting, but with a relatively large dataset like this one (714 flights), it can be argued that changing the transparency of the points by setting alpha proved more effective. Learning check (LC3.7) Why is setting the alpha argument value useful with scatter-plots? What further information does it give you that a regular scatter-plot cannot? (LC3.8) After viewing Figure 4.3 above, give a range of arrival delays and departure delays that occur most frequently. How has that region changed compared to when you observed the same plot without the alpha = 0.2 set in Figure 4.2? 4.3.3 Summary Scatter-plots display the relationship between two continuous variables and may be the most used plot today as they can provide an immediate way to see the trend in one variable versus another. If you try to create a scatter-plot where either one of the two variables is not quantitative however, you will get strange results. Be careful! With medium to large datasets, you may need to play with either geom_jitter or the alpha argument in order to get a good feel for relationships in your data. This tweaking is often a fun part of data visualization since you’ll have the chance to see different relationships come about as you make subtle changes to your plots. 4.4 5NG#2: Line-graphs The next of the 5NG is a line-graph. They are most frequently used when the x-axis represents time and the y-axis represents some other numerical variable; such plots are known as time series. Time represents a variable that is connected together by each day following the previous day. In other words, time has a natural ordering. 
Line-graphs should be avoided when there is not a clear sequential ordering to the explanatory variable i.e. the x-variable. Our focus turns to the temp variable in this weather dataset. By looking over the weather dataset by typing View(weather) in the console and running ?weather to bring up the help file, we can see that the temp variable corresponds to hourly temperature (in Fahrenheit) recordings at weather stations near airports in New York City. Instead of considering all hours in 2013 for all three airports in NYC, let’s focus on the hourly temperature at Newark airport (origin code “EWR”) for the first 15 days in January 2013. The weather data frame in the nycflights13 package contains this data, but we first need to filter it to only include those rows that correspond to Newark in the first 15 days of January. data(weather) early_january_weather &lt;- weather %&gt;% filter(origin==&quot;EWR&quot; &amp; month == 1 &amp; day &lt;= 15) This is very similar to the previous use of the filter command in Section 4.3, however we now use the &amp; operator. The above selects only those rows in weather where origin==&quot;EWR&quot; and month == 1 and day &lt;= 15. Learning check (LC3.9) Take a look at both the weather and early_january_weather data frames by running View(weather) and View(early_january_weather) in the console. In what respect do these data frames differ? (LC3.10) The weather data is recorded hourly. Why does the time_hour variable correctly identify the hour of the measurement and not just the hour variable? 
4.4.1 Line-graphs via geom_line We plot a line-graph of hourly temperature using geom_line(): ggplot(data=early_january_weather, aes(x=time_hour, y=temp)) + geom_line() Figure 4.5: Hourly Temperature in Newark for Jan 1-15 2013 Much as with the ggplot() call in Section 4.3.1, we specify the components of the Grammar of Graphics: Within the ggplot() function call, we specify two of the components of the grammar: The data frame to be early_january_weather by setting data=early_january_weather The aesthetic mapping by setting aes(x = time_hour, y = temp). Specifically time_hour (i.e. the time variable) maps to the x position temp maps to the y position We add a layer to the ggplot() function call using the + sign The layer in question specifies the third component of the grammar: the geometric object in question. In this case the geometric object is a line, set by specifying geom_line() Learning check (LC3.11) Why should line-graphs be avoided when there is not a clear ordering of the horizontal axis? (LC3.12) Why are line-graphs frequently used when time is the explanatory variable? (LC3.13) Plot a time series of a variable other than temp for Newark Airport in the first 15 days of January 2013. 4.4.2 Summary Line-graphs, just like scatter-plots, display the relationship between two continuous variables. However the variable on the x-axis (i.e. the explanatory variable) should have a natural ordering, like some notion of time. We can mislead our audience if that isn’t the case. 4.5 5NG#3: Histograms Let’s consider the temp variable in the weather data frame once again, but now unlike with the line-graphs in Section 4.4, let’s say we don’t care about the relationship of temperature to time, but rather we care about the (statistical) distribution of temperatures. 
We could just produce points where each of the different values appear on something similar to a number line: Figure 4.6: Strip Plot of Hourly Temperature Recordings from NYC in 2013 This gives us a general idea of how the values of temp differ. We see that temperatures vary from around 11 up to 100 degrees Fahrenheit. The area between 40 and 60 degrees appears to have more points plotted than outside that range. 4.5.1 Histograms via geom_histogram What is commonly produced instead of this strip plot is a plot known as a histogram. The histogram shows how many elements of a single numerical variable fall in specified bins. In this case, these bins may correspond to between 0-10°F, 10-20°F, etc. We produce a histogram of the hourly temperatures at all three NYC airports in 2013: ggplot(data = weather, mapping = aes(x = temp)) + geom_histogram() ## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`. ## Warning: Removed 1 rows containing non-finite values (stat_bin). Figure 4.7: Histogram of Hourly Temperature Recordings from NYC in 2013 Note here: There is only one variable being mapped in aes(): the single continuous variable temp. You don’t need to compute the y-aesthetic: it gets computed automatically. We set the geometric object to be geom_histogram() We got a warning message of 1 rows containing non-finite values being removed. This is due to one of the values of temperature being missing. R is alerting us that this happened. 4.5.2 Adjusting the Bins We can adjust the number/size of the bins two ways: By adjusting the number of bins via the bins argument By adjusting the width of the bins via the binwidth argument First, we have the power to specify how many bins we would like to put the data into as an argument in the geom_histogram function. By default, this is chosen to be 30 somewhat arbitrarily; we received a warning above our plot that this was done. 
ggplot(data = weather, mapping = aes(x = temp)) + geom_histogram(bins = 60) Figure 4.8: Histogram of Hourly Temperature Recordings from NYC in 2013 - 60 Bins Second, instead of specifying the number of bins, we can also specify the width of the bins by using the binwidth argument in the geom_histogram function. ggplot(data = weather, mapping = aes(x = temp)) + geom_histogram(binwidth = 10) Figure 4.9: Histogram of Hourly Temperature Recordings from NYC in 2013 - Binwidth = 10 Learning check (LC3.14) What does changing the number of bins from 30 to 60 tell us about the distribution of temperatures? (LC3.15) Would you classify the distribution of temperatures as symmetric or skewed? (LC3.16) What would you guess is the “center” value in this distribution? Why did you make that choice? (LC3.17) Is this data spread out greatly from the center or is it close? Why? 4.5.3 Summary Histograms, unlike scatter-plots and line-graphs, presents information on only a single continuous variable. In particular they are visualizations of the (statistical) distribution of values. 4.6 Facets Before continuing the 5NG, we briefly introduce a new concept called faceting. Faceting is used when we’d like to create small multiples of the same plot over a different categorical variable. By default, all of the small multiples will have the same vertical axis. For example, suppose we were interested in looking at how the temperature histograms we saw in Chapter 4.5 varied by month. This is what is meant by “the distribution of a variable over another variable”: temp is one variable and month is the other variable. In order to look at histograms of temp for each month, we add a layer facet_wrap(~month). ggplot(data = weather, aes(x = temp)) + geom_histogram(binwidth = 5) + facet_wrap(~month) Figure 4.10: Faceted histogram As we might expect, the temperature tends to increase as summer approaches and then decrease as winter approaches. 
Learning check (LC3.18) What other things do you notice about the faceted plot above? How does a faceted plot help us see how relationships between two variables? (LC3.19) What do the numbers 1-12 correspond to in the plot above? What about 25, 50, 75, 100? (LC3.21) For which types of datasets would these types of faceted plots not work well in comparing relationships between variables? Give an example describing the variability of the variables and other important characteristics. (LC3.22) Does the temp variable in the weather data set have a lot of variability? Why do you say that? 4.7 5NG#4: Boxplots While using faceted histograms can provide a way to compare distributions of a continuous variable split by groups of a categorical variable as in Chapter 4.6, an alternative plot called a boxplot (also called a side-by-side boxplot) achieves the same task. The boxplot uses the information provided in the five-number summary referred to in Appendix A. It gives a way to compare this summary information across the different levels of a categorical variable. 4.7.1 Boxplots via geom_boxplot Let’s create a boxplot to compare the monthly temperatures as we did above with the faceted histograms. ggplot(data = weather, aes(x = month, y = temp)) + geom_boxplot() Figure 4.11: Invalid boxplot specification Note the first warning that is given here. (The second one corresponds to missing values in the data frame and it is turned off on subsequent plots.) Observe that this plot does not look like what we were expecting. We were expecting to see the distribution of temperatures for each month (so 12 different boxplots). This gives us the overall boxplot without any other groupings. We can get around this by introducing a new function for our x variable: ggplot(data = weather, mapping = aes(x = factor(month), y = temp)) + geom_boxplot() Figure 4.12: Month by temp boxplot We have introduced a new function called factor() here. 
One of the things this function does is to convert a discrete value like month (1, 2, …, 12) into a categorical variable. The “box” part of this plot represents the 25th percentile, the median (50th percentile), and the 75th percentile. The dots correspond to outliers. (The specific formulation for these outliers is discussed in Appendix A.) The lines show how the data varies that is not in the center 50% defined by the first and third quartiles. Longer lines correspond to more variability and shorter lines correspond to less variability. Learning check (LC3.23) What does the dot at the bottom of the plot for May correspond to? Explain what might have occurred in May to produce this point. (LC3.24) Which months have the highest variability in temperature? What do you think are the reasons for this? (LC3.25) We looked at the distribution of a continuous variable over a categorical variable here with this boxplot. Why can’t we look at the distribution of one continuous variable over the distribution of another continuous variable? Say temperature across pressure, for example? (LC3.26) Boxplots provide a simple way to identify outliers. Why may outliers be easier to identify when looking at a boxplot instead of a faceted histogram? 4.7.2 Summary Boxplots provide a way to compare and contrast the distribution of one quantitative variable across multiple levels of one categorical variable. One can easily look to see where the median falls across the different groups by looking at the center line in the box. You can also see how spread out the variable is across the different groups by looking at the width of the box and also how far out the lines stretch from the box. If the lines stretch far from the box but the box has a small width, the variability of the values closer to the center is much smaller than the variability of the values at the outer ends of the distribution. Lastly, outliers are even more easily identified when looking at a boxplot than when looking at a histogram. 
4.8 5NG#5: Barplots Both histograms and boxplots represent ways to visualize the variability of continuous variables. Another common task is to present the distribution of a categorical variable. This is a simpler task since we will be interested in how many elements from our data fall into the different categories of the categorical variable. 4.8.1 Barplots via geom_bar Frequently, the best way to visualize these different counts (also known as frequencies) is via a barplot. Consider the distribution of airlines that flew out of New York City in 2013. Here we explore the number of flights from each airline/carrier. This can be plotted by invoking the geom_bar function in ggplot2: ggplot(data = flights, mapping = aes(x = carrier)) + geom_bar() Figure 4.13: Number of flights departing NYC in 2013 by airline We see that United Air Lines, JetBlue Airways, and ExpressJet Airlines had the most flights depart New York City in 2013. To get the actual number of flights by each airline we can use the count function in the dplyr package on the carrier variable in flights, which we will introduce formally in Chapter 5. ## # A tibble: 1 × 1 ## `1.n` ## &lt;int&gt; ## 1 336776 Learning check (LC3.27) Why are histograms inappropriate for visualizing categorical variables? (LC3.28) What is the difference between histograms and barplots? (LC3.29) How many Envoy Air flights departed NYC in 2013? (LC3.30) What was the seventh highest airline in terms of departed flights from NYC in 2013? How can we better present the table to get this answer quickly? 4.8.2 Must avoid pie charts! Unfortunately, one of the most common plots seen today for categorical data is the pie chart. While they may seem harmless enough, they actually present a problem in that humans are unable to judge angles well. As Naomi Robbins describes in her book “Creating More Effective Graphs” (Robbins 2013), we overestimate angles greater than 90 degrees and we underestimate angles less than 90 degrees. 
In other words, it is difficult for us to determine relative size of one piece of the pie compared to another. Let’s examine our previous barplot example on the number of flights departing NYC by airline. This time we will use a pie chart. As you review this chart, try to identify how much larger the portion of the pie is for ExpressJet Airlines (EV) compared to US Airways (US), what the third largest carrier is in terms of departing flights, and how many carriers have fewer flights than United Airlines (UA)? Figure 4.14: The dreaded pie chart While it is quite easy to look back at the barplot to get the answer to these questions, it’s quite difficult to get the answers correct when looking at the pie graph. Barplots can always present the information in a way that is easier for the eye to determine relative position. There may be one exception from Nathan Yau at FlowingData.com but we will leave this for the reader to decide: Figure 4.15: The only good pie chart Learning check (LC3.31) Why should pie charts be avoided and replaced by barplots? (LC3.32) What is your opinion as to why pie charts continue to be used? 4.8.3 Using barplots to compare two variables Barplots are the go-to way to visualize the frequency of different categories of a categorical variable. They make it easy to order the counts and to compare one group’s frequency to another. Another use of barplots (unfortunately, sometimes inappropriately and confusingly) is to compare two categorical variables together. Let’s examine the distribution of outgoing flights from NYC by carrier and airport. We begin by getting the names of the airports in NYC that were included in the flights dataset. Remember from Chapter 3 that this can be done by using the inner_join function (more in Chapter ??). 
flights_namedports &lt;- flights %&gt;% inner_join(airports, by = c(&quot;origin&quot; = &quot;faa&quot;)) After running View(flights_namedports), we see that name now corresponds to the name of the airport as referenced by the origin variable. We will now plot carrier as the horizontal variable. When we specify geom_bar, it will specify count as being the vertical variable. A new addition here is fill = name. Look over what was produced from the plot to get an idea of what this argument gives. ggplot(data = flights_namedports, mapping = aes(x = carrier, fill = name)) + geom_bar() Figure 4.16: Stacked barplot comparing the number of flights by carrier and airport This plot is what is known as a stacked barplot. While simple to make, it often leads to many problems. Learning check (LC3.33) What kinds of questions are not easily answered by looking at the above figure? (LC3.34) What can you say, if anything, about the relationship between airline and airport in NYC in 2013 in regards to the number of departing flights? Another variation on the stacked barplot is the side-by-side barplot. ggplot(data = flights_namedports, mapping = aes(x = carrier, fill = name)) + geom_bar(position = &quot;dodge&quot;) Figure 4.17: Side-by-side barplot comparing the number of flights by carrier and airport Learning check (LC3.35) Why might the side-by-side barplot be preferable to a stacked barplot in this case? (LC3.36) What are the disadvantages of using a side-by-side barplot, in general? Lastly, an often preferred type of barplot is the faceted barplot. We already saw this concept of faceting and small multiples in Section 4.6. This gives us a nicer way to compare the distributions across both carrier and airport/name. ggplot(data = flights_namedports, mapping = aes(x = carrier, fill = name)) + geom_bar() + facet_grid(name ~ .) Figure 4.18: Faceted barplot comparing the number of flights by carrier and airport Note how the facet_grid function arguments are written here. 
We are wanting the names of the airports vertically and the carrier listed horizontally. As you may have guessed, this argument and other formulas of this sort in R are in y ~ x order. We will see more examples of this in Chapter ??. Learning check (LC3.37) Why is the faceted barplot preferred to the side-by-side and stacked barplots in this case? (LC3.38) What information about the different carriers at different airports is more easily seen in the faceted barplot? 4.8.4 Summary Barplots are the preferred way of displaying categorical variables. They are easy-to-understand and to make comparisons across groups of a categorical variable. When dealing with more than one categorical variable, faceted barplots are frequently preferred over side-by-side or stacked barplots. Stacked barplots are sometimes nice to look at, but it is quite difficult to compare across the levels since the sizes of the bars are all of different sizes. Side-by-side barplots can provide an improvement on this, but the issue about comparing across groups still must be dealt with. 4.9 Conclusion 4.9.1 What’s to come? In Chapter ??, we’ll further explore data by grouping our data, creating summaries based on those groupings, filtering our data to match conditions, selecting specific columns of our data, and other manipulations with our data including defining new columns/variables. These data manipulation procedures will go hand-in-hand with the data visualizations you’ve produced here. 4.9.2 Script of R code An R script file of all R code used in this chapter is available here. References "],
-["5-data-manipulation-via-dplyr.html", "5 Data Manipulation via dplyr Needed packages 5.1 The pipe %&gt;% 5.2 Four Main Verbs - The 4MV 5.3 Other verbs 5.4 Joining/merging data frames 5.5 Script of R code 5.6 What’s to come?", " 5 Data Manipulation via dplyr Let’s briefly recap where we have been so far and where we are headed. In Chapter 3, we discussed what it means for data to be tidy. We saw that this refers to observational units corresponding to rows and variables being stored in columns. The entries in the data frame correspond to different combinations of observational units and variables. In the flights data frame, we saw that each row corresponded to a different flight leaving New York City. (In other words, the observational unit of that tidy data frame is a flight.) The variables are listed as columns and for flights they include both quantitative variables like dep_delay and distance but also categorical variables like carrier and origin. An entry in the table corresponds to a particular flight on a given day and a particular value of a given variable representing that flight. We saw in Chapter ?? that organizing data in this tidy way makes it easy for us to produce graphics. We can simply specify what variable/column we would like on one axis, what variable we’d like on the other axis, and what type of plot we’d like to make. We can also do things such as changing the color by another variable or change the size of our points by a fourth variable given this tidy data set. In Chapter ??, we also introduced some ways to summarize and manipulate data to suit your needs. This chapter focuses more on the details of this by giving a variety of examples using the four main verbs in the dplyr package (Wickham and Francois 2016). There are more advanced operations that can be done than these and you’ll see some examples of this near the end of the chapter. 
Needed packages library(dplyr) library(ggplot2) library(nycflights13) library(knitr) 5.1 The pipe %&gt;% Just as the + sign was used to add layers to a plot created using ggplot we will use the pipe operator (%&gt;%) to chain together dplyr functions. We read the pipe operator as “and then”. The %&gt;% operator allows us to go from one step in dplyr to the next easily so we can filter our data frame to only focus on a few rows, and then take that filtered data set, and group_by another variable, and then lastly summarize this grouped data to calculate the mean for each level of the group. The piping syntax will be our major focus throughout the rest of this book and you’ll find that you’ll quickly be addicted to the chaining with some practice. If you’d like to see more examples on using dplyr, the 4MV (in addition to some other dplyr verbs), and %&gt;% with the nycflights13 data set, you can check out Chapter 5 of Hadley and Garrett’s book (Grolemund and Wickham 2016). 5.2 Four Main Verbs - The 4MV The d in dplyr stands for data frames so the functions here work when you are working with objects of the data frame type. It’s most important for you to focus on the four most commonly used functions that help us manipulate and summarize data. A description of these verbs follows with each subsection devoted to seeing an example of that verb in play (or a combination of a few verbs): filter: Pick rows based on conditions about their values summarize: Create summary measures of variables (or groups of observations on variables using group_by) mutate: Make a new variable in the data frame arrange: Sort the rows based on one or more variables Just as we had the 5NG (The Five Named Graphs in Chapter ?? 
using ggplot2), we have the 4MV here (The Four Main Verbs in dplyr): 5.2.1 Filter observations using filter Figure 5.1: Filter diagram from Data Wrangling with dplyr and tidyr cheatsheet All of the 4MVs follow the same syntax with the argument before the pipe being the name of the data frame and then the name of the verb with other arguments specifying which criteria you’d like the verb to work with in parentheses. The filter function here works much like the “Filter” option in Microsoft Excel. It allows you to specify criteria about values of a variable in your data set and then chooses only those rows that match those criteria. We begin by focusing only on flights from New York City to Portland, Oregon. The dest code (or airport code) for Portland, Oregon is &quot;PDX&quot;: portland_flights &lt;- flights %&gt;% filter(dest == &quot;PDX&quot;) portland_flights ## # A tibble: 1,354 × 19 ## year month day dep_time sched_dep_time dep_delay arr_time ## &lt;int&gt; &lt;int&gt; &lt;int&gt; &lt;int&gt; &lt;int&gt; &lt;dbl&gt; &lt;int&gt; ## 1 2013 1 1 1739 1740 -1 2051 ## 2 2013 1 1 1805 1757 8 2117 ## 3 2013 1 1 2052 2029 23 2349 ## 4 2013 1 2 804 805 -1 1039 ## 5 2013 1 2 1552 1550 2 1853 ## 6 2013 1 2 1727 1720 7 2042 ## 7 2013 1 2 1738 1740 -2 2028 ## 8 2013 1 2 2024 2029 -5 2314 ## 9 2013 1 3 1755 1745 10 2110 ## 10 2013 1 3 1814 1727 47 2108 ## # ... with 1,344 more rows, and 12 more variables: ## # sched_arr_time &lt;int&gt;, arr_delay &lt;dbl&gt;, carrier &lt;chr&gt;, flight &lt;int&gt;, ## # tailnum &lt;chr&gt;, origin &lt;chr&gt;, dest &lt;chr&gt;, air_time &lt;dbl&gt;, ## # distance &lt;dbl&gt;, hour &lt;dbl&gt;, minute &lt;dbl&gt;, time_hour &lt;dttm&gt; Note the second equals sign here. You are almost guaranteed to make the mistake at least once of only including one equals sign. Let’s see what happens when we make this error: portland_flights &lt;- flights %&gt;% filter(dest = &quot;PDX&quot;) Error: filter() takes unnamed arguments. Do you need `==`? 
You should run View(portland_flights) to glance at the data in spreadsheet form and ensure that only flights heading to Portland are chosen here. You can combine multiple criteria together using operators that make comparisons: | corresponds to “or” &amp; corresponds to “and” We can often skip the use of &amp; and just separate our conditions with a comma. You’ll see this in the example below. In addition, you can use other mathematical checks (similar to ==): &gt; corresponds to “greater than” &lt; corresponds to “less than” &gt;= corresponds to “greater than or equal to” &lt;= corresponds to “less than or equal to” != corresponds to “not equal to” To see many of these in action, let’s select all flights that left JFK airport heading to Burlington, Vermont (&quot;BTV&quot;) or Seattle, Washington (&quot;SEA&quot;) in the months of October, November, or December: btv_sea_flights_fall &lt;- flights %&gt;% filter( origin == &quot;JFK&quot;, (dest == &quot;BTV&quot;) | (dest == &quot;SEA&quot;), month &gt;= 10) Another example uses the ! to pick rows that DON’T match a condition. Here we are referring to excluding the Northern Hemisphere summer months of June, July, and August. not_summer_flights &lt;- flights %&gt;% filter(!between(month, 6, 8)) not_summer_flights ## # A tibble: 249,781 × 19 ## year month day dep_time sched_dep_time dep_delay arr_time ## &lt;int&gt; &lt;int&gt; &lt;int&gt; &lt;int&gt; &lt;int&gt; &lt;dbl&gt; &lt;int&gt; ## 1 2013 1 1 517 515 2 830 ## 2 2013 1 1 533 529 4 850 ## 3 2013 1 1 542 540 2 923 ## 4 2013 1 1 544 545 -1 1004 ## 5 2013 1 1 554 600 -6 812 ## 6 2013 1 1 554 558 -4 740 ## 7 2013 1 1 555 600 -5 913 ## 8 2013 1 1 557 600 -3 709 ## 9 2013 1 1 557 600 -3 838 ## 10 2013 1 1 558 600 -2 753 ## # ... 
with 249,771 more rows, and 12 more variables: ## # sched_arr_time &lt;int&gt;, arr_delay &lt;dbl&gt;, carrier &lt;chr&gt;, flight &lt;int&gt;, ## # tailnum &lt;chr&gt;, origin &lt;chr&gt;, dest &lt;chr&gt;, air_time &lt;dbl&gt;, ## # distance &lt;dbl&gt;, hour &lt;dbl&gt;, minute &lt;dbl&gt;, time_hour &lt;dttm&gt; To check that we are correct here we can use the count function in the dplyr package on the month variable in our not_summer_flights data frame to ensure June, July, and August are not selected: not_summer_flights %&gt;% count(month) ## # A tibble: 1 × 1 ## `1.n` ## &lt;int&gt; ## 1 249781 The function between is a shortcut. We could also have written the following to get the same result: not_summer2 &lt;- flights %&gt;% filter(month &lt;= 5 | month &gt;= 9) not_summer2 %&gt;% count(month) ## # A tibble: 1 × 1 ## `1.n` ## &lt;int&gt; ## 1 249781 Learning check (LC5.1) What’s another way using ! we could filter only the rows that are not summer months (June, July, or August) in the flights data frame? 5.2.2 Summarize variables using summarize Figure 5.2: Summarize diagram from Data Wrangling with dplyr and tidyr cheatsheet Figure 5.3: Another summarize diagram from Data Wrangling with dplyr and tidyr cheatsheet We saw in Subsection ?? a way to calculate the standard deviation and mean of the temperature variable temp in the weather data frame of nycflights. We can do so in one step using the summarize function in dplyr: weather %&gt;% summarize(mean = mean(temp), std_dev = sd(temp)) ## # A tibble: 1 × 2 ## mean std_dev ## &lt;dbl&gt; &lt;dbl&gt; ## 1 NA NA What happened here? The mean and the standard deviation temperatures are missing? Remember that by default the mean and sd functions do not ignore missing values. 
We need to specify TRUE for the na.rm parameter: summary_temp &lt;- weather %&gt;% summarize(mean = mean(temp, na.rm = TRUE), std_dev = sd(temp, na.rm = TRUE)) summary_temp ## # A tibble: 1 × 2 ## mean std_dev ## &lt;dbl&gt; &lt;dbl&gt; ## 1 55.20351 17.78212 % summarize(std_dev = sd(temp, na.rm = TRUE)) does not work --> We’ve created a small data frame here called summary_temp that includes both the mean and the std_dev of the temp variable in weather. If we’d like to access either of these values directly we can use the $ to specify a column in a data frame: summary_temp$mean ## [1] 55.20351 summary_temp$std_dev ## [1] 17.78212 It’s often more useful to summarize a variable based on the groupings of another variable. Let’s say we were interested in the mean and standard deviation of temperatures for each month. We believe that you will be amazed at just how simple this is: Figure 5.4: Group by and summarize diagram from Data Wrangling with dplyr and tidyr cheatsheet summary_tempXmonth &lt;- weather %&gt;% group_by(month) %&gt;% summarize(mean = mean(temp, na.rm = TRUE), std_dev = sd(temp, na.rm = TRUE)) summary_tempXmonth ## # A tibble: 12 × 3 ## month mean std_dev ## &lt;dbl&gt; &lt;dbl&gt; &lt;dbl&gt; ## 1 1 35.64127 10.185459 ## 2 2 34.15454 6.940228 ## 3 3 39.81404 6.224948 ## 4 4 51.67094 8.785250 ## 5 5 61.59185 9.608687 ## 6 6 72.14500 7.603356 ## 7 7 80.00967 7.147631 ## 8 8 74.40495 5.171365 ## 9 9 67.42582 8.475824 ## 10 10 60.03305 8.829652 ## 11 11 45.10893 10.502249 ## 12 12 38.36811 9.940822 By simply grouping the weather data set by month first and then passing this new data frame into summarize we get a resulting data frame that shows the mean and standard deviation temperature for each month in New York City. Another useful function is the n function which gives a count of how many entries appeared in the groupings. 
Suppose we’d like to get a sense for how many flights departed each of the three airports in New York City: by_origin &lt;- flights %&gt;% group_by(origin) %&gt;% summarize(count = n()) by_origin ## # A tibble: 3 × 2 ## origin count ## &lt;chr&gt; &lt;int&gt; ## 1 EWR 120835 ## 2 JFK 111279 ## 3 LGA 104662 We see that Newark (&quot;EWR&quot;) had the most flights departing in 2013 followed by &quot;JFK&quot; and lastly by LaGuardia (&quot;LGA&quot;). Learning check (LC5.2) Recall from Chapter ?? when we looked at plots of temperatures by months in NYC. What does the standard deviation column in the summary_tempXmonth data frame tell us about temperatures in New York City throughout the year? (LC5.3) What code would be required to get the mean and standard deviation temperature for each day in 2013 for NYC? (LC5.4) How could we identify how many flights left each of the three airports in each of the months of 2013? 5.2.3 Create new variables/change old variables using mutate Figure 5.5: Mutate diagram from Data Wrangling with dplyr and tidyr cheatsheet When looking at the flights data set, there are some clear additional variables that could be calculated based on the values of variables already in the data set. Passengers are often frustrated when their flight departs late, but change their mood a bit if pilots can make up some time during the flight to get them to their destination close to when they expected to land. This is commonly referred to as “gain” and we will create this variable using the mutate function. Note that we have also overwritten the flights data frame with what it was before as well as an additional variable gain here. 
flights &lt;- flights %&gt;% mutate(gain = arr_delay - dep_delay) We can now look at summary measures of this gain variable and even plot it in the form of a histogram: gain_summary &lt;- flights %&gt;% summarize( min = min(gain, na.rm = TRUE), q1 = quantile(gain, 0.25, na.rm = TRUE), median = quantile(gain, 0.5, na.rm = TRUE), q3 = quantile(gain, 0.75, na.rm = TRUE), max = max(gain, na.rm = TRUE), mean = mean(gain, na.rm = TRUE), sd = sd(gain, na.rm = TRUE), missing = sum(is.na(gain)) ) gain_summary ## # A tibble: 1 × 8 ## min q1 median q3 max mean sd missing ## &lt;dbl&gt; &lt;dbl&gt; &lt;dbl&gt; &lt;dbl&gt; &lt;dbl&gt; &lt;dbl&gt; &lt;dbl&gt; &lt;int&gt; ## 1 -109 -17 -7 3 196 -5.659779 18.04365 9430 We’ve recreated the summary function we saw in Chapter ?? here using the summarize function in dplyr. library(ggplot2) ggplot(data = flights, mapping = aes(x = gain)) + geom_histogram(color = &quot;white&quot;, bins = 20) Figure 5.6: Histogram of gain variable We can also create multiple columns at once and even refer to columns that were just created in a new column. Hadley produces one such example in Chapter 5 of “R for Data Science” (Grolemund and Wickham 2016): flights_plus &lt;- flights %&gt;% mutate( gain = arr_delay - dep_delay, hours = air_time / 60, gain_per_hour = gain / hours ) Learning check (LC5.5) What do positive values of the gain variable in flights_plus correspond to? What about negative values? And what about a zero value? (LC5.6) Could we create the dep_delay and arr_delay columns by simply subtracting dep_time from sched_dep_time and similarly for arrivals? Try the code out and explain any differences between the result and what actually appears in flights. (LC5.7) What can we say about the distribution of gain? Describe it in a few sentences using the plot and the gain_summary data frame values. 
5.2.4 Reorder the data frame using arrange As you may have thought about with the data frames we’ve worked with so far in the book, one of the most common things you’d like to do is sort the data frames by a specific column. Have you ever been asked to calculate a median by hand? This requires you to put the data in order from smallest to highest in value. The dplyr package has a function called arrange that we will use to sort/reorder our data according to the values of the specified variable. This is most frequently used after we have used the group_by and summarize functions as we will see. Let’s suppose we were interested in determining the most frequent destination airports from New York City in 2013: freq_dest &lt;- flights %&gt;% group_by(dest) %&gt;% summarize(num_flights = n()) freq_dest ## # A tibble: 105 × 2 ## dest num_flights ## &lt;chr&gt; &lt;int&gt; ## 1 ABQ 254 ## 2 ACK 265 ## 3 ALB 439 ## 4 ANC 8 ## 5 ATL 17215 ## 6 AUS 2439 ## 7 AVL 275 ## 8 BDL 443 ## 9 BGR 375 ## 10 BHM 297 ## # ... with 95 more rows You’ll see that by default the values of dest are displayed in alphabetical order here. Remember to use View() in the R Console to look at all the values of freq_dest in spreadsheet format. We are interested in finding those airports that appear most: freq_dest %&gt;% arrange(num_flights) ## # A tibble: 105 × 2 ## dest num_flights ## &lt;chr&gt; &lt;int&gt; ## 1 LEX 1 ## 2 LGA 1 ## 3 ANC 8 ## 4 SBN 10 ## 5 HDN 15 ## 6 MTJ 15 ## 7 EYW 17 ## 8 PSP 19 ## 9 JAC 25 ## 10 BZN 36 ## # ... with 95 more rows This is actually giving us the opposite of what we are looking for. It tells us the least frequent destination airports first. 
To switch the ordering to be descending instead of ascending we use the desc function: freq_dest %&gt;% arrange(desc(num_flights)) ## # A tibble: 105 × 2 ## dest num_flights ## &lt;chr&gt; &lt;int&gt; ## 1 ORD 17283 ## 2 ATL 17215 ## 3 LAX 16174 ## 4 BOS 15508 ## 5 MCO 14082 ## 6 CLT 14064 ## 7 SFO 13331 ## 8 FLL 12055 ## 9 MIA 11728 ## 10 DCA 9705 ## # ... with 95 more rows 5.3 Other verbs 5.3.1 Select variables using select Figure 5.7: Select diagram from Data Wrangling with dplyr and tidyr cheatsheet We’ve seen that the flights data frame in the nycflights13 package contains many different variables (19 in fact). You can identify this by running the dim function or the ncol function: data(flights) dim(flights) ## [1] 336776 19 ncol(flights) ## [1] 19 One of these variables is year. If you remember the original description of the flights data frame (or by running ?flights), you’ll remember that this data correspond to flights in 2013 departing New York City. The year variable isn’t really a variable here in that it doesn’t vary… flights actually comes from a larger data set that covers many years. We may want to remove the year variable from our data set since it won’t be helpful for analysis in this case. To do so easily, we use the select function: flights_small &lt;- flights %&gt;% select( -year) names(flights_small) ## [1] &quot;month&quot; &quot;day&quot; &quot;dep_time&quot; &quot;sched_dep_time&quot; ## [5] &quot;dep_delay&quot; &quot;arr_time&quot; &quot;sched_arr_time&quot; &quot;arr_delay&quot; ## [9] &quot;carrier&quot; &quot;flight&quot; &quot;tailnum&quot; &quot;origin&quot; ## [13] &quot;dest&quot; &quot;air_time&quot; &quot;distance&quot; &quot;hour&quot; ## [17] &quot;minute&quot; &quot;time_hour&quot; The names function gives a listing of all the columns in a data frame. We see that year has been removed. This was done using a - in front of the name of the column we’d like to remove. 
We could also select specific columns (instead of deselecting columns) by listing them out: flight_dep_times &lt;- flights %&gt;% select(month, day, dep_time, sched_dep_time) flight_dep_times ## # A tibble: 336,776 × 4 ## month day dep_time sched_dep_time ## &lt;int&gt; &lt;int&gt; &lt;int&gt; &lt;int&gt; ## 1 1 1 517 515 ## 2 1 1 533 529 ## 3 1 1 542 540 ## 4 1 1 544 545 ## 5 1 1 554 600 ## 6 1 1 554 558 ## 7 1 1 555 600 ## 8 1 1 557 600 ## 9 1 1 557 600 ## 10 1 1 558 600 ## # ... with 336,766 more rows Or we could specify a range of columns: flight_arr_times &lt;- flights %&gt;% select(month:day, arr_time:sched_arr_time) flight_arr_times ## # A tibble: 336,776 × 4 ## month day arr_time sched_arr_time ## &lt;int&gt; &lt;int&gt; &lt;int&gt; &lt;int&gt; ## 1 1 1 830 819 ## 2 1 1 850 830 ## 3 1 1 923 850 ## 4 1 1 1004 1022 ## 5 1 1 812 837 ## 6 1 1 740 728 ## 7 1 1 913 854 ## 8 1 1 709 723 ## 9 1 1 838 846 ## 10 1 1 753 745 ## # ... with 336,766 more rows The select function can also be used to reorder columns in combination with the everything helper function. 
Let’s suppose we’d like the hour, minute, and time_hour variables, which appear at the end of the flights data set, to actually appear immediately after the day variable: flights_reorder &lt;- flights %&gt;% select(month:day, hour:time_hour, everything()) names(flights_reorder) ## [1] &quot;month&quot; &quot;day&quot; &quot;hour&quot; &quot;minute&quot; ## [5] &quot;time_hour&quot; &quot;year&quot; &quot;dep_time&quot; &quot;sched_dep_time&quot; ## [9] &quot;dep_delay&quot; &quot;arr_time&quot; &quot;sched_arr_time&quot; &quot;arr_delay&quot; ## [13] &quot;carrier&quot; &quot;flight&quot; &quot;tailnum&quot; &quot;origin&quot; ## [17] &quot;dest&quot; &quot;air_time&quot; &quot;distance&quot; Lastly, the helper functions starts_with, ends_with, and contains can be used to choose column names that match those conditions: flights_begin_a &lt;- flights %&gt;% select(starts_with(&quot;a&quot;)) flights_begin_a ## # A tibble: 336,776 × 3 ## arr_time arr_delay air_time ## &lt;int&gt; &lt;dbl&gt; &lt;dbl&gt; ## 1 830 11 227 ## 2 850 20 227 ## 3 923 33 160 ## 4 1004 -18 183 ## 5 812 -25 116 ## 6 740 12 150 ## 7 913 19 158 ## 8 709 -14 53 ## 9 838 -8 140 ## 10 753 8 138 ## # ... with 336,766 more rows flights_delays &lt;- flights %&gt;% select(ends_with(&quot;delay&quot;)) flights_delays ## # A tibble: 336,776 × 2 ## dep_delay arr_delay ## &lt;dbl&gt; &lt;dbl&gt; ## 1 2 11 ## 2 4 20 ## 3 2 33 ## 4 -1 -18 ## 5 -6 -25 ## 6 -4 12 ## 7 -5 19 ## 8 -3 -14 ## 9 -3 -8 ## 10 -2 8 ## # ... 
with 336,766 more rows flights_time &lt;- flights %&gt;% select(contains(&quot;time&quot;)) flights_time ## # A tibble: 336,776 × 6 ## dep_time sched_dep_time arr_time sched_arr_time air_time ## &lt;int&gt; &lt;int&gt; &lt;int&gt; &lt;int&gt; &lt;dbl&gt; ## 1 517 515 830 819 227 ## 2 533 529 850 830 227 ## 3 542 540 923 850 160 ## 4 544 545 1004 1022 183 ## 5 554 600 812 837 116 ## 6 554 558 740 728 150 ## 7 555 600 913 854 158 ## 8 557 600 709 723 53 ## 9 557 600 838 846 140 ## 10 558 600 753 745 138 ## # ... with 336,766 more rows, and 1 more variables: time_hour &lt;dttm&gt; 5.3.2 Rename variables using rename Another useful function is rename, which as you may suspect renames one column to another name. Suppose we wanted dep_time and arr_time to be departure_time and arrival_time instead in the flights_time data frame: flights_time &lt;- flights_time %&gt;% rename(departure_time = dep_time, arrival_time = arr_time) names(flights_time) ## [1] &quot;departure_time&quot; &quot;sched_dep_time&quot; &quot;arrival_time&quot; &quot;sched_arr_time&quot; ## [5] &quot;air_time&quot; &quot;time_hour&quot; It’s easy to forget if the new name comes before or after the equals sign. I usually remember this as “New Before, Old After” or NBOA. You’ll receive an error if you try to do it the other way: Error: Unknown variables: departure_time, arrival_time. Learning check (LC5.8) What are some ways to select all three of the dest, air_time, and distance variables from flights? Give the code showing how to do this in at least three different ways. (LC5.9) How could one use starts_with, ends_with, and contains to select columns from the flights data frame? Provide three different examples in total: one for starts_with, one for ends_with, and one for contains. (LC5.10) Why might we want to use the select function on a data frame? 5.3.3 Find the top number of values using top_n We can also use the top_n function which automatically tells us the most frequent num_flights. 
We specify the top 10 airports here: freq_dest %&gt;% top_n(n = 10, wt = num_flights) ## # A tibble: 10 × 2 ## dest num_flights ## &lt;chr&gt; &lt;int&gt; ## 1 ATL 17215 ## 2 BOS 15508 ## 3 CLT 14064 ## 4 DCA 9705 ## 5 FLL 12055 ## 6 LAX 16174 ## 7 MCO 14082 ## 8 MIA 11728 ## 9 ORD 17283 ## 10 SFO 13331 We’ll still need to arrange this by num_flights though: freq_dest %&gt;% top_n(n = 10, wt = num_flights) %&gt;% arrange(desc(num_flights)) ## # A tibble: 10 × 2 ## dest num_flights ## &lt;chr&gt; &lt;int&gt; ## 1 ORD 17283 ## 2 ATL 17215 ## 3 LAX 16174 ## 4 BOS 15508 ## 5 MCO 14082 ## 6 CLT 14064 ## 7 SFO 13331 ## 8 FLL 12055 ## 9 MIA 11728 ## 10 DCA 9705 Note: Remember that I didn’t pull the n and wt arguments out of thin air. They can be found by using the ? function on top_n. We can go one step further and tie together the group_by and summarize functions we used to find the most frequent flights: ten_freq_dests &lt;- flights %&gt;% group_by(dest) %&gt;% summarize(num_flights = n()) %&gt;% top_n(n = 10) %&gt;% arrange(desc(num_flights)) ## Selecting by num_flights Learning check paste0(&quot;(LC&quot;, chap, &quot;.&quot;, (lc &lt;- lc + 1), &quot;)&quot;) Create a new data frame that shows the top 5 airports with the largest arrival delays from NYC in 2013. 5.4 Joining/merging data frames Something you may have thought to yourself as you looked at the most frequent destinations of flights from NYC in 2013 is “What cities are these airports in?” “Is &quot;ORD&quot; Orlando?” “Where is &quot;FLL&quot;?” The nycflights13 data package contains multiple data frames. Instead of having to manually look up different values of airport names corresponding to airport codes like ORD, we can have R automatically do this “looking up” for us. To do so, we’ll need to tell R how to match one data frame to another data frame. 
Let’s first check out the airports data frame inside of R: View(airports) The first column faa corresponds to the airport codes that we saw in dest in our flights and subsequent ten_freq_dests data sets. Hadley and Garrett (Grolemund and Wickham 2016) created the following diagram to help us understand how the different data sets are linked: Figure 5.8: Data relationships in nycflights13 from R for Data Science We see from View(airports) that airports contains a lot of other information about 1458 airports. We are only really interested here in the faa and name columns. Let’s use the select function to only use those variables: airports_small &lt;- airports %&gt;% select(faa, name) So if we identify the names of the airports we can use the inner_join function to bring two different data frames together. Note that we will also rename the subsequent column name as airport_name: named_freq_dests &lt;- ten_freq_dests %&gt;% inner_join(airports_small, by = c(&quot;dest&quot; = &quot;faa&quot;)) %&gt;% rename(airport_name = name) named_freq_dests ## # A tibble: 10 × 3 ## dest num_flights airport_name ## &lt;chr&gt; &lt;int&gt; &lt;chr&gt; ## 1 ORD 17283 Chicago Ohare Intl ## 2 ATL 17215 Hartsfield Jackson Atlanta Intl ## 3 LAX 16174 Los Angeles Intl ## 4 BOS 15508 General Edward Lawrence Logan Intl ## 5 MCO 14082 Orlando Intl ## 6 CLT 14064 Charlotte Douglas Intl ## 7 SFO 13331 San Francisco Intl ## 8 FLL 12055 Fort Lauderdale Hollywood Intl ## 9 MIA 11728 Miami Intl ## 10 DCA 9705 Ronald Reagan Washington Natl In case you didn’t know, &quot;ORD&quot; is the airport code of Chicago O’Hare airport and &quot;FLL&quot; is the main airport in Fort Lauderdale, Florida, which we can now see in our named_freq_dests data frame. 
A visual representation of the inner_join is given below (Grolemund and Wickham 2016): Figure 5.9: Diagram of inner join from R for Data Science There are more complex joins available, but the inner_join will solve nearly all of the problems you’ll face in our experience. Learning check (LC5.11) What happens when you try to inner_join the ten_freq_dests data frame with airports instead of airports_small? How might one use this result to answer further questions about the top 10 destinations? (LC5.12) What surprises you about the top 10 destinations from NYC in 2013? As we saw with the RStudio cheatsheet on data visualization, RStudio has also created a cheatsheet for data manipulation entitled “Data Wrangling with dplyr and tidyr” available here. We will focus only on the dplyr functions in this book, but you are encouraged to also explore tidyr if you are presented with data that is not in the tidy format that we have specified as the preferred option for our purposes. 5.5 Script of R code An R script file of all R code used in this chapter is available here. 5.6 What’s to come? This concludes the Data Exploration unit of this book. You should be pretty proficient in both plotting variables (or multiple variables together) in various data sets and manipulating data as we’ve done in this chapter. You are encouraged to step back through the code in earlier chapters and make changes as you see fit based on your updated knowledge. In Chapter ??, we’ll begin to build the pieces needed to understand how this unit of Data Exploration can tie into statistical inference in the Inference part of the book. Remember that the focus throughout is on data visualization and we’ll see that next when we discuss sampling, resampling, and bootstrapping. These ideas will lead us into hypothesis testing and confidence intervals. References "],
-["6-simulating-randomness-via-mosaic.html", "6 Simulating Randomness via mosaic Needed packages 6.1 Random sampling 6.2 Visualizing sampling 6.3 Simulation 6.4 Review of mosaic simulation functions 6.5 Script of R code 6.6 What’s to come?", " 6 Simulating Randomness via mosaic In this chapter we will introduce new concepts that will serve as the basis for the remainder of the text: sampling and resampling. We will see that the tools that you learned in the Data Exploration part of this book (tidy data, data manipulation, and data visualization) will also play an important role here. As mentioned before, the concepts all build into a culmination allowing you to create better stories with data. We begin with some helpful definitions that will help us better understand why statistical inference exists and why it is needed. We will then progress with introducing the second of our main data sets (in addition to the nycflights13 data you’ve been working with) about OKCupid dating profiles to see how one can think of the distribution of a sample being an approximation of the distribution of the population. We will also focus on representative, random samples versus convenience samples in this context. We then shift to a famous example from statistics lore on a lady tasting tea. This section will focus on introducing concepts without a lot of statistical jargon. The chapter will conclude with a summary of the different functions introduced in the mosaic package in this chapter. Needed packages library(dplyr) library(ggplot2) library(okcupiddata) library(mosaic) library(knitr) 6.1 Random sampling Whenever you hear the phrases “random sampling” or just “sampling” (with regards to statistics), you should think about tasting soup. This likely sounds a little bonkers. Let’s dig into why tasting soup is such an excellent analogy to random sampling. 
6.1.1 Tasting soup Figure 6.1: A bowl of Indian chicken and vegetable soup Imagine that you have invited a group of friends over to try a new recipe for soup that you’ve never made before. As in the image above downloaded from here, you’d like to make a bowl of Indian chicken soup with lots of different kinds of vegetables included. You’ve carefully followed along with the recipe but you are concerned that you don’t have a lot of experience making foods from India. It is coming near the end of the prescribed time to cook given in the recipe. You begin to wonder: “Did I add too much curry spice?” “Are the carrots cooked enough?” “Does this actually taste good?” How can we answer these questions? Does it matter where we take a bite of soup from? Is there anything we should do to the soup before we taste? Is one taste enough? Learning check (LC6.1) Explain in your own words how tasting soup relates to the concepts of sampling covered here. (LC6.2) Describe a different scenario (not food or drink related) that is analogous to sampling concepts covered here. 6.1.2 Common terms The process of sampling brings with it many common terms that we define now. As you read over these definitions, think about how they each apply to the tasting soup example above. Definition: population The population is the (usually) large pool of observational units that we are interested in. Definition: sample A sample is a smaller collection of observational units that is selected from the population. Definition: sampling Sampling refers to the process of selecting observations from a population. There are both random and non-random ways this can be done. Definition: representative sample A sample is said to be a representative sample if the characteristics of observational units selected are a good approximation of the characteristics from the original population. Definition: bias Bias corresponds to a favoring of one group in a population over another group. 
Definition: generalizability Generalizability refers to the largest group about which it makes sense to make inferences from the sample collected. This is directly related to how the sample was selected. Definition: parameter A parameter is a calculation based on one or more variables measured in the population. Parameters are almost always denoted symbolically using Greek letters such as \\(\\mu\\), \\(\\pi\\), \\(\\sigma\\), \\(\\rho\\), and \\(\\beta\\). Definition: statistic A statistic is a calculation based on one or more variables measured in the sample. Statistics are usually denoted by lower case Arabic letters with other symbols added sometimes. These include \\(\\bar{x}\\), \\(\\hat{p}\\), \\(s\\), \\(p\\), and \\(b\\). Let’s explore these terms for our tasting soup example: Population - the entire container of soup that we have cooked. Sample - any smaller portion of soup collected that isn’t the whole container of soup. We could say that each spoonful of soup represents one sample. Sampling - the process of selecting spoonfuls from the container of soup Representative sample - A sample we select will only be representative if it tastes like what the soup tastes like in general. If we only select a carrot in our spoonful, we might not have a representative sample. Bias - As we noted with the carrot selection example above, we may select a sample that is not representative. If you watch chefs cook or if you frequently cook, you’ll be sure to stir the soup before you taste it. Generalizability - If we stir our soup before we taste a spoonful (and if we make sure we don’t just pick our favorite item in the soup), results from our sample can be generalized (by and large) to the larger pot of soup. When we say “Yum! This is good!” after a couple spoonfuls, we can be pretty confident that each bowl of soup for our friends will taste good too. Parameter - An example here could be the proportion of curry entered into the entire pot of soup. 
A measurement of how salty the pot of soup is on average is also a parameter. How crunchy, on average, the carrots are in the pot of soup is one more example. Statistic - To convert a parameter to a statistic, you need only to think about the same measurement on a spoonful: The proportion of curry to non-curry in a spoonful of soup How salty the spoonful of soup is that we collected as our sample How crunchy the carrots are in our spoonful of soup Learning check (LC6.3) Why isn’t our population all bowls of soup? All bowls of Indian chicken soup? (LC6.4) Describe a way in which we could select a sample of flights from nycflights13 that is not representative. (LC6.5) If we treat all of the flights in nycflights13 as the population, give examples of three parameters we could calculate. (LC6.6) If we treat all of the flights in nycflights13 as the population, give examples of three statistics we could calculate. (LC6.7) What biases might we see if we only select flights to Boston when we are interested in looking at mean flight delays from NYC? 6.2 Visualizing sampling Let’s explore how sampling and these other terms relate to working with data and data visualization. Here we introduce the okcupiddata R package. Note that permission to use this data to create the R package was explicitly granted by OkCupid. More information about this package is available here. The profiles data frame in this R data package contains data about 59,946 OkCupid users who were living within 25 miles of San Francisco, had active profiles on June 26, 2012, were online in the previous year, and had at least one picture in their profile. We will be focusing on the height variable, which corresponds to self-reported heights of the individual on their profile. Note that this is measured in inches. 
library(okcupiddata) data(profiles) Let’s take a look at the distribution of height using a histogram and ggplot2: library(ggplot2) ggplot(data = profiles, mapping = aes(x = height)) + geom_histogram(bins = 20, color = &quot;white&quot;) We see here that this being self-reported data has led to the data being a little messy. Learning check (LC6.8) Why does the histogram go all the way back to 0 for height and all the way up to 100? To clean up the data a bit, let’s focus on just looking at heights between 55 inches and 85 inches. Remember that the filter function in dplyr allows us to focus on a subset of rows. The specific subset of rows we are interested in corresponds to the argument to the filter function. We will create a new data frame called profiles_subset that contains all rows with heights between 55 and 85 inclusive. library(dplyr) profiles_subset &lt;- profiles %&gt;% filter(between(height, 55, 85)) Next, let’s produce the same histogram as above but using the profiles_subset data frame instead. library(ggplot2) ggplot(data = profiles_subset, mapping = aes(x = height)) + geom_histogram(bins = 20, color = &quot;white&quot;) We can think of this data as representing the population of interest. Let’s now take a random sample of size 100 from this population and look to see if this sample represents the overall shape of the population. In other words, we are going to use data visualization as our guide to understand the representativeness of the sample selected. library(mosaic) set.seed(2017) profiles_sample1 &lt;- profiles_subset %&gt;% resample(size = 100, replace = FALSE) The set.seed function is used to ensure that all users get the same random sample when they run the code above. It is a way of interfacing with the pseudo-random number generation scheme that R uses to generate “random” numbers. If that command was not run, you’d obtain a different random sample if you ran the code above for the first time. 
We have introduced the resample function from the mosaic package here. This function can be used for both sampling with and without replacement. Here we have chosen to sample without replacement. In other words, after the first row is chosen from the profiles_subset data frame at random it is kept out of the further 99 samples. Let’s now visualize the 100 values of the height variable in the profiles_sample1 data frame. To keep this visualization on the same horizontal scale as our original population presented in profiles_subset we can use the coord_cartesian function along with the c function to specify the limits on the horizontal axis. ggplot(data = profiles_sample1, mapping = aes(x = height)) + geom_histogram(bins = 20, color = &quot;white&quot;, fill = &quot;red&quot;) + coord_cartesian(xlim = c(55, 85)) Learning check (LC6.9) Does this random sample of height represent the population height variable well? Explain why or why not in a couple of sentences. We now repeat this process of sampling to look to see how another random sample of height compares to the original population distribution. profiles_sample2 &lt;- profiles_subset %&gt;% resample(size = 100, replace = FALSE) ggplot(data = profiles_sample2, mapping = aes(x = height)) + geom_histogram(bins = 20, color = &quot;black&quot;, fill = &quot;yellow&quot;) + coord_cartesian(xlim = c(55, 85)) Remember that a sample can never truly quantify all of the properties of a population since it contains less data and, thus, less information. We can use the overall shape as a good guess as to the representativeness of the sample in regards to the population. We see that the above two random samples of size 100 have roughly the same shape as the original population height data. Let’s next explore what is known as a convenience sample and how its distribution compares to the population distribution. A convenience sample is a sample that is chosen conveniently by the person selecting the sample. 
While certainly less work, convenience samples are generally not representative of the population since they will exclude some (usually large) portion of the population. Let’s look at values of height in our profiles_subset population that are at least 6 feet tall (72 inches) and have that be the sample we choose. profiles_sample3 &lt;- profiles_subset %&gt;% filter(height &gt;= 72) ggplot(data = profiles_sample3, mapping = aes(x = height)) + geom_histogram(bins = 20, color = &quot;white&quot;, fill = &quot;blue&quot;) + coord_cartesian(xlim = c(55, 85)) This is a clear example of a sample that is not representative of the population. The population height is roughly symmetric, whereas this distribution is right-skewed. Further, since it only selects large heights it has completely excluded the small and middle heights. We have seen here that data visualization provides an excellent tool in judging the representativeness of a sample. 6.2.1 Sampling distribution The representativeness of a sample plays an even larger role than just looking at the shapes of distributions. Let’s suppose we were interested in estimating the mean height of all profiles in the profiles_subset data frame. To do so, we could look at the mean of the height variable in the profiles_sample1 data frame: profiles_sample1 %&gt;% summarize(mean(height)) ## mean(height) ## 1 68.45 But, we could also use profiles_sample2: profiles_sample2 %&gt;% summarize(mean(height)) ## mean(height) ## 1 68.2 Or maybe even profiles_sample3: profiles_sample3 %&gt;% summarize(mean(height)) ## mean(height) ## 1 73.37917 We see a clear difference here in looking at the mean of height in profiles_sample3 versus profiles_sample1 and profiles_sample2. This comes from the bias that is used in choosing only the top heights for profiles_sample3. If we had chosen to use this sample as our only sample, we would be quite a ways off from what the actual mean height in our population of profiles_subset is. 
We also see that even random samples produce means that aren’t exactly the same. This sampling variability can be shown via what is called a sampling distribution. This is defined as the behavior of a statistic under repeated sampling. To build this sampling distribution for this example, we’ve created an interactive app using the shiny R package below that is available at http://ismay.shinyapps.io/okcupidheights/. You can specify the sample size you’d like to work with (100 is chosen by default) and then generate a random sample. You then can see the mean of this generated sample plotted in the bottom visualization. Repeating this process many times, you can start to see the shape of the sampling distribution take form. Figure 6.2: Sampling distribution app 6.2.2 Repeated sampling via do We have looked at two random samples above, but using mosaic we can repeat this process over and over again with the do function. Below, we repeat this sampling process 10,000 times. We can then plot the different values of the sample means to get a sense for what a reasonable range of values for the population parameter mean height is in the profiles_subset data frame. sample_means &lt;- do(10000) * (profiles_subset %&gt;% resample(size = 100, replace = FALSE) %&gt;% summarize(mean_height = mean(height))) ggplot(data = sample_means, mapping = aes(x = mean_height)) + geom_histogram(color = &quot;white&quot;, bins = 20) Note how the range of sample mean height values is much more narrow than the original range of height in the profiles_subset data frame. We also see a characteristic shape to this distribution of sample_mean: the normal curve. This idea is commonly associated with statistics and you hopefully have a good sense of how this distribution comes about. As before, if you aren’t quite sure of this yet, go back and explore the shiny app above a bit more. 
We see that many values for the sample mean appear near the center of the distribution and a few values out in the tails providing the bell-shaped distribution linked with the normal distribution. You’ll see more examples of this in the chapters to come and in the appendices. Learning check (LC6.10) Why do the sample mean values have a much smaller spread than the original population data? You may want to play with the shiny app above a bit to understand why this is the case. (LC6.11) Why is random sampling so important here to create a distribution of sample means that provide a range of plausible values for the population mean height? 6.3 Simulation We next will introduce the ideas behind hypothesis testing that we will delve into more formally in the chapters to come. What follows is taken from a book entitled The Lady Tasting Tea (Salsburg 2001): It was a summer afternoon in Cambridge, England, in the late 1920s. A group of university dons, their wives, and some guests were sitting around an outdoor table for afternoon tea. One of the women was insisting that tea tasted different depending upon whether the tea was poured into the milk or whether the milk was poured into the tea. The scientific minds among the men scoffed at this as sheer nonsense. What could be the difference? They could not conceive of any difference in the chemistry of the mixtures that could exist. A thin, short man, with thick glasses and a Vandyke beard beginning to turn gray, pounced on the problem. “Let us test the proposition,” he said excitedly. He began to outline an experiment in which the lady who insisted there was a difference would be presented with a sequence of cups of tea, in some of which the milk had been poured into the tea and in others of which the tea had been poured into the milk… So it was that sunny summer afternoon in Cambridge. The lady might or might not have been correct about the tea infusion. 
The fun would be in finding a way to determine if she was right, and, under the direction of the man with the Vandyke beard, they began to discuss how they might make that determination. Enthusiastically, many of them joined with him in setting up the experiment. Within a few minutes, they were pouring different patterns of infusion in a place where the lady could not see which cup was which. Then, with an air of finality, the man with the Vandyke beard presented her with her first cup. She sipped for a minute and declared that it was one where the milk had been poured into the tea. He noted her response without comment and presented her with the second cup… The man with the Vandyke beard was Ronald Aylmer Fisher, who was in his late thirties at the time. He would later be knighted Sir Ronald Fisher. In 1935, he wrote a book entitled The Design of Experiments, and he described the experiment of the lady tasting tea in the second chapter of that book. In his book, Fisher discusses the lady and her belief as a hypothetical problem. He considers the various ways in which an experiment might be designed to determine if she could tell the difference. The problem in designing the experiment is that, if she is given a single cup of tea, she has a 50 percent chance of guessing correctly which infusion was used, even if she cannot tell the difference. If she is given two cups of tea, she still might guess correctly. In fact, if she knew that the two cups of tea were each made with a different infusion, one guess could be completely right (or completely wrong). Similarly, even if she could tell the difference, there is some chance that she might have made a mistake, that one of the cups was not mixed as well or that the infusion was made when the tea was not hot enough. She might be presented with a series of ten cups and correctly identify only nine of them, even if she could tell the difference. 
In his book, Fisher discusses the various possible outcomes of such an experiment. He describes how to decide how many cups should be presented and in what order and how much to tell the lady about the order of presentations. He works out the probabilities of different outcomes, depending upon whether the lady is or is not correct. Nowhere in this discussion does he indicate that such an experiment was ever run. Nor does he describe the outcome of an actual experiment. It’s amazing that there is no actual evidence that such an event actually took place. This problem is a great introduction into inference though and we can proceed by testing to see how likely it is for a person to guess correctly, say, 9 out of 10 times assuming that that person is just guessing. In other words, is the person just lucky or do we have reason to suspect that they can actually detect whether milk was put in first or not? We need to think about this problem from the standpoint of hypothesis testing. First, we’ll need to identify some important parts of a hypothesis test before we proceed with the analysis. Learning check (LC6.12) What does “by chance” mean in this context? (LC6.13) What is our observed statistic? (LC6.14) What is this statistic trying to estimate? (LC6.15) How could we test to see whether the person is just guessing or if they have some special talent of identifying milk before tea or vice-versa? Let’s begin with an experiment. I will flip a coin 10 times. Your job is to try to predict the sequence of my 10 flips. Write down 10 H’s and T’s corresponding to your predictions. We could compare your guesses with my actual flips and then we will note how many correct guesses you have. You may be asking yourself how this models a way to test whether the person was just guessing or not. All we are trying to do is see how likely it is to have 9 matches out of 10 if the person was truly guessing. 
When we say “truly guessing” we are assuming that we have a 50/50 chance of guessing correctly. This can be modeled using a coin flip and then seeing whether we guessed correctly for each of the coin flips. If we guessed correctly, we can think of that as a “success.” We often don’t have time to do the physical flipping over and over again and we’d like to be able to do more than just 20 different simulations or so. Luckily, we can use R to simulate this process many times. The mosaic package includes a function called rflip(), which can be used to flip one coin. Well, not exactly. It uses pseudo-random number generation to “flip” a virtual coin. In order for us all to get the same results here, we can set the seed of the pseudo-random number generator. Let’s see an example of this: (Remember to load the mosaic package!) library(mosaic) set.seed(2017) do(1) * rflip(1) ## n heads tails prop ## 1 1 1 0 1 This shows us the proportion of “successes” in one flip of a coin. The do function in the mosaic package will be useful and you can begin to understand what it does with another example. do(13) * rflip(10) ## n heads tails prop ## 1 10 4 6 0.4 ## 2 10 5 5 0.5 ## 3 10 5 5 0.5 ## 4 10 7 3 0.7 ## 5 10 5 5 0.5 ## 6 10 7 3 0.7 ## 7 10 5 5 0.5 ## 8 10 4 6 0.4 ## 9 10 7 3 0.7 ## 10 10 2 8 0.2 ## 11 10 4 6 0.4 ## 12 10 5 5 0.5 ## 13 10 4 6 0.4 We’ve now done a simulation of what actually happened when you flipped a coin ten times. We have 13 different simulations of flipping a coin 10 times. Note here that heads now corresponds to the number of correct guesses and tails corresponds to the number of incorrect guesses. (This can be tricky to understand at first since we’ve done a switch on what the meaning of “heads” and “tails” are.) If you look at the output above for our simulation of 13 student guesses, we can begin to get a sense for what an “expected” sample proportion of successes may be. Around five out of 10 seems to be the most likely value. 
What does this say about our assumed \\(\\hat{p}\\) of 9/10? To better answer this question, we can simulate 10,000 student guesses and then look at the distribution of the simulated sample proportion of successes, also known as the null distribution. library(dplyr) simGuesses &lt;- do(10000) * rflip(10) simGuesses %&gt;% group_by(heads) %&gt;% summarize(count = n()) ## # A tibble: 11 × 2 ## heads count ## &lt;dbl&gt; &lt;int&gt; ## 1 0 9 ## 2 1 98 ## 3 2 431 ## 4 3 1197 ## 5 4 2016 ## 6 5 2459 ## 7 6 2066 ## 8 7 1211 ## 9 8 408 ## 10 9 91 ## 11 10 14 We can see here that we have created a count of how many of each of the 10,000 sets of 10 flips resulted in 0, 1, 2, …, up to 10 heads. Note the use of the group_by and summarize functions from Chapter ?? here. In addition, we can plot the distribution of these simulated heads using the ideas from Chapter ??. heads is a quantitative variable. Think about which type of plot is most appropriate here before reading further. We already have an idea as to an appropriate plot by the data summarization that we did in the chunk above. We’d like to see how many heads occurred in the 10,000 sets of 10 flips. In other words, we’d like to see how frequently 9 or more heads occurred in the 10 flips: library(ggplot2) simGuesses %&gt;% ggplot(aes(x = heads)) + geom_histogram(binwidth = 1, color = &quot;white&quot;) Figure 6.3: Histogram of number of heads in simulation - needs tweaking The horizontal axis labels are a little confusing here. What does 2.5 or 7.5 heads mean? In simGuesses, heads is a numerical variable. Thus, ggplot is expecting the values to be on a continuous scale. 
We can switch the scale to be discrete by invoking the factor function and using geom_bar instead of geom_histogram: library(ggplot2) simGuesses %&gt;% ggplot(aes(x = factor(heads))) + geom_bar() Figure 6.4: Barplot of number of heads in simulation You’ll frequently need to make this conversion to factor when making a barplot with quantitative variables. Remember from “Getting Used to R, RStudio, and R Markdown” (Ismay 2016), that a factor variable is useful when there is a natural ordering to the variable and it only takes on discrete values and not fractional values like 2.5. Our heads variable has a natural ordering: 0, 1, 2, \\(\\ldots\\), 10. Again, note that the shape of these number of heads follows what appears to be a normal distribution. We’ll see that if appropriate conditions/assumptions are met with the data that we can expect to see a normal distribution result. When these conditions aren’t met, the simulation methodology we’ve presented here still works well whereas the traditional normal-based methods start to fall apart. We will delve further into hypothesis testing in the next few chapters. This null distribution in combination with the sampling distribution concept covered earlier will be of utmost importance going forward. 6.4 Review of mosaic simulation functions In this chapter, we’ve discussed three functions in the mosaic package useful in understanding the stepping stones to statistical inference: do, rflip, and resample. We will also work with the shuffle function in later chapters and we summarize it here for your reference later. do: Its main use is in replicating a process many times. It has one argument n which specifies how many times to replicate the process. It then uses *, which can be read as “times”, and whatever follows immediately after it as the process. rflip: This is used to simulate the flipping of a coin many times. By default, it flips a fair coin one time giving an equal chance to heads and tails. 
It is frequently used with do() * to simulate many coin flips in multiple sets. resample: This is used to sample from a larger data set with or without replacement. When we are thinking about the concept of random sampling, we sample without replacement. We can also sample with replacement corresponding to the values being replaced into the pool to draw from with the possibility that they are drawn again in the resample. This will be of great importance when we discuss bootstrapping with confidence intervals. shuffle: Its main purpose is to permute the values of one variable across the values of another variable. This acts in much the same way as shuffling a deck of cards and then presenting the shuffled deck to two (or more) players. Learning check (LC6.16) Recreate rflip using only the resample function and specifying the appropriate arguments. (LC6.17) Recreate shuffle using only the resample function and specifying the appropriate arguments. 6.5 Script of R code An R script file of all R code used in this chapter is available here. 6.6 What’s to come? This chapter has served as an introduction into inferential techniques that will be discussed in greater detail in Chapter 7 for hypothesis testing and in Chapter 8 for confidence intervals. In these chapters, we will see how we can use a related concept of resampling when working with the distributions of two groups. All of these concepts will be further reinforced in Chapter ?? as well. References "],
-["7-hypo.html", "7 Hypothesis Testing Needed packages 7.1 When Inference Is Not Needed 7.2 Basics of Hypothesis Testing 7.3 Criminal trial analogy 7.4 Types of Errors in Hypothesis Testing 7.5 Statistical Significance 7.6 EXAMPLE: Revisiting the Lady Tasting Tea 7.7 EXAMPLE: Comparing two means 7.8 Building theory-based methods using computation 7.9 What’s to come?", " 7 Hypothesis Testing We saw some of the main concepts of hypothesis testing introduced in Chapter ??. We will expand further on these ideas here and also provide a framework for understanding hypothesis tests in general. Instead of presenting you with lots of different formulas and scenarios, we hope to build a way to think about all hypothesis tests. You can then adapt to different scenarios as needed down the road when you encounter different statistical situations. The same can be said for confidence intervals. There is one general framework that applies to all confidence intervals and we will elaborate on this further in Chapter 8. The specifics may change slightly for each variation, but the important idea is to understand the general framework so that you can apply it to more specific problems. We believe that this approach is much better in the long-term than teaching you specific tests and confidence intervals rigorously. You can find full worked out examples for five common hypothesis tests and their corresponding confidence intervals in Appendix B. We recommend that you carefully review these examples as they also cover how the general frameworks apply to traditional normal-based methodologies like the \\(t\\)-test and normal-theory confidence intervals. You’ll see there that these methods are just approximations for the general computational frameworks, but require conditions to be met for their results to be valid. 
The general frameworks using randomization, simulation, and bootstrapping do not hold the same sorts of restrictions and further advance computational thinking, which is one big reason for their emphasis throughout this textbook. Needed packages library(dplyr) library(ggplot2) library(okcupiddata) library(mosaic) library(knitr) library(nycflights13) 7.1 When Inference Is Not Needed Before we delve into the two techniques of inference (hypothesis testing and confidence intervals), it’s good to remember that there are cases where you need not perform a rigorous statistical inference. An important and time-saving skill is to ALWAYS do exploratory data analysis using dplyr and ggplot2 before thinking about running a hypothesis test. Let’s look at such an example selecting a sample of flights traveling to Boston and to San Francisco from New York City in the flights data frame in the nycflights13 package. (We will remove flights with missing data first using na.omit and then sample 100 flights going to each of the two airports.) library(nycflights13) data(flights) bos_sfo &lt;- flights %&gt;% na.omit() %&gt;% filter(dest %in% c(&quot;BOS&quot;, &quot;SFO&quot;)) %&gt;% group_by(dest) %&gt;% sample_n(100) Suppose we were interested in seeing if the air_time to SFO in San Francisco was statistically greater than the air_time to BOS in Boston. As suggested, let’s begin with some exploratory data analysis to get a sense for how the two variables of air_time and dest relate for these two destination airports: library(dplyr) bos_sfo_summary &lt;- bos_sfo %&gt;% group_by(dest) %&gt;% summarize(mean_time = mean(air_time), sd_time = sd(air_time)) kable(bos_sfo_summary) dest mean_time sd_time BOS 38.35 5.726732 SFO 345.61 15.354988 Looking at these results, we can clearly see that SFO air_time is much larger than BOS air_time. The standard deviation is also extremely informative here. 
Learning check (LC7.1) Could we make the same type of immediate conclusion that SFO had a statistically greater air_time if, say, its corresponding standard deviation was 200 minutes? What about 100 minutes? Explain. To further understand just how different the air_time variable is for BOS and SFO, let’s look at a boxplot: library(ggplot2) ggplot(data = bos_sfo, mapping = aes(x = dest, y = air_time)) + geom_boxplot() Since there is no overlap at all, we can conclude that the air_time for San Francisco flights is statistically greater (at any level of significance) than the air_time for Boston flights. This is a clear example of not needing to do anything more than some simple descriptive statistics to get an appropriate inferential conclusion. This is one reason why you should ALWAYS investigate the sample data first using dplyr and ggplot2 via exploratory data analysis. As you get more and more practice with hypothesis testing, you’ll be better able to determine in many cases whether or not the results will be statistically significant. There are circumstances where it is difficult to tell, but you should always try to make a guess FIRST about significance after you have completed your data exploration and before you actually begin the inferential techniques. 7.2 Basics of Hypothesis Testing In a hypothesis test, we will use data from a sample to help us decide between two competing hypotheses about a population. We make these hypotheses more concrete by specifying them in terms of at least one population parameter of interest. We refer to the competing claims about the population as the null hypothesis, denoted by \\(H_0\\), and the alternative (or research) hypothesis, denoted by \\(H_a\\). The roles of these two hypotheses are NOT interchangeable. The claim for which we seek significant evidence is assigned to the alternative hypothesis. The alternative is usually what the experimenter or researcher wants to establish or find evidence for. 
Usually, the null hypothesis is a claim that there really is “no effect” or “no difference.” In many cases, the null hypothesis represents the status quo or that nothing interesting is happening. We assess the strength of evidence by assuming the null hypothesis is true and determining how unlikely it would be to see sample results/statistics as extreme (or more extreme) as those in the original sample. Hypothesis testing brings about many weird and incorrect notions in the scientific community and society at large. One reason for this is that statistics has traditionally been thought of as this magic box of algorithms and procedures to get to results and this has been readily apparent if you do a Google search of “flowchart statistics hypothesis tests”. There are so many different complex ways to determine which test is appropriate. You’ll see that we don’t need to rely on these complicated series of assumptions and procedures to conduct a hypothesis test any longer. These methods were introduced in a time when computers weren’t powerful. Your cellphone (in 2016) has more power than the computers that sent NASA astronauts to the moon after all. We’ll see that ALL hypothesis tests can be broken down into the following framework given by Allen Downey here: Figure 7.1: Hypothesis Testing Framework Before we hop into this framework, we will provide another way to think about hypothesis testing that may be useful. 7.3 Criminal trial analogy We can think of hypothesis testing in the same context as a criminal trial in the United States. A criminal trial in the United States is a familiar situation in which a choice between two contradictory claims must be made. 1. The accused must be judged either guilty or not guilty. Under the U.S. system of justice, the individual on trial is initially presumed not guilty. Only STRONG EVIDENCE to the contrary causes the not guilty claim to be rejected in favor of a guilty verdict. 
The phrase “beyond a reasonable doubt” is often used to set the cutoff value for when enough evidence has been given to convict. Theoretically, we should never say “The person is innocent.” but instead “There is not sufficient evidence to show that the person is guilty.” Now let’s compare that to how we look at a hypothesis test. The decision about the population parameter(s) must be judged to follow one of two hypotheses. We initially assume that \\(H_0\\) is true. The null hypothesis \\(H_0\\) will be rejected (in favor of \\(H_a\\)) only if the sample evidence strongly suggests that \\(H_0\\) is false. If the sample does not provide such evidence, \\(H_0\\) will not be rejected. The analogy to “beyond a reasonable doubt” in hypothesis testing is what is known as the significance level. This will be set before conducting the hypothesis test and is denoted as \\(\\alpha\\). Common values for \\(\\alpha\\) are 0.1, 0.01, and 0.05. 7.3.1 Two possible conclusions Therefore, we have two possible conclusions with hypothesis testing: Reject \\(H_0\\) Fail to reject \\(H_0\\) Gut instinct says that “Fail to reject \\(H_0\\)” should say “Accept \\(H_0\\)” but this technically is not correct. Accepting \\(H_0\\) is the same as saying that a person is innocent. We cannot show that a person is innocent; we can only say that there was not enough substantial evidence to find the person guilty. When you run a hypothesis test, you are the jury of the trial. You decide whether there is enough evidence to convince yourself that \\(H_a\\) is true (“the person is guilty”) or that there was not enough evidence to convince yourself \\(H_a\\) is true (“the person is not guilty”). You must convince yourself (using statistical arguments) which hypothesis is the correct one given the sample information. Important note: Therefore, DO NOT WRITE “Accept \\(H_0\\)” any time you conduct a hypothesis test. 
Instead write “Fail to reject \\(H_0\\).” 7.4 Types of Errors in Hypothesis Testing Unfortunately, just as a jury or a judge can make an incorrect decision in regards to a criminal trial by reaching the wrong verdict, there is some chance we will reach the wrong conclusion via a hypothesis test about a population parameter. As with criminal trials, this comes from the fact that we don’t have complete information, but rather a sample from which to try to infer about a population. The possible erroneous conclusions in a criminal trial are an innocent person is convicted (found guilty) or a guilty person is set free (found not guilty). The possible errors in a hypothesis test are - rejecting \\(H_0\\) when in fact \\(H_0\\) is true (Type I Error) - failing to reject \\(H_0\\) when in fact \\(H_0\\) is false (Type II Error) The risk of error is the price researchers pay for basing an inference about a population on a sample. With any reasonable sample-based procedure, there is some chance that a Type I error will be made and some chance that a Type II error will occur. To help understand the concepts of Type I error and Type II error, observe the following table: Figure 7.2: Type I and Type II errors If we are using sample data to make inferences about a parameter, we run the risk of making a mistake. Obviously, we want to minimize our chance of error; we want a small probability of drawing an incorrect conclusion. The probability of a Type I Error occurring is denoted by \\(\\alpha\\) and is called the significance level of a hypothesis test The probability of a Type II Error is denoted by \\(\\beta\\). Formally, we can define \\(\\alpha\\) and \\(\\beta\\) in regards to the table above, but for hypothesis tests instead of a criminal trial. \\(\\alpha\\) corresponds to the probability of rejecting \\(H_0\\) when, in fact, \\(H_0\\) is true. \\(\\beta\\) corresponds to the probability of failing to reject \\(H_0\\) when, in fact, \\(H_0\\) is false. 
Ideally, we want \\(\\alpha = 0\\) and \\(\\beta = 0\\), meaning that the chance of making an error does not exist. When we have to use incomplete information (sample data), it is not possible to have both \\(\\alpha = 0\\) and \\(\\beta = 0\\). We will always have the possibility of at least one error existing when we use sample data. Usually, what is done is that \\(\\alpha\\) is set before the hypothesis test is conducted and then the evidence is judged against that significance level. Common values for \\(\\alpha\\) are 0.05, 0.01, and 0.10. If \\(\\alpha = 0.05\\), we are using a testing procedure that, used over and over with different samples, rejects a TRUE null hypothesis five percent of the time. So if we can set \\(\\alpha\\) to be whatever we want, why choose 0.05 instead of 0.01 or even better 0.0000000000000001? Well, a small \\(\\alpha\\) means the test procedure requires the evidence against \\(H_0\\) to be very strong before we can reject \\(H_0\\). This means we will almost never reject \\(H_0\\) if \\(\\alpha\\) is very small. If we almost never reject \\(H_0\\), the probability of a Type II Error – failing to reject \\(H_0\\) when we should – will increase! Thus, as \\(\\alpha\\) decreases, \\(\\beta\\) increases and as \\(\\alpha\\) increases, \\(\\beta\\) decreases. We, therefore, need to strike a balance in \\(\\alpha\\) and \\(\\beta\\) and the common values of 0.05, 0.01, and 0.10 usually lead to a nice balance. Learning check (LC7.2) Reproduce the table above, but for a hypothesis test, instead of the one provided for a criminal trial. 7.4.1 Logic of Hypothesis Testing Take a random sample (or samples) from a population (or two populations) If the sample data are consistent with the null hypothesis, do not reject the null hypothesis. 
If the sample data are inconsistent with the null hypothesis (in the direction of the alternative hypothesis), reject the null hypothesis and conclude that there is evidence the alternative hypothesis is true (based on the particular sample collected). 7.5 Statistical Significance The idea that sample results are more extreme than we would reasonably expect to see by random chance if the null hypothesis were true is the fundamental idea behind statistical hypothesis tests. If data as extreme would be very unlikely if the null hypothesis were true, we say the data are statistically significant. Statistically significant data provide convincing evidence against the null hypothesis in favor of the alternative, and allow us to generalize our sample results to the claim about the population. Definition: Statistical Significance When results as extreme as the observed sample statistic are unlikely to occur by random chance alone (assuming the null hypothesis is true), we say the sample results/statistics are statistically significant. If our sample is statistically significant, we have convincing evidence against \\(H_0\\) and in favor of \\(H_a\\). Learning check (LC7.3) What is wrong with saying “The defendant is innocent.” based on the US system of criminal trials? (LC7.4) What is the purpose of hypothesis testing? (LC7.5) What are some flaws with hypothesis testing? How could we alleviate them? 7.6 EXAMPLE: Revisiting the Lady Tasting Tea Recall the “There is Only One Test” diagram from earlier: Figure 7.3: Hypothesis Testing Framework We will now walk through how each of the steps of the diagram applies to determining whether the lady tasting tea was actually better than chance at determining whether or not milk was added first. We will see that the process of creating a null distribution is a statistical way of quantifying surprise. 
7.6.1 Data Let’s assume, as we did in Chapter ??, that the lady is correct in determining whether milk was added first or not in 9 out of 10 trials. Our data, therefore, may look something like Correct Correct Correct Incorrect Correct Correct Correct Correct Correct Correct 7.6.2 Test Statistic \\(\\delta\\) We are interested in the number of Correct out of our 10 trials. We can denote this number of successes using the symbol \\(t\\), where \\(t\\) corresponds to total. This is our test statistic \\(\\delta\\) in this case. 7.6.3 Observed effect \\(\\delta^*\\) The actual observed value of the test statistic from our observed sample is \\(\\hat{t}_{obs} = 9\\). Thus, \\(\\delta^* = 9\\). 7.6.4 Model of \\(H_0\\) Our null hypothesis is that the lady is only as good as chance at guessing correctly. Hypotheses always correspond to parameters and are denoted with Greek letters. Thus, symbolically, we have \\(H_0: \\tau = 5\\). Since we are assuming chance and we have 10 flips with 0.5 probability of success on each flip, we have \\(\\tau = 10 \\times 0.5 = 5\\). 7.6.5 Simulated Data We now want to use this null hypothesis to simulate the test statistic assuming that the null hypothesis is true. Therefore, we want to figure out a way to simulate in 10 trials, getting either the choice Correct or Incorrect, assuming that the probability of success (getting it Correct) in any given trial is 0.5. Tactile simulation When you are presented with a hypothesis testing problem, frequently the most challenging portion is setting up how to simulate the data assuming the null hypothesis is true. To facilitate this, setting up a tactile, hands-on experiment can help. In this case, flipping a fair coin is a great way to simulate this process. To simulate 10 trials, we could flip the fair coin and record Heads as Correct and Tails as Incorrect. Some simulated data using this coin flipping procedure may look like the following. 
Note that this data frame is not tidy, but is a convenient way to look at the results of the simulation in this wide format. The numbers on the far left correspond to the number of the trial. sample1 sample2 sample3 1 Correct Correct Correct 2 Correct Incorrect Incorrect 3 Incorrect Incorrect Correct 4 Incorrect Incorrect Correct 5 Correct Incorrect Incorrect 6 Correct Incorrect Correct 7 Incorrect Incorrect Correct 8 Incorrect Correct Incorrect 9 Incorrect Correct Incorrect 10 Incorrect Correct Incorrect We then use the formula for the Test Statistic to determine the simulated test statistic for each of these simulated samples. So in this case we have \\(t_1 = 4\\), \\(t_2 = 4\\), \\(t_3 = 5\\) 7.6.6 Distribution of \\(\\delta\\) under \\(H_0\\) We could continue this process say 10,000 times by flipping a coin in sets of 10 for 10,000 repetitions and counting and taking note of how many heads out of 10 we have for each set. It’s at this point that you realize that a computer can do this procedure much faster and more efficiently than the tactile experiment with a coin. Recall that we’ve already created the distribution of 10,000 such coin flips and we’ve stored these values in the heads variable in the simGuesses data frame: library(ggplot2) ggplot(data = simGuesses, aes(x = factor(heads))) + geom_bar() 7.6.7 The p-value Definition: \\(p\\)-value: The p-value is the probability of observing a sample statistic as extreme or more extreme than what was observed, assuming that the null hypothesis of a by chance operation is true. This definition may be a little intimidating the first time you read it, but it’s important to come back to this “The Lady Tasting Tea” problem whenever you encounter \\(p\\)-values as you begin to learn about the concept. Here the \\(p\\)-value corresponds to how many times in our null distribution of heads 9 or more heads occurred. We can use another neat feature of R to calculate the \\(p\\)-value for this problem. 
Note that “more extreme” in this case corresponds to looking at values of 9 or greater since our alternative hypothesis invokes a right-tail test corresponding to a “greater than” hypothesis of \\(H_a: \\pi &gt; 0.5\\). In other words, we are looking to see how likely it is for the lady to pick 9 or more correct instead of 9 or less correct. We’d like to go in the right direction. pvalue_tea &lt;- simGuesses %&gt;% filter(heads &gt;= 9) %&gt;% nrow() / nrow(simGuesses) Let’s walk through each step of this calculation: First, pvalue_tea will be the name of our calculated \\(p\\)-value and the assignment operator &lt;- directs us to this naming. We are working with the simGuesses data frame here so that comes immediately before the pipe operator. We would like to only focus on the rows in our simGuesses data frame that have heads values of 9 or 10. This represents simulated statistics “as extreme or more extreme” than what we observed (9 correct guesses out of 10). Let’s get a glimpse of what we have up to this point: kable(simGuesses %&gt;% filter(heads &gt;= 9)) n heads tails prop 10 9 1 0.9 10 10 0 1.0 10 9 1 0.9 10 9 1 0.9 10 10 0 1.0 10 9 1 0.9 10 9 1 0.9 10 9 1 0.9 10 9 1 0.9 10 9 1 0.9 10 9 1 0.9 10 9 1 0.9 10 9 1 0.9 10 9 1 0.9 10 9 1 0.9 10 9 1 0.9 10 9 1 0.9 10 9 1 0.9 10 9 1 0.9 10 9 1 0.9 10 9 1 0.9 10 9 1 0.9 10 10 0 1.0 10 10 0 1.0 10 10 0 1.0 10 9 1 0.9 10 9 1 0.9 10 9 1 0.9 10 9 1 0.9 10 9 1 0.9 10 9 1 0.9 10 9 1 0.9 10 10 0 1.0 10 9 1 0.9 10 9 1 0.9 10 10 0 1.0 10 9 1 0.9 10 9 1 0.9 10 9 1 0.9 10 9 1 0.9 10 9 1 0.9 10 9 1 0.9 10 9 1 0.9 10 9 1 0.9 10 9 1 0.9 10 9 1 0.9 10 9 1 0.9 10 9 1 0.9 10 9 1 0.9 10 9 1 0.9 10 9 1 0.9 10 9 1 0.9 10 9 1 0.9 10 9 1 0.9 10 9 1 0.9 10 10 0 1.0 10 10 0 1.0 10 9 1 0.9 10 9 1 0.9 10 9 1 0.9 10 9 1 0.9 10 9 1 0.9 10 9 1 0.9 10 9 1 0.9 10 9 1 0.9 10 10 0 1.0 10 9 1 0.9 10 9 1 0.9 10 9 1 0.9 10 9 1 0.9 10 9 1 0.9 10 9 1 0.9 10 9 1 0.9 10 9 1 0.9 10 9 1 0.9 10 9 1 0.9 10 9 1 0.9 10 9 1 0.9 10 10 0 1.0 10 9 1 0.9 10 9 1 0.9 
10 10 0 1.0 10 9 1 0.9 10 9 1 0.9 10 9 1 0.9 10 10 0 1.0 10 10 0 1.0 10 9 1 0.9 10 9 1 0.9 10 9 1 0.9 10 9 1 0.9 10 9 1 0.9 10 9 1 0.9 10 9 1 0.9 10 9 1 0.9 10 9 1 0.9 10 9 1 0.9 10 9 1 0.9 10 9 1 0.9 10 9 1 0.9 10 9 1 0.9 10 9 1 0.9 10 9 1 0.9 10 9 1 0.9 10 9 1 0.9 Now that we have changed the focus to only those rows that have number of heads out of 10 flips corresponding to 9 or more, we count how many of those there are. The function nrow gives how many entries are in this filtered data frame and lastly we calculate the proportion that are at least as extreme as our observed value of 9 by dividing by the number of total simulations (10,000). We can see that the observed statistic of 9 correct guesses is not a likely outcome assuming the null hypothesis is true. Only around 1% of the outcomes in our 10,000 simulations fall at or above 9 successes. We have evidence supporting the conclusion that the person is actually better than just guessing at random at determining whether milk has been added first or not. To better visualize this we can also make use of pink shading on the histogram corresponding to the \\(p\\)-value: library(ggplot2) ggplot(data = simGuesses, aes(x = factor(heads), fill = (heads &gt;= 9))) + geom_bar() + labs(x = &quot;heads&quot;) Figure 7.4: Barplot of heads with p-value highlighted This helps us better see just how few of the values of heads are at our observed value or more extreme. We’ll see in Chapters 7 and 8 that this idea of a \\(p\\)-value can be extended to the more traditional methods using normal and \\(t\\) distributions in the traditional way that introductory statistics has been presented. These traditional methods were used because statisticians haven’t always been able to do 10,000 simulations on the computer within seconds. We’ll elaborate on this more in these later chapters. 
Learning check (LC7.6) What is meant by “pseudo-random number generation?” (LC7.7) How can simulation be used to help us address the question of whether or not an observed result is statistically significant? (LC7.8) In Chapter ??, we noted that barplots should be used when creating a plot of categorical variables. Why are we using barplots to make a plot of a numerical variable heads in this chapter? 7.7 EXAMPLE: Comparing two means 7.7.1 Randomization/Permutation We will now focus on building hypotheses looking at the difference between two population means in an example. We will denote population means using the Greek symbol \\(\\mu\\) (pronounced “mu”). Thus, we will be looking to see if one group “out-performs” another group. This is quite possibly the most common type of statistical inference and serves as a basis for many other types of analyses when comparing two groups. Our null hypothesis will be of the form \\(H_0: \\mu_1 = \\mu_2\\), which can also be written as \\(H_0: \\mu_1 - \\mu_2 = 0\\). Our alternative hypothesis will be of the form \\(H_a: \\mu_1 \\star \\mu_2\\) (or \\(H_a: \\mu_1 - \\mu_2 \\, \\star \\, 0\\)) where \\(\\star\\) = \\(&lt;\\), \\(\\ne\\), or \\(&gt;\\) depending on the context of the problem. You needn’t focus on these new symbols too much at this point. It will just be a shortcut way for us to describe our hypotheses. As we saw earlier, simulation is a valuable tool when conducting inferences based on one population variable. We will see that the process of randomization (also known as permutation) will be valuable in conducting tests comparing quantitative values from two groups. 7.7.2 Comparing Action and Romance Movies The movies data set in the ggplot2movies package contains information on a large number of movies that have been rated by users of IMDB.com. We are interested in the question here of whether Action movies are rated higher on IMDB than Romance movies. 
We will first need to do a little bit of data manipulation using the ideas from Chapter ?? to get the data in the form that we would like: library(dplyr) library(ggplot2movies) (movies_trimmed &lt;- movies %&gt;% select(title, year, rating, Action, Romance)) ## # A tibble: 58,788 × 5 ## title year rating Action Romance ## &lt;chr&gt; &lt;int&gt; &lt;dbl&gt; &lt;int&gt; &lt;int&gt; ## 1 $ 1971 6.4 0 0 ## 2 $1000 a Touchdown 1939 6.0 0 0 ## 3 $21 a Day Once a Month 1941 8.2 0 0 ## 4 $40,000 1996 8.2 0 0 ## 5 $50,000 Climax Show, The 1975 3.4 0 0 ## 6 $pent 2000 4.3 0 0 ## 7 $windle 2002 5.3 1 0 ## 8 &#39;15&#39; 2002 6.7 0 0 ## 9 &#39;38 1987 6.6 0 0 ## 10 &#39;49-&#39;17 1917 6.0 0 0 ## # ... with 58,778 more rows Note that Action and Romance are binary variables here. To remove any overlap of movies (and potential confusion) that are both Action and Romance, we will remove them from our population: movies_trimmed &lt;- movies_trimmed %&gt;% filter(!(Action == 1 &amp; Romance == 1)) We will now create a new variable called genre that specifies whether a movie in our movies_trimmed data frame is an &quot;Action&quot; movie, a &quot;Romance&quot; movie, or &quot;Neither&quot;. We aren’t really interested in the &quot;Neither&quot; category here so we will exclude those rows as well. Lastly, the Action and Romance columns are not needed anymore since they are encoded in the genre column. movies_trimmed &lt;- movies_trimmed %&gt;% mutate(genre = ifelse(Action == 1, &quot;Action&quot;, ifelse(Romance == 1, &quot;Romance&quot;, &quot;Neither&quot;))) %&gt;% filter(genre != &quot;Neither&quot;) %&gt;% select(-Action, -Romance) We are left with 8878 movies in our population data set that focuses on only &quot;Action&quot; and &quot;Romance&quot; movies. Learning check (LC7.9) Why are the different genre variables stored as binary variables (1s and 0s) instead of just listing the genre as a column of values like “Action”, “Comedy”, etc.? 
(LC7.10) What complications could come about from us excluding action romance movies? Should we question the results of our hypothesis test? Explain. Let’s now visualize the distributions of rating across both levels of genre. Think about what type(s) of plot is/are appropriate here before you proceed: library(ggplot2) ggplot(data = movies_trimmed, aes(x = genre, y = rating)) + geom_boxplot() Figure 7.5: Rating vs genre in the population We can see that the middle 50% of ratings for &quot;Action&quot; movies is more spread out than that of &quot;Romance&quot; movies in the population. &quot;Romance&quot; has outliers at both the top and bottom of the scale though. We are initially interested in comparing the mean rating across these two groups so a faceted histogram may also be useful: ggplot(data = movies_trimmed, mapping = aes(x = rating)) + geom_histogram(binwidth = 1, color = &quot;white&quot;, fill = &quot;dodgerblue&quot;) + facet_grid(genre ~ .) Figure 7.6: Faceted histogram of genre vs rating Important note: Remember that we hardly ever have access to the population values as we do here. This example and the nycflights13 data set were used to create a common flow from chapter to chapter. In nearly all circumstances, we’ll be needing to use only a sample of the population to try to infer conclusions about the unknown population parameter values. These examples do show a nice relationship between statistics (where data is usually small and more focused on experimental settings) and data science (where data is frequently large and collected without experimental conditions). 7.7.3 Sampling \\(\\rightarrow\\) Randomization We can use hypothesis testing to investigate ways to determine, for example, whether a treatment has an effect over a control and other ways to statistically analyze if one group performs better than, worse than, or different than another. We will also use confidence intervals to determine the size of the effect if it exists. 
You’ll see more on this in Chapter 8. We are interested here in seeing how we can use a random sample of action movies and a random sample of romance movies from movies to determine if a statistical difference exists in the mean ratings of each group. Learning check (LC7.11) Define the relevant parameters here in terms of the populations of movies. 7.7.4 Data Let’s select a random sample of 34 action movies and a random sample of 34 romance movies. (The number 34 was chosen somewhat arbitrarily here.) library(dplyr) library(mosaic) set.seed(2016) movies_genre_sample &lt;- movies_trimmed %&gt;% group_by(genre) %&gt;% sample_n(34) We can now observe the distributions of our two sample ratings for both groups. Remember that these plots should be rough approximations of our population distributions of movie ratings for &quot;Action&quot; and &quot;Romance&quot; in our population of all movies in the movies data frame. ggplot(data = movies_genre_sample, aes(x = genre, y = rating)) + geom_boxplot() Figure 7.7: Genre vs rating for our sample ggplot(data = movies_genre_sample, mapping = aes(x = rating)) + geom_histogram(binwidth = 1, color = &quot;white&quot;, fill = &quot;dodgerblue&quot;) + facet_grid(genre ~ .) Figure 7.8: Genre vs rating for our sample as faceted histogram Learning check (LC7.12) What single value could we change to improve the approximation using the sample distribution on the population distribution? Do we have reason to believe, based on the sample distributions of rating over the two groups of genre, that there is a significant difference between the mean rating for action movies compared to romance movies? It’s hard to say just based on the plots. The boxplot does show that the median sample rating is higher for romance movies, but the histogram isn’t as clear. The two groups have somewhat differently shaped distributions but they are both over similar values of rating. 
It’s often useful to calculate the mean and standard deviation as well, conditioned on the two levels. summary_ratings &lt;- movies_genre_sample %&gt;% group_by(genre) %&gt;% summarize(mean = mean(rating), std_dev = sd(rating), n = n()) summary_ratings ## # A tibble: 2 × 4 ## genre mean std_dev n ## &lt;chr&gt; &lt;dbl&gt; &lt;dbl&gt; &lt;int&gt; ## 1 Action 5.197059 1.464837 34 ## 2 Romance 6.026471 1.202096 34 Learning check (LC7.13) Why did we not specify na.rm = TRUE here as we did in Chapter ??? We see that the sample mean rating for romance movies, \\(\\bar{x}_{r}\\), is greater than the similar measure for action movies, \\(\\bar{x}_a\\). But is it statistically significantly greater (thus, leading us to conclude that the means are statistically different)? The standard deviation can provide some insight here but with these standard deviations being so similar it’s still hard to say for sure. Learning check (LC7.14) Why might the standard deviation provide some insight about the means being statistically different or not? 7.7.5 Model of \\(H_0\\) The hypotheses we specified can also be written in another form to better give us an idea of what we will be simulating to create our null distribution. \\(H_0: \\mu_r - \\mu_a = 0\\) \\(H_a: \\mu_r - \\mu_a \\ne 0\\) 7.7.6 Test Statistic \\(\\delta\\) We are, therefore, interested in seeing whether the difference in the sample means, \\(\\bar{x}_r - \\bar{x}_a\\), is statistically different than 0. R has a built-in command that can calculate the difference in these two sample means. 7.7.7 Observed effect \\(\\delta^*\\) mean_ratings &lt;- movies_genre_sample %&gt;% group_by(genre) %&gt;% summarize(mean = mean(rating)) obs_diff &lt;- diff(mean_ratings$mean) We see here that the diff function calculates \\(\\bar{x}_r - \\bar{x}_a = 6.0264706 - 5.1970588 = 0.8294118\\). We will now proceed similarly to how we conducted the hypothesis test above for the Lady Tasting Tea using simulation. 
Our goal is to figure out a random process with which to simulate the null hypothesis being true. Earlier in this chapter, we used flipping of a fair coin as the random process we were simulating with the null hypothesis being true (\\(H_0: \\tau = 5\\)). 7.7.8 Simulated Data Tactile simulation Here, with us assuming the two population means are equal (\\(H_0: \\mu_r - \\mu_a = 0\\)), we can look at this from a tactile point of view by using index cards. There are \\(n_r = 34\\) data elements corresponding to romance movies and \\(n_a = 34\\) for action movies. We can write the 34 ratings from our sample for romance movies on one set of 34 index cards and the 34 ratings for action movies on another set of 34 index cards. (Note that the sample sizes need not be the same.) The next step is to put the two stacks of index cards together, creating a new set of 68 cards. If we assume that the two population means are equal, we are saying that there is no association between ratings and genre (romance vs action). We can use the index cards to create two new stacks for romance and action movies. First, we must shuffle all the cards thoroughly. After doing so, in this case with equal values of sample sizes, we split the deck in half. We then calculate the new sample mean rating of the romance deck, and also the new sample mean rating of the action deck. This creates one simulation of the samples that were collected originally. We next want to calculate a statistic from these two samples. Instead of actually doing the calculation using index cards, we can use R as we have before to simulate this process. Learning check (LC7.15) How would the tactile shuffling of index cards change if we had different samples of say 20 action movies and 60 romance movies? Describe each step that would change. (LC7.16) Why are we taking the difference in the means of the cards in the new shuffled decks? 
library(mosaic) shuffled_ratings &lt;- movies_trimmed %&gt;% mutate(genre = shuffle(genre)) %&gt;% group_by(genre) %&gt;% summarize(mean = mean(rating)) diff(shuffled_ratings$mean) ## [1] -0.0170207 7.7.9 Distribution of \\(\\delta\\) under \\(H_0\\) The only new command here is shuffle from the mosaic package, which does what we would expect it to do. It simulates a shuffling of the ratings between the two levels of genre just as we could have done with index cards. We can now proceed in a similar way to what we have done previously with the Lady Tasting Tea example by repeating this process many times to create a null distribution of simulated differences in sample means. set.seed(2016) many_shuffles &lt;- do(10000) * (movies_trimmed %&gt;% mutate(rating = shuffle(rating)) %&gt;% group_by(genre) %&gt;% summarize(mean = mean(rating)) ) It is a good idea here to View the many_shuffles data frame via View(many_shuffles). We need to figure out a way to subtract the first value of mean from the second value of mean for each of the 10,000 simulations. This is a little tricky but the group_by function comes to our rescue here: rand_distn &lt;- many_shuffles %&gt;% group_by(.index) %&gt;% summarize(diffmean = diff(mean)) We can now plot the distribution of these simulated differences in means: ggplot(data = rand_distn, aes(x = diffmean)) + geom_histogram(color = &quot;white&quot;, bins = 20) Figure 7.9: Simulated differences in means histogram 7.7.10 The p-value Remember that we are interested in seeing where our observed sample mean difference of 0.8294118 falls on this null/randomization distribution. We are interested in simply a difference here so “more extreme” corresponds to values in both tails on the distribution. 
Let’s shade our null distribution to show a visual representation of our \\(p\\)-value: ggplot(data = rand_distn, aes(x = diffmean, fill = (abs(diffmean) &gt;= obs_diff))) + geom_histogram(color = &quot;white&quot;, bins = 20) Figure 7.10: Shaded histogram to show p-value You may initially think there is an error here, but remember that the observed difference in means was 0.8294118. It falls far outside the range of simulated differences. We can add a vertical line to represent both it and its negative (since this is a two-tailed test) instead: ggplot(data = rand_distn, aes(x = diffmean)) + geom_histogram(color = &quot;white&quot;, bins = 100) + geom_vline(xintercept = obs_diff, color = &quot;red&quot;) + geom_vline(xintercept = -obs_diff, color = &quot;red&quot;) Figure 7.11: Histogram with vertical lines corresponding to observed statistic Based on this plot, we have no values as extreme or more extreme than our observed effect in both directions so we have evidence supporting the conclusion that the mean rating for romance movies is different from that of action movies. (It doesn’t really matter what significance level was chosen in this case. Think about why.) The next important idea is to better understand just how much higher a mean rating we can expect the romance movies to have compared to that of action movies. This can be addressed by creating a 95% confidence interval as we will explore in Chapter 8. Learning check (LC7.17) Conduct the same analysis comparing action movies versus romantic movies using the median rating instead of the mean rating. Make sure to use the %&gt;% as much as possible. What was different and what was the same? (LC7.18) What conclusions can you make from viewing the faceted histogram looking at rating versus genre that you couldn’t see when looking at the boxplot? 
(LC7.19) Describe in a paragraph how we used Allen Downey’s diagram to conclude if a statistical difference existed between mean movie ratings for action and romance movies. (LC7.20) Why are we relatively confident that the distributions of the sample ratings will be good approximations of the population distributions of ratings for the two genres? (LC7.21) Using the definition of “\\(p\\)-value”, write in words what the \\(p\\)-value represents for the hypothesis test above comparing the mean rating of romance to action movies. (LC7.22) What is the value of the \\(p\\)-value for the hypothesis test comparing the mean rating of romance to action movies? (LC7.23) Do the results of the hypothesis test match up with the original plots we made looking at the population of movies? Why or why not? 7.7.11 Summary To review, these are the steps one would take whenever you’d like to do a hypothesis test comparing values from the distributions of two groups: Simulate many samples using a random process that matches the way the original data were collected and that assumes the null hypothesis is true. Collect the values of a sample statistic for each sample created using this random process to build a randomization distribution. Assess the significance of the original sample by determining where its sample statistic lies in the randomization distribution. If the proportion of values as extreme or more extreme than the observed statistic in the randomization distribution is smaller than the pre-determined significance level \\(\\alpha\\), we reject \\(H_0\\). Otherwise, we fail to reject \\(H_0\\). (If no significance level is given, one can assume \\(\\alpha = 0.05\\).) 7.8 Building theory-based methods using computation As a point of reference, we will now discuss the traditional theory-based way to conduct the hypothesis test for determining if there is a statistically significant difference in the sample mean rating of Action movies versus Romance movies. 
This method and ones like it work very well when the assumptions are met in order to run the test. They are based on probability models and distributions such as the normal and \\(t\\)-distributions. These traditional methods have been used for many decades back to the time when researchers didn’t have access to computers that could run 10,000 simulations in under a minute. They had to base their methods on probability theory instead. Many fields and researchers continue to use these methods and that is the biggest reason for their inclusion here. It’s important to remember that a \\(t\\)-test or a \\(z\\)-test is really just an approximation of what you have seen in this chapter already using simulation and randomization. The focus here is on understanding how the shape of the \\(t\\)-curve comes about without digging deep into the mathematical underpinnings. 7.8.1 EXAMPLE: \\(t\\)-test for two independent samples What is commonly done in statistics is the process of normalization. What this entails is calculating the mean and standard deviation of a variable. Then you subtract the mean from each value of your variable and divide by the standard deviation. The most common normalization is known as the \\(z\\)-score. The formula for a \\(z\\)-score is \\[Z = \\frac{x - \\mu}{\\sigma}\\], where \\(x\\) represents the value of a variable, \\(\\mu\\) represents the mean of the variable, and \\(\\sigma\\) represents the standard deviation of the variable. \\(z\\)-scores are normally distributed with mean 0 and standard deviation 1. They have the common, bell-shaped pattern. Recall that we hardly ever know the mean and standard deviation of the population of interest. This is almost always the case when considering the means of two independent groups. To help account for us not knowing the population parameter values, we can use the sample statistics instead, but this comes with a bit of a price in terms of complexity. 
Another form of normalization occurs when we need to use the sample standard deviations as estimates for the unknown population standard deviations. This normalization is often called the \\(t\\)-score. For the two independent samples case like what we have for comparing action movies to romance movies, the formula is \\[T =\\dfrac{ (\\bar{x}_1 - \\bar{x}_2) - (\\mu_1 - \\mu_2)}{ \\sqrt{\\dfrac{s_1^2}{n_1} + \\dfrac{s_2^2}{n_2}} }\\] There is a lot to try to unpack here. \\(\\bar{x}_1\\) is the sample mean response of the first group \\(\\bar{x}_2\\) is the sample mean response of the second group \\(\\mu_1\\) is the population mean response of the first group \\(\\mu_2\\) is the population mean response of the second group \\(s_1\\) is the sample standard deviation of the response of the first group \\(s_2\\) is the sample standard deviation of the response of the second group \\(n_1\\) is the sample size of the first group \\(n_2\\) is the sample size of the second group Assuming that the null hypothesis is true (\\(H_0: \\mu_1 - \\mu_2 = 0\\)), \\(T\\) is said to be distributed following a \\(t\\) distribution with degrees of freedom equal to the smaller value of \\(n_1 - 1\\) and \\(n_2 - 1\\). The “degrees of freedom” can be thought of as measuring how different the \\(t\\) distribution will be as compared to a normal distribution. Small sample sizes lead to small degrees of freedom and, thus, \\(t\\) distributions that have more values in the tails of their distributions. Large sample sizes lead to large degrees of freedom and, thus, \\(t\\) distributions that closely align with the standard normal, bell-shaped curve. So, assuming \\(H_0\\) is true, our formula simplifies a bit: \\[T =\\dfrac{ \\bar{x}_1 - \\bar{x}_2}{ \\sqrt{\\dfrac{s_1^2}{n_1} + \\dfrac{s_2^2}{n_2}} }\\] We have already built an approximation for what we think the distribution of \\(\\delta = \\bar{x}_1 - \\bar{x}_2\\) looks like using randomization above. 
Recall this distribution: ggplot(data = rand_distn, aes(x = diffmean)) + geom_histogram(color = &quot;white&quot;, bins = 20) Figure 7.12: Simulated differences in means histogram If we’d like to have a guess as to what the distribution of \\(T\\) might look like instead, we need only to divide every value in rand_distn by \\(\\sqrt{\\dfrac{s_1^2}{n_1} + \\dfrac{s_2^2}{n_2}}\\). As we did before, we will assign Romance to be group 1 and Action to be group 2. (This was done since Romance comes second alphabetically and the reason why we have a number mismatch below with 1 and 2.) Remember that we’ve already calculated these values: kable(summary_ratings) genre mean std_dev n Action 5.197059 1.464837 34 Romance 6.026471 1.202096 34 We will create some shortcuts here so you can see the value being calculated for the denominator of \\(T\\). s1 &lt;- summary_ratings$std_dev[2] s2 &lt;- summary_ratings$std_dev[1] n1 &lt;- summary_ratings$n[2] n2 &lt;- summary_ratings$n[1] Here, we have \\(s_1 = 1.2020964\\), \\(s_2 = 1.4648374\\), \\(n_1 = 34\\), and \\(n_2 = 34\\). We can calculate the denominator via denom_T &lt;- sqrt( (s1^2 / n1) + (s2^2 / n2) ) Now if we divide all of the values of diffmean in rand_distn by denom_T we can have a simulated distribution of \\(T\\) test statistics instead: rand_distn &lt;- rand_distn %&gt;% mutate(t_stat = diffmean / denom_T) ggplot(data = rand_distn, aes(x = t_stat)) + geom_histogram(color = &quot;white&quot;, bins = 20) Figure 7.13: Simulated T statistics histogram We see that the shape of this distribution is the same as that of diffmean. The scale has changed though with t_stat having more spread than diffmean, since we divided by a value of denom_T that is less than 1. A traditional \\(t\\)-test doesn’t look at this simulated distribution, but instead it looks at the \\(t\\)-curve with degrees of freedom equal to 33 (the minimum of \\(n_1 - 1 = 34 - 1 = 33\\) and \\(n_2 - 1 = 34 - 1 = 33\\)). 
We now overlay what this \\(t\\)-curve looks like on top of the histogram showing the simulated \\(T\\) statistics. ggplot(data = rand_distn, mapping = aes(x = t_stat)) + geom_histogram(aes(y = ..density..), color = &quot;white&quot;, binwidth = 0.1) + stat_function(fun = dt, args = list(df = min(n1 - 1, n2 - 1)), color = &quot;royalblue&quot;, size = 2) We can see that the curve does a good job of approximating the randomization distribution here. (More on when to expect for this to be the case when we discuss conditions for the \\(t\\)-test in a bit.) To calculate the \\(p\\)-value in this case, we need to figure out how much of the total area under the \\(t\\)-curve is at our observed \\(T\\)-statistic or more, plus also adding the area under the curve at the negative value of the observed \\(T\\)-statistic or below. (Remember this is a two-tailed test so we are looking for a difference–values in the tails of either direction.) Just as we converted all of the simulated values to \\(T\\)-statistics, we must also do so for our observed effect \\(\\delta^*\\): (t_obs &lt;- obs_diff / denom_T) ## [1] 2.552202 So graphically we are interested in finding the percentage of values that are at or above 2.5522017 or at or below -2.5522017. ggplot(data = rand_distn, mapping = aes(x = t_stat)) + stat_function(fun = dt, args = list(df = min(n1 - 1, n2 - 1)), color = &quot;royalblue&quot;, size = 2) + geom_vline(xintercept = t_obs, color = &quot;red&quot;) + geom_vline(xintercept = -t_obs, color = &quot;red&quot;) At this point, you should make a guess as to what a reasonable value may be for the \\(p\\)-value. Let’s say the \\(p\\)-value is 0.01 or so. To actually perform this calculation by hand, you’d need to do some calculus. Let’s have R do it for us instead using the pt function. 
pt(t_obs, df = min(n1 - 1, n2 - 1), lower.tail = FALSE) + pt(-t_obs, df = min(n1 - 1, n2 - 1), lower.tail = TRUE) ## [1] 0.01551859 7.8.2 Conditions for t-test In order for the results of the \\(t\\)-test to be valid, three conditions must be met: Independent observations in both samples Nearly normal populations OR large sample sizes (\\(n \\ge 30\\)) Independently selected samples Condition 1: This is met since we sampled at random using R from our population. Condition 2: Recall from Figure 7.6, that we know how the populations are distributed. Both of them are close to normally distributed. If we are a little concerned about this assumption, we also do have samples of size larger than 30 (\\(n_1 = n_2 = 34\\)). Condition 3: This is met since there is no natural pairing of a movie in the Action group to a movie in the Romance group. Since all three conditions are met, we can be reasonably certain that the theory-based test will match the results of the randomization-based test using shuffling. Remember that theory-based tests can produce some incorrect results if these assumptions are not carefully checked. The only assumption for randomization and computational-based methods is that the sample is selected at random. They are our preference and we strongly believe they should be yours as well, but it’s important to also see how the theory-based tests can be done and used as an approximation for the computational techniques until at least more researchers are using these techniques that utilize the power of computers. An R script file of all R code used in this chapter is available here. 7.9 What’s to come? This chapter examined the basics of hypothesis testing with terminology and also an example of how to apply the “There is Only One Test” diagram to the Lady Tasting Tea example presented in Chapter ?? and to an example on comparing the IMDB ratings of action movies and romance movies. 
We’ll see in Chapter 8 how we can provide a range of possible values for an unknown population parameter instead of just running a Yes/No decision from a hypothesis test. We will see in Chapter ?? many of the same ideas we have seen with hypothesis testing and confidence intervals in the last two chapters. Regression is frequently associated both correctly and incorrectly with statistics and data analysis, so you’ll need to make sure you understand when it is appropriate and when it is not. "],
-["8-ci.html", "8 Confidence Intervals Needed packages 8.1 Bootstrapping 8.2 mean_rating 8.3 Relation to hypothesis testing 8.4 Effect size 8.5 Script of R code 8.6 What’s to come?", " 8 Confidence Intervals Definition: Confidence Interval A confidence interval gives a range of plausible values for a parameter. It depends on a specified confidence level with higher confidence levels corresponding to wider confidence intervals and lower confidence levels corresponding to narrower confidence intervals. Common confidence levels include 90%, 95%, and 99%. Usually we don’t just begin chapters with a definition, but confidence intervals are simple to define and play an important role in the sciences and any field that uses data. You can think of a confidence interval as playing the role of a net when fishing. Instead of just trying to catch a fish with a single spear (estimating an unknown parameter by using a single point estimate/statistic), we can use a net to try to provide a range of possible locations for the fish (use a range of possible values based around our statistic to make a plausible guess as to the location of the parameter). Needed packages library(dplyr) library(ggplot2) library(mosaic) library(knitr) 8.1 Bootstrapping Just as we did in Chapter 7 with the Lady Tasting Tea when making hypotheses about a population total with which we would like to test which one is more plausible, we can also use simulation to infer conclusions about a population quantitative statistic such as the mean. In this case, we will focus on constructing confidence intervals to produce plausible values for a population mean. (We can do a similar analysis for a population median or other summary measure as well.) Traditionally, the way to construct confidence intervals for a mean is to assume a normal distribution for the population or to invoke the Central Limit Theorem and get, what often appears to be magic, results. (This is similar to what was done in Section 7.8.) 
These methods are often not intuitive, especially for those that lack a strong mathematical background. They also come with their fair share of assumptions and often turn Statistics, a field that is full of tons of useful applications to many different fields and disciplines, into a robotic procedural-based topic. It doesn’t have to be that way! In this section, we will introduce the concept of bootstrapping. It will be a useful tool that will allow us to estimate the variability of our statistic from sample to sample. One neat feature of bootstrapping is that it enables us to approximate the sampling distribution and estimate the distribution’s standard deviation using ONLY the information in the one selected (original) sample. It sounds just as plagued with the magical type qualities of traditional theory-based inference on initial glance but we will see that it provides an intuitive and useful way to make inferences, especially when the samples are of medium to large size. To introduce the concept of bootstrapping, we again will use the movies data set in the ggplot2movies data frame. Remember that we load this data frame into R in much the same way as we loaded flights and weather from the nycflights13 package. library(ggplot2movies) data(movies, package = &quot;ggplot2movies&quot;) Recall that you can also glance at this data frame using the View function and look at the help documentation for movies using the ? function. We will explore many other features of this data set in the chapters to come, but here we will be focusing on the rating variable corresponding to the average IMDB user rating. You may notice that this data set is quite large: 58,788 movies have data collected about them here. This will correspond to our population of ALL movies. Remember from Chapter ?? that our population is rarely known. We use this data set as our population here to show you the power of bootstrapping in estimating population parameters. 
We’ll see how confidence intervals built using the bootstrap distribution do at including our population parameter of interest. Here we can actually calculate these values since our population is known, but remember that in general this isn’t the case. Let’s take a look at what the distribution of our population ratings looks like. We’ll see that we will use the distribution of our sample(s) as an estimate of this population histogram. movies %&gt;% ggplot(aes(x = rating)) + geom_histogram(color = &quot;white&quot;, bins = 20) Figure 8.1: Population ratings histogram Learning check (LC8.1) Why was a histogram chosen as the plot to make for the rating variable above? (LC8.2) Why does the shape of the rating histogram tell us about how IMDB users rate movies? What stands out about the plot? It’s important to think about what our goal is here. We would like to produce a confidence interval for the population mean rating. We will have to pretend for a moment that we don’t have all 58,788 movies. Let’s say that we only have a random sample of 50 movies from this data set instead. In order to get a random sample, we can use the resample function in the mosaic package with replace = FALSE. We could also use the sample_n function from dplyr. set.seed(2017) library(mosaic) movies_sample &lt;- movies %&gt;% resample(size = 50, replace = FALSE) The resample function has filtered the data frame movies “at random” to choose only 50 rows from the larger movies data frame. We store information on these 50 movies in the movies_sample data frame. Let’s now explore what the rating variable looks like for these 50 movies: movies_sample %&gt;% ggplot(aes(x = rating)) + geom_histogram(color = &quot;white&quot;, bins = 20) Figure 8.2: Sample ratings histogram Remember that we can think of this histogram as an estimate of our population distribution histogram that we saw above. We are interested in the population mean rating and trying to find a range of plausible values for that value. 
A good start in guessing the population mean is to use the mean of our sample rating from the movies_sample data: (movies_sample_mean &lt;- movies_sample %&gt;% summarize(mean = mean(rating))) ## # A tibble: 1 × 1 ## mean ## &lt;dbl&gt; ## 1 5.894 Note the use of the ( ) at the beginning and the end of this creation of the movies_sample_mean object. If you’d like to print out your newly created object, you can enclose it in the parentheses as we have here. This value of 5.894 is just one guess at the population mean. The idea behind bootstrapping is to sample with replacement from the original sample to create new resamples of the same size as our original sample. Returning to our example, let’s investigate what one such resample of the movies_sample data set accomplishes. We can create one resample/bootstrap sample by using the resample function in the mosaic package. library(mosaic) boot1 &lt;- resample(movies_sample) %&gt;% arrange(orig.id) The important thing to note here is the original row numbers from the movies_sample data frame in the far right column called orig.ids. Since we are sampling with replacement, there is a strong likelihood that some of the 50 observational units are going to be selected again. You may be asking yourself what does this mean and how does this lead us to creating a distribution for the sample mean. Recall that the original sample mean of our data was calculated using the summarize function above. Learning check (LC8.3) What happens if we change the seed to our pseudo-random generation? Try it above when we used resample to describe the resulting movies_sample. (LC8.4) Why is sampling at random important from the movies data frame? Why don’t we just pick Action movies and do bootstrapping with this Action movies subset? (LC8.5) What was the purpose of assuming we didn’t have access to the full movies data set here? Before we had a calculated mean in our original sample of 5.894. 
Let’s calculate the mean of ratings in our bootstrapped sample: (movies_boot1_mean &lt;- boot1 %&gt;% summarize(mean = mean(rating))) ## # A tibble: 1 × 1 ## mean ## &lt;dbl&gt; ## 1 5.686 More than likely the calculated bootstrap sample mean is different than the original sample mean. This is what was meant earlier by the sample means having some variability. What we are trying to do is replicate many different samples being taken from a larger population. Our best guess at what the population looks like is multiple copies of the sample we collected. We then can sample from that larger “created” population by generating bootstrap samples. Similar to what we did in the previous section, we can repeat this process using the do function followed by an asterisk. Let’s look at 10 different bootstrap means for ratings from movies_sample. Note the use of the resample function here. do(10) * (resample(movies_sample) %&gt;% summarize(mean = mean(rating))) ## mean ## 1 5.942 ## 2 5.572 ## 3 5.828 ## 4 6.292 ## 5 6.032 ## 6 5.920 ## 7 5.996 ## 8 5.852 ## 9 6.098 ## 10 5.608 You should see some variability begin to tease its way out here. Many of the simulated means will be close to our original sample mean but many will stray pretty far away. This occurs because outliers may have been selected a couple of times in the resampling or small values were selected more than larger. There are myriad reasons why this might be the case. So what’s the next step now? Just as we repeated the repetitions thousands of times with the “Lady Tasting Tea” example, we can do a similar thing here. trials &lt;- do(10000) * summarize(resample(movies_sample), mean = mean(rating)) ggplot(data = trials, mapping = aes(x = mean)) + geom_histogram(bins = 30, color = &quot;white&quot;) Figure 8.3: Bootstrapped means histogram The shape of this resulting distribution may look familiar to you. It resembles the well-known normal (bell-shaped) curve. 
At this point, we can easily calculate a confidence interval. In fact, we have a couple different options. We will first use the percentiles of the distribution we just created to isolate the middle 95% of values. This will correspond to our 95% confidence interval for the population mean rating, denoted by \\(\\mu\\). (ciq_mean_rating &lt;- confint(trials, level = 0.95, method = &quot;quantile&quot;)) ## name lower upper level method estimate ## 1 mean 5.456 6.296 0.95 percentile 5.894 It’s always important at this point to interpret the results of this confidence interval calculation. In this context, we can say something like the following: Based on the sample data and bootstrapping techniques, we can be 95% confident that the true mean rating of ALL IMDB ratings is between 5.456 and 6.296. This statement may seem a little confusing to you. Another way to think about this is that this confidence interval was constructed using the sample data by a procedure that is 95% reliable. We will get invalid results 5% of the time. Just as we had a trade-off with \\(\\alpha\\) and \\(\\beta\\) with hypothesis tests, we have a similar trade-off here with setting the confidence level. To further reiterate this point, the graphic below from Diez, Barr, and Çetinkaya-Rundel (2014) shows us that if we repeated a confidence interval process 25 times with 25 different samples, we would expect about 95% of them to actually contain the population parameter of interest. This parameter is marked with a dotted vertical line. We can see that only one confidence interval does not overlap with this value. (The one marked in red.) Therefore 24 in 25 (96%), which is quite close to our 95% reliability, do include the population parameter. Figure 8.4: Confidence interval coverage plot from OpenIntro Remember that we are pretending like we don’t know what the mean IMDB rating for ALL movies is. Our population here is all of the movies listed in the movies data frame from ggplot2movies. 
So does our bootstrapped confidence interval here contain the actual mean value? movies %&gt;% summarize(mean_rating = mean(rating)) %&gt;% kable() 8.2 mean_rating 5.93285 We see here that the population mean does fall in our range of plausible values generated from the bootstrapped samples. We can also get an idea of how the theory-based inference techniques would have approximated this confidence interval by using the formula \\[\\bar{x} \\pm (2 * SE),\\] where \\(\\bar{x}\\) is our original sample mean and \\(SE\\) stands for standard error and corresponds to the standard deviation of the bootstrap distribution. The value of 2 here corresponds to it being a 95% confidence interval. This formula assumes that the bootstrap distribution is symmetric and bell-shaped. This is often the case with bootstrap distributions, especially those in which the original distribution of the sample is not highly skewed. Definition: standard error The standard error is the standard deviation of the sampling distribution. The sampling distribution may be approximated by the bootstrap distribution or the null distribution depending on the context. Traditional theory-based methodologies for inference also have formulas for standard errors assuming some conditions are met. To compute this type of confidence interval, we only need to make a slight modification to the confint function seen above. (The expression after the \\(\\pm\\) sign is known as the margin of error.) (cise_mean_rating &lt;- confint(trials, level = 0.95, method = &quot;stderr&quot;)) ## name lower upper level method estimate margin.of.error ## 1 mean 5.468465 6.314277 0.95 stderr 5.894 0.4229063 Based on the sample data and bootstrapping techniques, we can be 95% confident that the true mean rating of ALL IMDB ratings is between 5.4684649 and 6.3142775. Learning check (LC8.6) Reproduce the bootstrapping above using a sample of size 50 instead of 25. What changes do you see? 
(LC8.7) Reproduce the bootstrapping above using a sample of size 5 instead of 25. What changes do you see? (LC8.8) How does the sample size affect the analysis? (LC8.9) Why must bootstrap samples be the same size as the original sample? 8.2.1 Review of Bootstrapping We can summarize the process to generate a bootstrap distribution here in a series of steps that clearly identify the terminology we will use (R. Lock et al. 2012). Generate bootstrap samples by sampling with replacement from the original sample, using the same sample size. Compute the statistic of interest, called a bootstrap statistic, for each of the bootstrap samples. Collect the statistics for many bootstrap samples to create a bootstrap distribution. Visually, we can represent this process in the following diagram. Figure 8.5: Bootstrapping diagram from Lock5 textbook 8.3 Relation to hypothesis testing Recall that we found a statistically significant difference in the sample mean of romance movie ratings compared to the sample mean of action movie ratings. We concluded Chapter 7 by attempting to understand just how much greater we could expect the population mean romance movie rating to be as compared to the population mean action movie rating. In order to do so, we will calculate a confidence interval for the difference \\(\\mu_r - \\mu_a\\). We’ll then go back to our population parameter values and see if our confidence interval contains our parameter value. We could use bootstrapping in a way similar to that done above, except now on a difference in sample means, to create a distribution and then use the confint function with the option of quantile to determine a confidence interval for the plausible values of the difference in population means. This is an excellent programming activity and the reader is urged to try to do so. 
Recall what the randomization/null distribution looked like for our simulated shuffled sample means: library(ggplot2) library(dplyr) ggplot(data = rand_distn, mapping = aes(x = diffmean)) + geom_histogram(color = &quot;white&quot;, bins = 20) Figure 8.6: Simulated shuffled sample means histogram With this null distribution being quite symmetric and bell-shaped, the standard error method introduced above likely provides a good estimate of a range of plausible values for \\(\\mu_r - \\mu_a\\). Another nice option here is that we can use the standard deviation of the null/randomization distribution we just found with our hypothesis test. (std_err &lt;- rand_distn %&gt;% summarize(se = sd(diffmean))) ## # A tibble: 1 × 1 ## se ## &lt;dbl&gt; ## 1 0.03182225 Remembering that we can use the general formula of \\(statistic \\pm (2 * SE)\\) we get the following result for plausible values of the difference in population means at the 95% level. (lower &lt;- obs_diff - (2 * std_err)) ## se ## 1 0.7657673 (upper &lt;- obs_diff + (2 * std_err)) ## se ## 1 0.8930563 We can, therefore, say that we are 95% confident that the population mean rating for romance movies is between 0.766 and 0.893 points higher than for that of action movies. The important thing to check here is whether 0 is contained in the confidence interval. If it is, it is plausible that the difference in the two population means between the two groups is 0. This means that the null hypothesis is plausible. The results of the hypothesis test and the confidence interval should match as they do here. We rejected the null hypothesis with hypothesis testing and we have evidence here that the mean rating for romance movies is higher than for action movies. 8.4 Effect size The phrase effect size has been thrown around recently as an alternative to \\(p\\)-values. In combination with the confidence interval, it can be often more valuable than just looking at the results of a hypothesis test. 
It depends on the scientific discipline exactly what is meant by “effect size” but, in general, it refers to the magnitude of the difference between group measurements. For our two sample problem involving movies, it is the observed difference in sample means obs_diff. It’s worthy of mention here that confidence intervals are always centered at the observed statistic. In other words, if you are looking at a confidence interval and someone asks you what the “effect size” is you can simply find the midpoint of the stated confidence interval. Learning check (LC8.10) Check to see whether the difference in population mean ratings for the two genres falls in the confidence interval we found here. Are we guaranteed that it will fall in the range of plausible values? (LC8.11) Why do you think many scientific fields are shifting to preferring inclusion of confidence intervals in articles over just \\(p\\)-values and hypothesis tests? (LC8.12) Why is 95% related to a value of 2 in the margin of error? What would approximate values be for 90% and for 99%? (LC8.13) Why is a 95% confidence interval wider than a 90% confidence interval? Explain by using a concrete example from everyday life about what is meant by “confidence.” (LC8.14) How would confidence intervals correspond to one-sided hypothesis tests? (LC8.15) There is a relationship between the significance level and the confidence level. What do you think it is? (LC8.16) The moment the phrase “standard error” is mentioned, there seems to be someone that says “The standard error is \\(s\\) divided by the square root of \\(n\\).” This standard error formula is correct and used in the theory-based procedure for an inference on one mean. But… does it always work? 
For samp1, samp2, and samp3 below, do the following: produce a bootstrap distribution based on the sample calculate the standard deviation of the bootstrap distribution compare this value of the standard error to what you obtain when you calculate the standard deviation of the sample \\(s\\) divided by \\(\\sqrt{n}\\). df1 &lt;- data_frame(samp1 = rexp(50)) df2 &lt;- data_frame(samp2 = rnorm(100)) df3 &lt;- data_frame(samp3 = rbeta(20,5,5)) Describe how \\(s / \\sqrt{n}\\) does in approximating the standard error for these three samples and their corresponding bootstrap distributions. 8.5 Script of R code An R script file of all R code used in this chapter is available here. 8.6 What’s to come? We will see in Chapter ?? many of the same ideas we have seen with hypothesis testing and confidence intervals in the last two chapters. Regression is frequently associated both correctly and incorrectly with statistics and data analysis, so you’ll need to make sure you understand when it is appropriate and when it is not. References "],
-["9-regression-via-broom.html", "9 Regression via broom Needed packages 9.1 EXAMPLE: Alaskan Airlines delays 9.2 Correlation 9.3 Linear regression 9.4 Inference for regression 9.5 Residual analysis 9.6 Conditions for regression 9.7 Script of R code 9.8 What’s to come?", " 9 Regression via broom One of the most commonly used statistical procedures is regression. Regression, in its simplest form, focuses on trying to predict values of one numerical variable based on the values of another numerical variable using a straight line fit to data. We saw in Chapters 7 and 8 an example of analyses using a categorical predictor (movie genre–action or romance) and a numerical response (movie rating). In this chapter, we will focus on going back to the flights data frame in the nycflights13 package to look at the relationship between departure delay and arrival delay. We will also discuss the concept of correlation and how it is frequently incorrectly implied to also lead to causation. This chapter also introduces the broom package, which is a useful tool in summarizing the results of model fits in tidy format. You will see examples of the tidy, glance, and augment functions with linear regression. Needed packages library(mosaic) library(dplyr) library(ggplot2) library(knitr) library(broom) library(nycflights13) 9.1 EXAMPLE: Alaskan Airlines delays We’ll next explore the relationship/association of departure delays and arrival delays for a sample of 100 flights departing from New York City in 2013 with Alaskan Airlines. 
library(nycflights13) data(flights) set.seed(2017) # Load Alaska data, deleting rows that have missing departure delay # or arrival delay data alaska_flights &lt;- flights %&gt;% filter(carrier == &quot;AS&quot;) %&gt;% filter(!is.na(dep_delay) &amp; !is.na(arr_delay)) %&gt;% resample(size = 50, replace = FALSE) ggplot(data = alaska_flights, mapping = aes(x = dep_delay, y = arr_delay)) + geom_point() Figure 9.1: Departure and Arrival Flight Delays for a sample of 50 Alaskan flights from NYC Learning check (LC9.1) Does there appear to be a linear relationship with arrival delay and departure delay? In other words, could you fit a line to the data and have it explain how arr_delay increases as dep_delay increases? (LC9.2) Is there only one possible line that fits the data “well”? How could you decide on which one is best if there are multiple options? 9.2 Correlation One way to measure the linearity between two numerical variables is by using correlation. In fact, the correlation coefficient is defined as just that. Definition: Correlation Coefficient The correlation coefficient measures the strength of linear association between two variables. Properties of the correlation coefficient: It is always between -1 and 1, inclusive, where -1 indicates perfect negative relationship 0 indicates no relationship +1 indicates perfect positive relationship Learning check (LC9.3) Make a guess as to the value of the correlation coefficient between arr_delay and dep_delay in the alaska_flights data frame. (LC9.4) Do you think that the correlation coefficient between arr_delay and dep_delay is the same as the correlation coefficient between dep_delay and arr_delay? Explain. We can look at a variety of different data sets and their corresponding correlation coefficients in the following plot. 
Figure 9.2: Different Correlation Coefficients We can calculate the correlation coefficient for our example of flight delays via alaska_flights %&gt;% summarize(correl = cor(dep_delay, arr_delay)) ## # A tibble: 1 × 1 ## correl ## &lt;dbl&gt; ## 1 0.7907993 The sample correlation coefficient is denoted by \\(r\\). In this case, \\(r = 0.7907993\\). Learning check (LC9.5) Would you quantify the value of correl calculated above as being strongly positively linear, weakly positively linear, not linear, weakly negatively linear, or strongly negatively linear? Discuss your choice. If you’d like a little more practice in determining the linear relationship between two variables by quantifying a correlation coefficient, you should check out the Guess the Correlation game online. 9.2.1 Correlation does not imply causation Just because arrival delays are related to departure delays in a somewhat linear fashion, we can’t say with certainty that arrival delays are caused entirely by departure delays. Certainly it appears that as one increases, the other tends to increase, but that might not always be the case. Causation is a tricky problem and frequently takes carefully designed experiments. These experiments remove confounding variables and only focus on the behavior of one variable in the presence of the levels of the other variable. Be careful as you read studies to make sure that the writers aren’t falling into this fallacy of correlation implying causation. If you spot one, you may want to send them a link to Spurious Correlations. Learning check (LC9.6) What are some other confounding variables besides departure delay that could contribute to an increase in arrival delays? Remember that a variable is something that has to vary! 9.3 Linear regression So we see above that there is a strong positive association between these delay variables. 
Let’s say that we are waiting for our flight to leave New York City on Alaskan and we are told that our flight is going to be delayed 25 minutes. What could we predict for our arrival delay based on the plot in Figure 9.1? It may be hard to pick a particular value here, especially after just going over confidence intervals in Chapter 8. One way to do this would be to fit a line that fits the data best and then use the predicted arr_delay value from that line for dep_delay = 25 as our prediction. But what is meant by “fits the data best”? The least squares/best fitting/linear regression line has been fit to the data below. Figure 9.3: Regression line fit on delays Here lm corresponds to “linear model” and we’ll see its use again in a bit when we find the values that define this line. 9.3.1 Understanding linear regression basics Let’s choose an arbitrary point on the graph and label it the color blue. Now consider this point’s deviation from the regression line. Do this for another point. And for another point. We could repeat this process for each of the points in our sample. The pattern that emerges here is that the regression line minimizes the sum of the squared arrow lengths (i.e., the least squares) for all of the points. As you look at these points you might think that a different line could fit the data better based on this criterion. That isn’t the case though and it can be shown via calculus (omitted here) that this line minimizes the sum of the squared residuals for these 50 points. 9.3.2 The equation of the line We can use R and the lm function to retrieve the equation of the line of best fit here in red. A simple linear regression such as this will produce two coefficients: one for the \\(y\\)-intercept and one for the slope. We can use the tidy function in the broom package to extract these coefficients from the model fit. 
delay_fit &lt;- lm(formula = arr_delay ~ dep_delay, data = alaska_flights) tidy(delay_fit) %&gt;% kable() term estimate std.error statistic p.value (Intercept) -14.155017 2.8094813 -5.038302 0.0000071 dep_delay 1.217666 0.1360336 8.951212 0.0000000 In general, the equation of the line of best fit for a sample is \\[\\hat{y} = b_0 + b_1 x\\]. Thus, our equation is \\(\\hat{y} = -14.1550165 + 1.2176658 \\, x\\). It is usually preferred to actually write the names of the variables instead of \\(x\\) and \\(y\\): \\[\\widehat{arr\\_delay} = -14.1550165 + 1.2176658 \\, dep\\_delay\\]. We can also extract the coefficients by using the coef function: coef(delay_fit) ## (Intercept) dep_delay ## -14.155016 1.217666 9.3.3 Interpreting the slope After you have determined your line of best fit, it is good practice to interpret the results to see if they make sense. Slope is defined as rise over run or the change in \\(y\\) for every one unit increase in \\(x\\). For our specific example, we can say that for every one minute increase in the departure delay of Alaskan Airlines flights from NYC, we can expect the corresponding arrival delay to be 1.22 minutes more. This estimate does make some practical sense. It would be strange if arrival delays went down as departure delays increased. We also expect that the longer a flight is delayed on departure, the longer it is likely to be delayed on arrival. Remember that we are also using data here to make a guess as to how the population of all Alaskan flights might behave with regards to departure delays and arrival delays, so just as with other sampling procedures there is also variability in the sample estimates for the regression line. 
9.3.4 Predicting values Getting back to our hypothetical flight that has been delayed 25 minutes, we can use the augment function in the broom package to get the fitted arrival delay value: delay_fit %&gt;% augment(newdata = data_frame(dep_delay = 25)) ## dep_delay .fitted .se.fit ## 1 25 16.28663 3.967287 Note the use of the data_frame function here, which must be used since newdata expects a data frame as its argument. We must also specify that we are plugging in 25 for the value of dep_delay here. We can see that the line predicted an arrival delay of 16.29 minutes based on our 25 minute departure delay. This also does make some sense since flights that aren’t delayed greatly from the beginning tend to make up time in the air to compensate. Important note: The correlation coefficient and the slope of the regression line are not the same thing. They will always share the same sign (positive correlation coefficients correspond to positive slope coefficients and the same holds true for negative values), but you can’t make any more conclusions about them than that. For example, say we have 3 groups of points: Their regression lines have different slopes, but \\(r = 1\\) for all 3. In other words, all three groups of points have a perfect (positive) linear relationship. 9.4 Inference for regression The population least squares line is defined by the formula \\(y = \\beta_0 + \\beta_1 x + \\epsilon\\). Here \\(\\epsilon\\) corresponds to the error term. It corresponds to the part of the response variable \\(y\\) that remains unexplained after considering the predictor variable \\(x\\). Often it is standard practice to assume that this error term follows a normal distribution. We will focus on checking whether that assumption is valid in Section 9.5. In the population least squares line \\(y = \\beta_0 + \\beta_1 x + \\epsilon\\), we can see that if \\(\\beta_1 = 0\\) there is no relationship between \\(x\\) and \\(y\\). 
If \\(\\beta_1 = 0\\), \\(y = \\beta_0 + \\epsilon\\). Therefore, \\(y\\) does not depend on \\(x\\) at all in the equation. A hypothesis test is frequently conducted to check whether a relationship exists between two numerical variables \\(x\\) and \\(y\\). We can also use the concept of shuffling to determine standard error and conduct a hypothesis test for a population slope. Let’s go back to our example on Alaskan flights that represent a sample of all Alaskan flights departing NYC in 2013. Let’s test to see if we have evidence that a positive relationship exists between the departure delay and arrival delay for Alaskan flights. We will set up this hypothesis testing process as we have each time before via the “There is Only One Test” diagram in Figure 7.1. 9.4.1 Data Our data is stored in alaska_flights and we are focused on the 50 measurements of dep_delay and arr_delay there. 9.4.2 Test Statistic \\(\\delta\\) Our test statistic here is the sample slope coefficient that we denote with \\(b_1\\). 9.4.3 Observed effect \\(\\delta^*\\) (b1_obs &lt;- tidy(delay_fit)$estimate[2]) ## [1] 1.217666 The calculated slope value from our observed sample is \\(b_1 = 1.2176658\\). 9.4.4 Model of \\(H_0\\) We are looking to see if a positive relationship exists so \\(H_A: \\beta_1 &gt; 0\\). Our null hypothesis is always in terms of equality so we have \\(H_0: \\beta_1 = 0\\). 9.4.5 Simulated Data Now to simulate the null hypothesis being true and recreating how our sample was created, we need to think about what it means for \\(\\beta_1\\) to be zero. If \\(\\beta_1 = 0\\), we said above that there is no relationship between the departure delay and arrival delay. If there is no relationship, then any one of the arrival delay values could have just as likely occurred with any of the other departure delay values instead of the one that it actually did fall with. We, therefore, have another example of shuffling in our simulating data. 
Tactile simulation We could use a deck of 100 note cards to create a tactile simulation of this shuffling process. We would write the 50 different values of departure delays on each of the 50 cards, one per card. We would then do the same thing for the 50 arrival delays putting them on one per card. Next, we would lay out each of the 50 departure delay cards and we would shuffle the arrival delay deck. Then, after shuffling the deck well, we would disperse the cards one per each one of the departure delay cards. We would then enter these new values in for arrival delay and compute a sample slope based on this shuffling. We could repeat this process many times, keeping track of our sample slope after each shuffle. 9.4.6 Distribution of \\(\\delta\\) under \\(H_0\\) We can build our randomization distribution in much the same way we did before using the do and shuffle functions. Here we will take advantage of the coef function we saw earlier to extract the slope and intercept coefficients. (Our focus will be on the slope here though.) rand_distn &lt;- mosaic::do(10000) * (lm(formula = shuffle(arr_delay) ~ dep_delay, data = alaska_flights) %&gt;% coef()) names(rand_distn) ## [1] &quot;Intercept&quot; &quot;dep_delay&quot; We see that the names of our columns are Intercept and dep_delay. We want to look at dep_delay since that corresponds to the slope coefficients. ggplot(data = rand_distn, mapping = aes(x = dep_delay)) + geom_histogram(color = &quot;white&quot;, bins = 20) 9.4.7 The p-value Recall that we want to see where our observed sample slope \\(\\delta^* = 1.2176658\\) falls on this distribution and then count all of the values to the right of it corresponding to \\(H_A: \\beta_1 &gt; 0\\). To get a sense for where our value falls, we can shade all values at least as big as \\(\\delta^*\\). 
ggplot(data = rand_distn, aes(x = dep_delay, fill = (dep_delay &gt;= b1_obs))) + geom_histogram(color = &quot;white&quot;, bins = 20) Figure 9.4: Shaded histogram to show p-value Since 1.2176658 falls far to the right of this plot, we can say that we have a \\(p\\)-value of 0. We, thus, have evidence to reject the null hypothesis in support of there being a positive association between the departure delay and arrival delay of all Alaskan flights from NYC in 2013. Learning check (LC9.7) Repeat the inference above but this time for the correlation coefficient instead of the slope. (LC9.8) Use bootstrapping with points to determine a range of possible values for the population slope comparing departure delays to arrival delays for Alaskan flights in 2013 from NYC. 9.5 Residual analysis The following diagram will help you to keep track of what is meant by a residual. Here, \\(y_i\\) is an observed value of the arr_delay variable. \\(i\\) ranges from 1 to 50. \\(\\hat{y}_i\\) is the fitted value–the arr_delay value that is being pointed to on the red line. The residual is \\[\\hat{\\epsilon}_i = y_i - \\hat{y}_i\\]. Note the order here! You start at the non-pointy end of the arrow (\\(y_i\\)) and then subtract away what comes at the point (\\(\\hat{y}_i\\)). 9.6 Conditions for regression In order for regression to be valid, we have three conditions to check: Equal variances across explanatory variable (Check residual plot for fan-shaped patterns.) Independent observations, errors, and predictor variables (Check residual plot for no time series-like patterns.) Nearly normal residuals (Check quantile-quantile plot of standardized residuals.) As you can see from the things to check after the conditions, residuals will play a large role in determining whether the conditions are met. Residuals are estimates for the error term \\(\\epsilon\\) we discussed earlier, and this is a big reason why they play an important role in validating regression assumptions. 
Residual plot To construct a residual plot we will analyze data from the augment function in broom. Specifically, we are interested in the .fitted and .resid variables there: fits &lt;- augment(delay_fit) ggplot(data = fits, mapping = aes(x = .fitted, y = .resid)) + geom_point() + geom_abline(intercept = 0, slope = 0, color = &quot;blue&quot;) Quantile-quantile plot ggplot(data = fits, mapping = aes(sample = .resid)) + stat_qq() Checking conditions: We are looking to see if the points are scattered about the blue line at 0 relatively evenly as we look from left to right. We have some reason for concern here as the large lump of values on the left are much more dispersed than those on the right. The second condition is invalidated if there is a trigonometric pattern of up and down throughout the residual plot. That is not the case here. We look at the quantile-quantile plot (Q-Q plot for short) for the third condition. We are looking to see if the residuals fall on a straight line with what we would expect if they were normally distributed. We see some curvature here as well. We should begin to wonder if regression was valid here with both condition 1 and condition 3 in question. We have reason to doubt whether a linear regression is valid here. Unfortunately, all too frequently regressions are run without checking these assumptions carefully. While small deviations in the assumptions can be OK, larger violations can completely invalidate the results and make any inferences improbable and questionable. 9.7 Script of R code An R script file of all R code used in this chapter is available here. 9.8 What’s to come? "],
-["10-effective-data-storytelling.html", "10 Effective Data Storytelling Concluding Remarks", " 10 Effective Data Storytelling As we’ve progressed throughout this book, you’ve seen how to work with data in a variety of ways. You’ve learned effective strategies for plotting data by understanding which types of plots work best for which combinations of variable types. You’ve summarized data in table form and calculated summary statistics for a variety of different variables. Further, you’ve seen the value of inference as a process to come to conclusions about a population by using a random sample. Lastly, you’ve explored how to use linear regression and the importance of checking the conditions required to make it a valid procedure. Throughout, you’ve learned many computational techniques and focused on reproducible research in writing R code and keeping track of your work in R Markdown. All of these steps go into making a great story using data. As the textbook comes to a close, we thought it best that you explore what stellar work is being produced by data journalists throughout the world that specialize in effective data storytelling. We recommend you read and analyze this article by Walt Hickey entitled The Dollar-And-Cents Case Against Hollywood’s Exclusion of Women. As you read over it, think carefully about how Walt is using his data, his graphics, and his analyses to paint the picture for the reader of what the story is he wants to tell. In the spirit of reproducibility, the members of 538 have also shared the data that they used to create this story and some R code here. Great data stories don’t mislead the reader, but rather engulf them in understanding the importance that data plays in our lives through the captivation of storytelling. Concluding Remarks If you’ve come to this point in the book, I’d suspect that you know a thing or two about how to work with data in R. 
You’ve also gained a lot of knowledge about how to use simulation techniques to determine statistical significance. The hope is that you’ve come to appreciate data manipulation, tidy data sets, and the power of statistical visualization. Actually, the data visualization part may be the most important thing here. If you can create truly beautiful graphics that display information in ways that the reader can clearly decipher, you’ve picked up a great skill. Let’s hope that that skill keeps you creating great stories with data into the near and far distant future. Thanks for coming along for the ride as we dove into modern data analysis using R! "],
+["index.html", "ModernDive 1 Preamble 1.1 Principles of this Book - For Instructors 1.2 Contribute 1.3 Getting Started - For Students Colophon", " ModernDive An Introduction to Statistical and Data Sciences via R Chester Ismay and Albert Y. Kim 2017-01-10 1 Preamble 1.1 Principles of this Book - For Instructors These are some principles we keep in mind. If you agree with them, this might be the book for you. Blur the lines between lecture and lab Laptops and open source software are rendering the lab/lecture dichotomy ever more archaic. It’s much harder for students to understand the importance of using the software if they only use it once a week or less. They forget the syntax in much the same way someone learning a foreign language forgets the rules. Focus on the entire data/science research pipeline Grolemund and Wickham’s graphic George Cobb argued for “Minimizing prerequisites to research” It’s all about data, data, data We leverage R packages for rich/complex, yet easy-to-load data sets. We’ve heard it before: “You can’t teach ggplot2 for data visualization in intro stats!” We, like David Robinson, are more optimistic and we’ve had success doing so. dplyr is a game changer for data manipulation: the verb describing your desired data action is the command name! Use simulation/resampling for intro stats, not probability/large sample approximation Reinforce concepts, not equations, formulas, and probability tables. To this end, we’re big fans of the mosaic package’s shuffle(), resample(), and do() functions for sampling and simulation. Don’t fence off students from the computation pool, throw them in! Don’t teach them coding/programming per se, but computational and algorithmic thinking. Drawing Venn diagrams delineating statistics, computer science, and data science is also ever more archaic; embrace computation! Complete reproducibility We find it frustrating when textbooks give examples but not the source code and the data itself. 
We not only give you the source code for all examples, but also the source code for the whole book! We encourage use of R Markdown to foster notions of reproducible research. Ultimately the best textbook is one you’ve written yourself You best know your audience, their background, and their priorities and you know best your own style and the types of examples and problems you like best. Customizability is the ultimate end. A new paradigm for textbooks? Versions, not editions? Pull requests, crowd-sourcing, and development versions? 1.2 Contribute This book is in beta testing and is currently at Version 0.1.1. If you would like to receive periodic updates on this book and other similar projects, please fill out this Google Form. The source code for this book is available for download/forking on GitHub. If you click on the release link near the top of the page there, you can download all of the source code for whichever release version you’d like to work with and use. If you find typos or other errors or have suggestions on how to better word something in the book, please create a pull request too! We also welcome issue creation. Let’s all work together to make this book as great as possible for as many students and instructors as possible. Please feel free to modify the book as you wish for your own needs! All we ask is that you list the authors field above as “Chester Ismay, Albert Y. Kim, and YOU!” We’d also appreciate if you let us know what changes you’ve made and how you’ve used the textbook. We’d love some data on what’s working well and what’s not working so well. 1.3 Getting Started - For Students This book was written using the bookdown R package from Yihui Xie (Xie 2016). In order to follow along and run the code in this book on your own, you’ll need to have access to R and RStudio. 
You can find more information on both of these with a simple Google search for “R” and for “RStudio.” An introduction to using R, RStudio, and R Markdown is also available in a free book here (Ismay 2016). It is recommended that you refer back to this book frequently as it has GIF screen recordings that you can follow along with as you learn. We will keep a running list of R packages you will need to have installed to complete the analysis as well here in the needed_pkgs character vector. You can check if you have all of the needed packages installed by running all of the lines below in the next chunk of R code. The last lines including the if will install them as needed (i.e., download their needed files from the internet to your hard drive and install them for your use). You can run the library function on them to load them into your current analysis. Prior to each analysis where a package is needed, you will see the corresponding library function in the text. Make sure to check the top of the chapter to see if a package was loaded there. needed_pkgs &lt;- c(&quot;nycflights13&quot;, &quot;dplyr&quot;, &quot;ggplot2&quot;, &quot;knitr&quot;, &quot;okcupiddata&quot;, &quot;dygraphs&quot;, &quot;rmarkdown&quot;, &quot;mosaic&quot;, &quot;ggplot2movies&quot;) new.pkgs &lt;- needed_pkgs[!(needed_pkgs %in% installed.packages())] if(length(new.pkgs)) { install.packages(new.pkgs, repos = &quot;http://cran.rstudio.com&quot;) } Colophon The source of the book is available here and was built with versions of R packages (and their dependent packages) given below. This may not be of importance for initial readers of this book, but the hope is you can reproduce a duplicate of this book by installing these versions of the packages. 
package * version date source assertthat 0.1 2013-12-06 CRAN (R 3.3.0) backports 1.0.4 2016-10-24 CRAN (R 3.3.0) base64enc 0.1-3 2015-07-28 CRAN (R 3.3.0) BH 1.62.0-1 2016-11-19 CRAN (R 3.3.2) bitops 1.0-6 2013-08-17 CRAN (R 3.3.0) caTools 1.17.1 2014-09-10 CRAN (R 3.3.0) colorspace 1.3-2 2016-12-14 CRAN (R 3.3.2) curl 2.3 2016-11-24 CRAN (R 3.3.2) DBI 0.5-1 2016-09-10 CRAN (R 3.3.0) dichromat 2.0-0 2013-01-24 CRAN (R 3.3.0) digest 0.6.11 2017-01-03 CRAN (R 3.3.2) dplyr * 0.5.0 2016-06-24 CRAN (R 3.3.0) dygraphs * 1.1.1.4 2017-01-04 CRAN (R 3.3.2) evaluate 0.10 2016-10-11 CRAN (R 3.3.0) ggdendro 0.1-20 2016-04-27 CRAN (R 3.3.0) ggplot2 * 2.2.1 2016-12-30 CRAN (R 3.3.2) ggplot2movies * 0.0.1 2015-08-25 CRAN (R 3.3.0) gridExtra 2.2.1 2016-02-29 CRAN (R 3.3.0) gtable 0.2.0 2016-02-26 CRAN (R 3.3.0) highr 0.6 2016-05-09 CRAN (R 3.3.0) hms 0.3 2016-11-22 CRAN (R 3.3.2) htmltools 0.3.5 2016-03-21 CRAN (R 3.3.0) htmlwidgets 0.8 2016-11-09 CRAN (R 3.3.2) jsonlite 1.2 2016-12-31 CRAN (R 3.3.2) knitr * 1.15.1 2016-11-22 CRAN (R 3.3.2) labeling 0.3 2014-08-23 CRAN (R 3.3.0) lattice * 0.20-34 2016-09-06 CRAN (R 3.3.2) latticeExtra 0.6-28 2016-02-09 CRAN (R 3.3.0) lazyeval 0.2.0 2016-06-12 CRAN (R 3.3.0) magrittr 1.5 2014-11-22 CRAN (R 3.3.0) markdown 0.7.7 2015-04-22 CRAN (R 3.3.0) MASS 7.3-45 2016-04-21 CRAN (R 3.3.2) Matrix * 1.2-7.1 2016-09-01 CRAN (R 3.3.2) mime 0.5 2016-07-07 CRAN (R 3.3.0) mosaic * 0.14.4 2016-07-29 CRAN (R 3.3.0) mosaicData * 0.14.0 2016-06-17 CRAN (R 3.3.0) munsell 0.4.3 2016-02-13 CRAN (R 3.3.0) nycflights13 * 0.2.1 2016-12-30 CRAN (R 3.3.2) okcupiddata * 0.1.0 2016-08-19 CRAN (R 3.3.0) plyr 1.8.4 2016-06-08 CRAN (R 3.3.0) R6 2.2.0 2016-10-05 CRAN (R 3.3.0) RColorBrewer 1.1-2 2014-12-07 CRAN (R 3.3.0) Rcpp 0.12.8 2016-11-17 CRAN (R 3.3.2) readr * 1.0.0 2016-08-03 CRAN (R 3.3.0) reshape2 1.4.2 2016-10-22 CRAN (R 3.3.0) rmarkdown 1.3 2016-12-21 CRAN (R 3.3.2) rprojroot 1.1 2016-10-29 CRAN (R 3.3.0) scales 0.4.1 2016-11-09 CRAN (R 3.3.2) stringi 1.1.2 
2016-10-01 CRAN (R 3.3.0) stringr 1.1.0 2016-08-19 CRAN (R 3.3.0) tibble 1.2 2016-08-26 CRAN (R 3.3.0) tidyr 0.6.0 2016-08-12 CRAN (R 3.3.0) xts 0.9-7 2014-01-02 CRAN (R 3.3.0) yaml 2.1.14 2016-11-12 CRAN (R 3.3.2) zoo 1.7-14 2016-12-16 CRAN (R 3.3.2) Book was last updated: ## [1] &quot;By Chester on Tuesday, January 10, 2017 21:08:44 EST&quot; References "],
+["2-intro.html", "2 Introduction 2.1 Preamble 2.2 Three driving data sources 2.3 Data/science pipeline 2.4 Reproducibility 2.5 Who is this book for?", " 2 Introduction 2.1 Preamble This book is inspired by three books: “Mathematical Statistics with Resampling and R” (Chihara and Hesterberg 2011), “Intro Stat with Randomization and Simulation” (Diez, Barr, and Çetinkaya-Rundel 2014), and “R for Data Science” (Grolemund and Wickham 2016). The first book, while designed for upper-level undergraduates and graduate students, provides an excellent resource on how to use resampling to build statistical concepts like normal distributions using computers instead of focusing on memorization of formulas. The last two books also provide a path towards free alternatives to the traditionally expensive introductory statistics textbook. When looking over the vast number of introductory statistics textbooks, we found that there wasn’t one that incorporated many of the new R packages directly into the text. Additionally, there wasn’t an open-source, free textbook available that showed new learners all of the following how to use R to explore and visualize data how to use randomization and simulation to build inferential ideas how to effectively create stories using these ideas to convey information to a lay audience. We will introduce sometimes difficult statistics concepts through the medium of data visualization. In today’s world, we are bombarded with graphics that attempt to convey ideas. We will explore what makes a good graphic and what the standard ways are to convey relationships with data. You’ll also see the use of visualization to introduce concepts like mean, median, standard deviation, distributions, etc. In general, we’ll use visualization as a way of building almost all of the ideas in this book. Additionally, this book will focus on the triad of computational thinking, data thinking, and inferential thinking. 
We’ll see throughout the book how these three modes of thinking can build effective ways to work with, to describe, and to convey statistical knowledge. In order to do so, you’ll see the importance of literate programming to develop literate data science. In other words, you’ll see how to write code and descriptions that are useful not just for a computer to execute but also for readers to understand exactly what a statistical analysis is doing and how it works. Hal Abelson coined the phrase that we will follow throughout this book: “Programs must be written for people to read, and only incidentally for machines to execute.” 2.2 Three driving data sources Instead of hopping from one data set to the next in the text of this book, we’ve decided to focus throughout on three different data sources: flights leaving New York City in 2013 profiles of OKCupid users in San Francisco IMDB movie ratings By focusing on just three large data sources, it is our hope that you’ll be able to see how each of the chapters is interconnected. You’ll see how the data being tidy leads into data visualization and manipulation in exploratory data analysis and how those concepts tie into inference and regression. 2.3 Data/science pipeline You may think of statistics as just being a bunch of numbers. We commonly hear the phrase “statistician” when listening to broadcasts of sporting events. Statistics (in particular, data analysis), in addition to describing numbers like with baseball batting averages, plays a vital role in all of the sciences. You’ll commonly hear the phrase “statistically significant” thrown around in the media. You’ll see things that say “Science now shows that chocolate is good for you.” Underpinning these claims is data analysis. By the end of this book, you’ll be able to better understand whether these claims should be trusted or whether we should be wary. 
Inside data analysis are many sub-fields that we will discuss throughout this book (not necessarily in this order): data collection data manipulation data visualization data modeling inference correlation and regression interpretation of results data storytelling This can be summarized in a graphic that is commonly used by Hadley Wickham: Figure 2.1: Hadley’s workflow graphic We will begin with a discussion on what is meant by tidy data and then dig into the gray Understand portion of the cycle and conclude by talking about interpreting and discussing the results of our models via Communication. These steps are vital to any statistical analysis. But why should you care about statistics? “Why did they make me take this class?” There’s a reason so many fields require a statistics course. Scientific knowledge grows through an understanding of statistical significance and data analysis. You needn’t be intimidated by statistics. It’s not the beast that it used to be and, paired with computation, you’ll see how reproducible research in the sciences particularly increases scientific knowledge. 2.4 Reproducibility “The most important tool is the mindset, when starting, that the end product will be reproducible.” – Keith Baggerly Another large goal of this book is to help readers understand the importance of reproducible analyses. The hope is to get readers into the habit of making their analyses reproducible from the very beginning. This means we’ll be trying to help you build new habits. This will take practice and be difficult at times. You’ll see just why it is so important for you to keep track of your code and well-document it to help yourself later and any potential collaborators as well. Copying and pasting results from one program into a word processor is not the way that efficient and effective scientific research is conducted. 
It’s much more important for time to be spent on data collection and data analysis and not on copying and pasting plots back and forth across a variety of programs. In a traditional analysis, if an error was made with the original data, we’d need to step through the entire process again: recreate the plots and copy and paste all of the new plots and our statistical analysis into your document. This is error prone and a frustrating use of time. We’ll see how to use R Markdown to get away from this tedious activity so that we can spend more time doing science. “We are talking about computational reproducibility.” - Yihui Xie Reproducibility means a lot of things in terms of different scientific fields. Are experiments conducted in a way that another researcher could follow the steps and get similar results? In this book, we will focus on what is known as computational reproducibility. This refers to being able to pass all of one’s data analysis, data sets, and conclusions to someone else and have them get exactly the same results on their machine. This allows for time to be spent doing actual science and interpreting of results and assumptions instead of the more error prone way of starting from scratch or following a list of steps that may be different from machine to machine. 2.5 Who is this book for? This book is targeted at students taking a traditional intro stats class in a small college environment using RStudio and preferably RStudio Server. We assume no prerequisites: no algebra, no calculus, and no prior programming experience. This is intended to be a gentle and nice introduction to the practice of statistics in terms of how data scientists, statisticians, data journalists, and other scientists analyze data and write stories about data. We have intentionally avoided the use of throwing formulas at you as much as possible and instead have focused on developing statistical concepts via data visualization and statistical computing. 
We hope this is a more intuitive experience than the way statistics has traditionally been taught in the past (and how it is commonly perceived from the outside). We additionally hope that you see the value of reproducible research via R as you continue in your studies. We understand that there will initially be growing pains in learning to program but we are here to help you and you should know that there is a huge community of R users that are always happy to help newbies along as well. Now let’s get into learning about how to create good stories about and with data! References "],
+["3-tidy.html", "3 Tidy Data 3.1 What is tidy data? 3.2 Datasets in the nycflights13 package 3.3 How is flights tidy? 3.4 Normal forms of data 3.5 What’s to come?", " 3 Tidy Data In this chapter, we’ll discuss the importance of tidy data. You may think that this means just having your data in a spreadsheet, but you’ll see that it is actually more specific than that. Data actually comes to us in a variety of formats from pictures to text to just numbers. We’ll focus on datasets that can be stored in a spreadsheet throughout this book as that is the most common way data is collected in the sciences. Having tidy data will allow us to more easily create data visualizations as we will see in Chapter 4. It will also help us with manipulating data in Chapter 5 and in all subsequent chapters when we discuss statistical inference. You may not necessarily understand the importance for tidy data immediately but it will become more and more apparent as we proceed through the book. Needed packages At the beginning of this and all subsequent chapters, we’ll always have a list of packages you should have installed and loaded. In particular we load the nycflights13 package which we’ll discuss shortly and the dplyr package for data manipulation, the subject of Chapter 5. library(nycflights13) library(dplyr) 3.1 What is tidy data? You have surely heard the word “tidy” in your life: “Tidy up your room!” “Please write your homework in a tidy way so that it is easier to grade and to provide feedback.” Marie Kondo’s best-selling book The Life-Changing Magic of Tidying Up: The Japanese Art of Decluttering and Organizing “I am not by any stretch of the imagination a tidy person, and the piles of unread books on the coffee table and by my bed have a plaintive, pleading quality to me - ‘Read me, please!’” - Linda Grant So what does it mean for your data to be tidy? Put simply, it means that your data is organized. But it’s more than just that. 
It means that your data follows the same standard format making it easy for others to find elements of your data, to manipulate and transform your data, and, for our purposes, continuing with the common theme: it makes it easier to visualize your data and the relationships between different variables in your data. We will follow Hadley Wickham’s definition of tidy data here (Wickham 2014): A dataset is a collection of values, usually either numbers (if quantitative) or strings (if qualitative). Values are organised in two ways. Every value belongs to a variable and an observation. A variable contains all values that measure the same underlying attribute (like height, temperature, duration) across units. An observation contains all values measured on the same unit (like a person, or a day, or a race) across attributes. Tidy data is a standard way of mapping the meaning of a dataset to its structure. A dataset is messy or tidy depending on how rows, columns and tables are matched up with observations, variables and types. In tidy data: Each variable forms a column. Each observation forms a row. Each type of observational unit forms a table. Figure 3.1: Tidy data graphic from http://r4ds.had.co.nz/tidy-data.html Reading over this definition, you can begin to think about datasets that won’t follow this nice format. This format of data is also known as “long” format. Learning check (LC3.1) Give an example dataset that doesn’t follow this format. What features of this dataset might make it difficult to visualize? How could the dataset be tweaked to make it tidy? (LC3.2) Say the following table are stock prices, how would you make this tidy? time x y z 2009-01-01 -1.346 -2.241 4.412 2009-01-02 -0.777 -2.111 1.202 2009-01-03 0.304 -7.305 -4.859 2009-01-04 2.510 0.213 0.720 2009-01-05 -0.484 -0.008 7.705 3.2 Datasets in the nycflights13 package We likely have all flown on airplanes or know someone that has. Air travel has become an ever-present aspect of our daily lives. 
If you live in or are visiting a relatively large city and you walk around that city’s airport, you see gates showing flight information from many different airlines. And you will frequently see that some flights are delayed because of a variety of conditions. Are there ways that we can avoid having to deal with these flight delays? We’d all like to arrive at our destinations on time whenever possible. (Unless you secretly love hanging out at airports. If you are one of these people, pretend for the moment that you are very much anticipating being at your final destination.) Throughout this book, we’re going to analyze data related to flights contained in the nycflights13 package we loaded earlier (Wickham 2016). Specifically, this package contains information about all flights that departed from NYC (e.g. EWR, JFK and LGA) in 2013 in 5 data sets: flights: information on all 336,776 flights weather: hourly meteorological data for each airport planes: construction information about each plane airports: airport names and locations airlines: translation between two letter carrier codes and names We will begin by loading in the flights dataset and getting an idea of its structure. Run the following in your console data(flights) This line of code loads in the flights dataset that is stored in the nycflights13 package. This dataset and most others presented in this book will be in the “data frame” format in R. Data frames are essentially spreadsheets and allow us to look at collections of variables that are tightly coupled together. The best way to get a feel for a data frame is to use the View function in RStudio. This command will be given throughout the book as a reminder, but the actual output will be hidden. Run View(flights) in R and look over this data frame. You should slowly get into the habit of always Viewing any data frames that come your way. Learning check (LC3.3) What does any ONE row in this flights dataset refer to? A. Data on an airline B. 
Data on a flight C. Data on an airport D. Data on multiple flights By running View(flights), we see the different variables listed in the columns and we see that there are different types of variables. Some of the variables like distance, day, and arr_delay are what we will call quantitative variables. These variables vary in a numerical way. Other variables here are categorical. Note that if you look in the leftmost column of the View(flights) output, you will see a column of numbers. These are the row numbers of the dataset. If you glance across a row with the same number, say row 5, you can get an idea of what each row corresponds to. In other words, this will allow you to identify what object is being referred to in a given row. This is often called the observational unit. The observational unit in this example is an individual flight departing New York City in 2013. You can identify the observational unit by determining what the thing is that is being measured in each of the variables. Note: Frequently the first thing you should do when given a dataset is to identify the observation unit, specify the variables, and give the types of variables you are presented with. The glimpse() command in the dplyr package provides us with much of the above information and more: glimpse(flights) ## Observations: 336,776 ## Variables: 19 ## $ year &lt;int&gt; 2013, 2013, 2013, 2013, 2013, 2013, 2013, 20... ## $ month &lt;int&gt; 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,... ## $ day &lt;int&gt; 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,... ## $ dep_time &lt;int&gt; 517, 533, 542, 544, 554, 554, 555, 557, 557,... ## $ sched_dep_time &lt;int&gt; 515, 529, 540, 545, 600, 558, 600, 600, 600,... ## $ dep_delay &lt;dbl&gt; 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2,... ## $ arr_time &lt;int&gt; 830, 850, 923, 1004, 812, 740, 913, 709, 838... ## $ sched_arr_time &lt;int&gt; 819, 830, 850, 1022, 837, 728, 854, 723, 846... 
## $ arr_delay &lt;dbl&gt; 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2... ## $ carrier &lt;chr&gt; &quot;UA&quot;, &quot;UA&quot;, &quot;AA&quot;, &quot;B6&quot;, &quot;DL&quot;, &quot;UA&quot;, &quot;B6&quot;, &quot;E... ## $ flight &lt;int&gt; 1545, 1714, 1141, 725, 461, 1696, 507, 5708,... ## $ tailnum &lt;chr&gt; &quot;N14228&quot;, &quot;N24211&quot;, &quot;N619AA&quot;, &quot;N804JB&quot;, &quot;N66... ## $ origin &lt;chr&gt; &quot;EWR&quot;, &quot;LGA&quot;, &quot;JFK&quot;, &quot;JFK&quot;, &quot;LGA&quot;, &quot;EWR&quot;, &quot;E... ## $ dest &lt;chr&gt; &quot;IAH&quot;, &quot;IAH&quot;, &quot;MIA&quot;, &quot;BQN&quot;, &quot;ATL&quot;, &quot;ORD&quot;, &quot;F... ## $ air_time &lt;dbl&gt; 227, 227, 160, 183, 116, 150, 158, 53, 140, ... ## $ distance &lt;dbl&gt; 1400, 1416, 1089, 1576, 762, 719, 1065, 229,... ## $ hour &lt;dbl&gt; 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6,... ## $ minute &lt;dbl&gt; 15, 29, 40, 45, 0, 58, 0, 0, 0, 0, 0, 0, 0, ... ## $ time_hour &lt;dttm&gt; 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2... Learning check (LC3.4) What are some examples in this dataset of categorical variables? What makes them different than quantitative variables? (LC3.5) What does int, num, and chr mean in the output above? (LC3.6) How many different columns are in this dataset? (LC3.7) How many different rows are in this dataset? We see that glimpse will give you the first few entries of each variable in a row after the variable. In addition, the type of the variable is given immediately after each variable’s name inside &lt; &gt;. Here, int and num refer to quantitative variables. In contrast, chr refers to categorical variables. One more type of variable is given here with the time_hour variable: dttm. As you may suspect, this variable corresponds to a specific date and time of day. Another nice feature of R is the help system. 
You can get help in R by simply entering a question mark before the name of a function or an object and you will be presented with a page showing the documentation. Note that this output help file is omitted here but can be accessed here on page 3 of the PDF document. ?glimpse ?flights Another aspect of tidy data is a description of what each variable in the dataset represents. This helps others to understand what your variable names mean and what they correspond to. If we look at the output of ?flights, we can see that a description of each variable by name is given. An important feature to ALWAYS include with your data is the appropriate units of measurement. We’ll see this further when we work with the dep_delay variable in Chapter 4. (It’s in minutes, but you’d get some really strange interpretations if you thought it was in hours or seconds. UNITS MATTER!) 3.3 How is flights tidy? We see that flights has a rectangular shape with each row corresponding to a different flight and each column corresponding to a characteristic of that flight. This matches exactly with how Hadley Wickham defined tidy data: Each variable forms a column. Each observation forms a row. But what about the third property? Each type of observational unit forms a table. We identified earlier that the observational unit in the flights dataset is an individual flight. And we have shown that this dataset consists of 336,776 flights with 19 variables. In other words, some rows of this dataset don’t refer to a measurement on an airline or on an airport. They specifically refer to characteristics/measurements on a given flight from New York City in 2013. 
By contrast, also included in the nycflights13 package are datasets with different observational units (Wickham 2016): weather: hourly meteorological data for each airport planes: construction information about each plane airports: airport names and locations airlines: translation between two letter carrier codes and names You may have been asking yourself what carrier refers to in the glimpse(flights) output above. The airlines dataset provides a description of this with each airline being the observational unit: data(airlines) airlines ## # A tibble: 16 × 2 ## carrier name ## &lt;chr&gt; &lt;chr&gt; ## 1 9E Endeavor Air Inc. ## 2 AA American Airlines Inc. ## 3 AS Alaska Airlines Inc. ## 4 B6 JetBlue Airways ## 5 DL Delta Air Lines Inc. ## 6 EV ExpressJet Airlines Inc. ## 7 F9 Frontier Airlines Inc. ## 8 FL AirTran Airways Corporation ## 9 HA Hawaiian Airlines Inc. ## 10 MQ Envoy Air ## 11 OO SkyWest Airlines Inc. ## 12 UA United Air Lines Inc. ## 13 US US Airways Inc. ## 14 VX Virgin America ## 15 WN Southwest Airlines Co. ## 16 YV Mesa Airlines Inc. As can be seen here when you just enter the name of an object in R, by default it will print the contents of that object to the screen. Be careful! It’s usually better to use the View() function in RStudio since larger objects may take awhile to print to the screen and it likely won’t be helpful to you to have hundreds of lines outputted. Learning check (LC3.8) Run the following block of code in R to load and view each of the four data frames in the nycflights13 package. Switch between the different tabs that have opened to view each of the four data frames. Describe in two sentences for each data frame what stands out to you and what the most important features are of each. data(weather) data(planes) data(airports) data(airlines) View(weather) View(planes) View(airports) View(airlines) 3.3.1 Identification variables There is a subtle difference between the kinds of variables that you will encounter in data frames. 
The airports data frame you worked with above contains data in these different kinds. Let’s pull them apart using the glimpse function: glimpse(airports) ## Observations: 1,458 ## Variables: 8 ## $ faa &lt;chr&gt; &quot;04G&quot;, &quot;06A&quot;, &quot;06C&quot;, &quot;06N&quot;, &quot;09J&quot;, &quot;0A9&quot;, &quot;0G6&quot;, &quot;0G7... ## $ name &lt;chr&gt; &quot;Lansdowne Airport&quot;, &quot;Moton Field Municipal Airport&quot;,... ## $ lat &lt;dbl&gt; 41.13, 32.46, 41.99, 41.43, 31.07, 36.37, 41.47, 42.8... ## $ lon &lt;dbl&gt; -80.62, -85.68, -88.10, -74.39, -81.43, -82.17, -84.5... ## $ alt &lt;int&gt; 1044, 264, 801, 523, 11, 1593, 730, 492, 1000, 108, 4... ## $ tz &lt;dbl&gt; -5, -6, -6, -5, -5, -5, -5, -5, -5, -8, -5, -6, -5, -... ## $ dst &lt;chr&gt; &quot;A&quot;, &quot;A&quot;, &quot;A&quot;, &quot;A&quot;, &quot;A&quot;, &quot;A&quot;, &quot;A&quot;, &quot;A&quot;, &quot;U&quot;, &quot;A&quot;, &quot;A&quot;... ## $ tzone &lt;chr&gt; &quot;America/New_York&quot;, &quot;America/Chicago&quot;, &quot;America/Chica... The variables faa and name are what we will call identification variables. They are mainly used to provide a name to the observational unit. Here the observational unit is an airport and the faa gives the code provided by the FAA for that airport while the name variable gives the longer more natural name of the airport. These ID variables differ from the other variables that are often called measurement or characteristic variables. The remaining variables (aside from faa and name) are of this type in airports. They don’t uniquely identify the observational unit, but instead describe properties of the observational unit. For organizational purposes, it is best practice to have your identification variables in the far leftmost columns of your data frame. Learning check (LC3.9) What properties of the observational unit do each of lat, lon, alt, tz, dst, and tzone describe for the airports data frame? 
(LC3.10) Provide the names of variables in a data frame with at least three variables in which one of them is an identification variable and the other two are not. 3.4 Normal forms of data The datasets included in the nycflights13 package are in a form that minimizes redundancy of data. We will see that there are ways to merge (or join) the different tables together easily. We are capable of doing so because each of the tables have keys in common to relate one to another. This is an important property of normal forms of data. The process of decomposing data frames into less redundant tables without losing information is called normalization. More information is available on Wikipedia. We saw an example of this above with the airlines dataset. While the flights data frame could also include a column with the names of the airlines instead of the carrier code, this would be repetitive since there is a unique mapping of the carrier code to the name of the airline/carrier. Below an example is given showing how to join the airlines data frame together with the flights data frame by linking together the two datasets via a common key of &quot;carrier&quot;. Note that this “joined” data frame is assigned to a new data frame called joined_flights. The key variable that we frequently join by is one of the identification variables mentioned above. library(dplyr) joined_flights &lt;- inner_join(x = flights, y = airlines, by = &quot;carrier&quot;) View(joined_flights) If we View this dataset, we see a new variable has been created called name. (We will see in Subsection 5.4.2 ways to change name to a more descriptive variable name.) More discussion about joining data frames together will be given in Chapter 5. We will see there that the names of the columns to be linked need not match as they did here with &quot;carrier&quot;. Review questions (RQ3.1) What are common characteristics of “tidy” datasets? (RQ3.2) What makes “tidy” datasets useful for organizing data? 
(RQ3.3) How many variables are presented in the table below? What does each row correspond to? (Hint: You may not be able to answer both of these questions immediately but take your best guess.) students faculty 4 2 6 3 (RQ3.4) The confusion you may have encountered in Question 4 is a common one those that work with data are commonly presented with. This dataset is not tidy. Actually, the dataset in Question 4 has three variables not the two that were presented. Make a guess as to what these variables are and present a tidy dataset instead of this untidy one given in Question 4. (RQ3.5) The actual data presented in Question 4 is given below in tidy data format: role Sociology? Type of School student TRUE Public student TRUE Public student TRUE Public student TRUE Public student FALSE Public student FALSE Public student FALSE Private student FALSE Private student FALSE Private student FALSE Private faculty TRUE Public faculty TRUE Public faculty FALSE Public faculty FALSE Private faculty FALSE Private What does each row correspond to? What are the different variables in this data frame? The Sociology? variable is known as a logical variable. What types of values does a logical variable take on? (RQ3.6) What are some advantages of data in normal forms? What are some disadvantages? 3.5 What’s to come? In Chapter 4, we will further explore the distribution of a variable in a related dataset to flights: the temp variable in the weather dataset. We’ll be interested in understanding how this variable varies in relation to the values of other variables in the dataset. We will see that visualization is often a powerful tool in helping us see what is going on in a dataset. It will be a useful way to expand on the glimpse function we have seen here for tidy data. References "],
+["4-viz.html", "4 Data Visualization via ggplot2 4.1 The Grammar of Graphics 4.2 Five Named Graphs - The 5NG 4.3 5NG#1: Scatter-plots 4.4 5NG#2: Line-graphs 4.5 5NG#3: Histograms 4.6 Facets 4.7 5NG#4: Boxplots 4.8 5NG#5: Barplots 4.9 Conclusion", " 4 Data Visualization via ggplot2 In Chapter 3, we discussed the importance of datasets being tidy. You will see in examples here why having a tidy dataset helps us immensely when plotting our data. In plotting our data, we will be able to gain valuable insights from our data that we couldn’t initially see from just looking at the raw data. We will focus on using Hadley Wickham’s ggplot2 package in doing so, which was developed to work specifically on datasets that are tidy. It provides an easy way to customize your plots and is based on data visualization theory given in The Grammar of Graphics (Wilkinson 2005). At the most basic level, graphics/plots/charts provide a nice way for us to get a sense for how quantitative variables compare in terms of their center and their spread. The most important thing to know about graphics is that they should be created to make it obvious for your audience to see the findings you want to get across. This requires a balance of not including too much in your plots, but also including enough so that relationships and interesting findings can be easily seen. As we will see, plots/graphics also help us to identify patterns and outliers in our data. We will see that a common extension of these ideas is to compare the distribution of one quantitative variable (i.e., what the spread of a variable looks like or how the variable is distributed in terms of its values) as we go across the levels of a different categorical variable. Needed packages Before we proceed with this chapter, let’s load all the necessary packages. 
library(ggplot2) library(nycflights13) library(knitr) library(dplyr) 4.1 The Grammar of Graphics We begin with a discussion of a theoretical framework for data visualization known as the “The Grammar of Graphics,” which serves as the basis for the ggplot2 package. Much like the way we construct sentences in any language using a linguistic grammar (nouns, verbs, subjects, objects, etc.), the theoretical framework given by Leland Wilkinson (Wilkinson 2005) allows us to specify the components of a statistical graphic. 4.1.1 Components of Grammar In short, the grammar tells us that: A statistical graphic is a mapping of data variables to aesthetic attributes of geometric objects. Specifically, we can break a graphic into the following three essential components: data: the data set comprised of variables that we map. geom: the geometric object in question. This refers to our type of objects we can observe in our plot. For example, points, lines, bars, etc. aes: aesthetic attributes of the geometric object that we can perceive on a graphic. For example, x/y position, color, shape, and size. Each assigned aesthetic attribute can be mapped to a variable in our data set. If not assigned, they are set to defaults. 4.1.2 Napoleon’s March on Moscow In 1812, Napoleon led a French invasion of Russia, marching on Moscow. It was one of the biggest military disasters due in large part to the Russian winter. In 1869, a French civil engineer named Charles Joseph Minard published arguably one of the greatest statistical visualizations of all-time, which summarized this march: Figure 4.1: Minard’s Visualization of Napoleon’s March This was considered a revolution in statistical graphics because between the map on top and the line graph on the bottom, there are 6 dimensions of information (i.e. variables) being displayed on a 2-dimensional page. 
Let’s view this graphic through the lens of the Grammar of Graphics: Table 4.1: Grammar of Map (Top) and Line-Graph (Bottom) in Minard’s Graphic of Napoleon’s March data aes geom longitude x point latitude y point army size size path army direction color path data aes geom date x line &amp; text temperature y line &amp; text For example, the data variable longitude gets mapped to the x aesthetic of the points geometric objects on the map while the annotated line-graph displays date and temperature variable information via its mapping to the x and y aesthetic of the line geometric object. 4.1.3 Other Components of the Grammar There are other components of the Grammar of Graphics we can control: facet: how to break up a plot into subsets statistical transformations: this includes smoothing, binning values into a histogram, or just itself un-transformed as &quot;identity&quot;. scales both convert data units to physical units the computer can display draw a legend and/or axes, which provide an inverse mapping to make it possible to read the original data values from the graph. coordinate system for x/y values: typically cartesian, but can also be polar or map position adjustments In this text, we will only focus on the first two: faceting (introduced in Section 4.6) and statistical transformations (in a limited sense, when we consider Barplots in Section 4.8); the other components are left to a more advanced text. This is not a problem when producing a plot as each of these components has default settings. There are other extra attributes that can be tweaked as well including the plot title, axes labels, and over-arching themes for the plot. In general, the Grammar of Graphics allows for customization but also a consistent framework that allows the user to easily tweak their creations as needed in order to convey a message about their data. 
4.1.4 The ggplot2 Package We next introduce Hadley Wickham’s ggplot2 package, which is an implementation of the Grammar of Graphics for R (Wickham and Chang 2016). You may have noticed that a lot of previous text in this chapter is written in computer font. This is because the various components of the Grammar of Graphics are specified using the ggplot function, which expects at a bare minimum as arguments the data frame where the variables exist (the data argument) and the names of the variables to be plotted (the mapping argument). The names of the variables will be entered into the aes function as arguments where aes stands for “aesthetics”. Review questions **`paste0(\"(RQ\", chap, \".\", (rq 4.2 Five Named Graphs - The 5NG For our purposes, we will be limiting consideration to five different types of graphs (note that in this text we use the terms “graphs”, “plots”, and “charts” interchangeably). We term these five named graphs the 5NG: scatter-plots line-graphs boxplots histograms barplots With this repertoire of plots, you can visualize a wide array of data variables thrown at you. We will discuss some variations of these, but with the 5NG in your toolbox you can do big things! Something we will also stress here is that certain plots only work for categorical/logical variables and others only for quantitative variables. You’ll want to quiz yourself often as we go along on which plot makes sense given a particular problem set-up. 4.3 5NG#1: Scatter-plots The simplest of the 5NG are scatter-plots (also called bivariate plots); they allow you to investigate the relationship between two continuous variables. While you may already be familiar with such plots, let’s view it through the lens of the Grammar of Graphics. 
Specifically, we will graphically investigate the relationship between the following two continuous variables in the flights data frame: dep_delay: departure delay on the horizontal “x” axis and arr_delay: arrival delay on the vertical “y” axis for Alaska Airlines flights leaving NYC in 2013. This requires paring down the flights data frame to a smaller data frame all_alaska_flights consisting of only Alaska Airlines (carrier code “AS”) flights. data(flights) all_alaska_flights &lt;- flights %&gt;% filter(carrier == &quot;AS&quot;) This code snippet makes use of functions in the dplyr package for data manipulation to achieve our goal: it takes the flights data frame and filters it to only return the rows which meet the condition carrier == &quot;AS&quot; (recall equality is specified with == and not =). You will see many more examples using this function in Chapter 5. Learning check (LC4.1) Take a look at both the flights and all_alaska_flights data frames by running View(flights) and View(all_alaska_flights) in the console. In what respect do these data frames differ? 4.3.1 Scatter-plots via geom_point We proceed to create the scatter-plot using the ggplot() function: ggplot(data = all_alaska_flights, aes(x = dep_delay, y = arr_delay)) + geom_point() Figure 4.2: Arrival Delays vs Departure Delays for Alaska Airlines flights from NYC in 2013 You are encouraged to enter Return on your keyboard after entering the +. As we add more and more elements, it will be nice to keep them indented as you see below. Note that this will not work if you begin the line with the +. Let’s break down this keeping in mind our discussion in Section 4.1: Within the ggplot() function call, we specify two of the components of the grammar: The data frame to be all_alaska_flights by setting data = all_alaska_flights The aesthetic mapping by setting aes(x = dep_delay, y = arr_delay). 
Specifically dep_delay maps to the x position arr_delay maps to the y position We add a layer to the ggplot() function call using the + sign The layer in question specifies the third component of the grammar: the geometric object in question. In this case the geometric object are points, set by specifying geom_point() In Figure 4.2 we see that a positive relationship exists between dep_delay and arr_delay: as departure delays increase, arrival delays tend to also increase. We also note that the majority of points fall near the point (0, 0). There is a large mass of points clustered there. (We will work more with this data set in Chapter 9, where we investigate correlation and linear regression.) Learning check (LC4.2) What are some practical reasons why dep_delay and arr_delay have a positive relationship? (LC4.3) What variables (not necessarily in the flights data frame) would you expect to have a negative correlation (i.e. a negative relationship) with dep_delay? Why? Remember that we are focusing on continuous variables here. (LC4.4) Why do you believe there is a cluster of points near (0, 0)? What does (0, 0) correspond to in terms of the Alaskan flights? (LC4.5) What are some other features of the plot that stand out to you? (LC4.6) Create a new scatter-plot using different variables in the all_alaska_flights data frame by modifying the example above. 4.3.2 Over-Plotting The large mass of points near (0, 0) can cause some confusion. This is the result of a phenomenon called over-plotting. As one may guess, this corresponds to values being plotted on top of each other over and over again. It is often difficult to know just how many values are plotted in this way when looking at a basic scatter-plot as we have here. 
There are two ways to address this issue: By adjusting the transparency of the points via the alpha argument By jittering the points via geom_jitter() The first way of relieving over-plotting is by changing the alpha argument to geom_point() which controls the transparency of the points. By default, this value is set to 1. We can change this value to a smaller fraction (greater than 0) to change the transparency of the points in the plot: ggplot(data = all_alaska_flights, aes(x = dep_delay, y = arr_delay)) + geom_point(alpha = 0.2) Figure 4.3: Delay scatterplot with alpha=0.2 Note how this function call is identical to the one in Section 4.3.1, but with alpha = 0.2 added as an argument to geom_point(). The second way of relieving over-plotting is to jitter the points a bit. In other words, we are going to add just a bit of random noise to the points to better see them and remove some of the over-plotting. You can think of “jittering” as shaking the points a bit on the plot. Instead of using geom_point, we use geom_jitter to perform this shaking and specify around how much jitter to add with the width and height arguments. This corresponds to how hard you’d like to shake the plot in units corresponding to those for both the horizontal and vertical variables (in this case minutes). ggplot(data = all_alaska_flights, aes(x = dep_delay, y = arr_delay)) + geom_jitter(width = 30, height = 30) Figure 4.4: Jittered delay scatterplot Note how this function call is identical to the one in Section 4.3.1, but with geom_point() replaced with geom_jitter(). The plot in Figure 4.4 helps us a little bit in getting a sense for the over-plotting, but with a relatively large dataset like this one (714 flights), it can be argued that changing the transparency of the points by setting alpha proved more effective. Learning check (LC4.7) Why is setting the alpha argument value useful with scatter-plots? What further information does it give you that a regular scatter-plot cannot? 
(LC4.8) After viewing the Figure 4.3 above, give a range of arrival times and departure times that occur most frequently? How has that region changed compared to when you observed the same plot without the alpha = 0.2 set in Figure 4.2? 4.3.3 Summary Scatter-plots display the relationship between two continuous variables and may be the most used plot today as they can provide an immediate way to see the trend in one variable versus another. If you try to create a scatter-plot where either one of the two variables is not quantitative however, you will get strange results. Be careful! With medium to large datasets, you may need to play with either geom_jitter or the alpha argument in order to get a good feel for relationships in your data. This tweaking is often a fun part of data visualization since you’ll have the chance to see different relationships come about as you make subtle changes to your plots. 4.4 5NG#2: Line-graphs The next of the 5NG is a line-graph. They are most frequently used when the x-axis represents time and the y-axis represents some other numerical variable; such plots are known as time series. Time represents a variable that is connected together by each day following the previous day. In other words, time has a natural ordering. Line-graphs should be avoided when there is not a clear sequential ordering to the explanatory variable, i.e. the x-variable or the predictor variable. Our focus turns to the temp variable in this weather dataset. By Looking over the weather dataset by typing View(weather) in the console. Running ?weather to bring up the help file. We can see that the temp variable corresponds to hourly temperature (in Fahrenheit) recordings at weather stations near airports in New York City. Instead of considering all hours in 2013 for all three airports in NYC, let’s focus on the hourly temperature at Newark airport (origin code “EWR”) for the first 15 days in January 2013. 
The weather data frame in the nycflights13 package contains this data, but we first need to filter it to only include those rows that correspond to Newark in the first 15 days of January. data(weather) early_january_weather &lt;- weather %&gt;% filter(origin == &quot;EWR&quot; &amp; month == 1 &amp; day &lt;= 15) This is similar to the previous use of the filter command in Section 4.3, however we now use the &amp; operator. The above selects only those rows in weather where origin == &quot;EWR&quot; and month == 1 and day &lt;= 15. Learning check (LC4.9) Take a look at both the weather and early_january_weather data frames by running View(weather) and View(early_january_weather) in the console. In what respect do these data frames differ? (LC4.10) The weather data is recorded hourly. Why does the time_hour variable correctly identify the hour of the measurement whereas the hour variable does not? 4.4.1 Line-graphs via geom_line We plot a line-graph of hourly temperature using geom_line(): ggplot(data = early_january_weather, aes(x = time_hour, y = temp)) + geom_line() Figure 4.5: Hourly Temperature in Newark for Jan 1-15 2013 Much as with the ggplot() call in Section 4.3.1, we specify the components of the Grammar of Graphics: Within the ggplot() function call, we specify two of the components of the grammar: The data frame to be early_january_weather by setting data = early_january_weather The aesthetic mapping by setting aes(x = time_hour, y = temp). Specifically time_hour (i.e. the time variable) maps to the x position temp maps to the y position We add a layer to the ggplot() function call using the + sign The layer in question specifies the third component of the grammar: the geometric object in question. In this case the geometric object is a line, set by specifying geom_line() Learning check (LC4.11) Why should line-graphs be avoided when there is not a clear ordering of the horizontal axis? 
(LC4.12) Why are line-graphs frequently used when time is the explanatory variable? (LC4.13) Plot a time series of a variable other than temp for Newark Airport in the first 15 days of January 2013. 4.4.2 Summary Line-graphs, just like scatter-plots, display the relationship between two continuous variables. However the variable on the x-axis (i.e. the explanatory variable) should have a natural ordering, like some notion of time. We can mislead our audience if that isn’t the case. 4.5 5NG#3: Histograms Let’s consider the temp variable in the weather data frame once again, but now unlike with the line-graphs in Section 4.4, let’s say we don’t care about the relationship of temperature to time, but rather we care about the (statistical) distribution of temperatures. We could just produce points where each of the different values appear on something similar to a number line: Figure 4.6: Strip Plot of Hourly Temperature Recordings from NYC in 2013 This gives us a general idea of how the values of temp differ. We see that temperatures vary from around 11 up to 100 degrees Fahrenheit. The area between 40 and 60 degrees appears to have more points plotted than outside that range. 4.5.1 Histograms via geom_histogram What is commonly produced instead of this strip plot is a plot known as a histogram. The histogram shows how many elements of a single numerical variable fall in specified bins. In this case, these bins may correspond to between 0-10°F, 10-20°F, etc. We produce a histogram of the hourly temperatures at all three NYC airports in 2013: ggplot(data = weather, mapping = aes(x = temp)) + geom_histogram() ## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`. ## Warning: Removed 1 rows containing non-finite values (stat_bin). Figure 4.7: Histogram of Hourly Temperature Recordings from NYC in 2013 Note here: There is only one variable being mapped in aes(): the single continuous variable temp. 
You don’t need to compute the y-aesthetic: it gets computed automatically. We set the geometric object to be geom_histogram() We got a warning message of 1 rows containing non-finite values being removed. This is due to one of the values of temperature being missing. R is alerting us that this happened. 4.5.2 Adjusting the Bins We can adjust the number/size of the bins two ways: By adjusting the number of bins via the bins argument By adjusting the width of the bins via the binwidth argument First, we have the power to specify how many bins we would like to put the data into as an argument in the geom_histogram function. By default, this is chosen to be 30 somewhat arbitrarily; we have received a warning above our plot that this was done. ggplot(data = weather, mapping = aes(x = temp)) + geom_histogram(bins = 60, color = &quot;white&quot;) Figure 4.8: Histogram of Hourly Temperature Recordings from NYC in 2013 - 60 Bins Note the addition of the color argument. If you’d like to be able to more easily differentiate each of the bins, you can specify the color of the outline as done above. Second, instead of specifying the number of bins, we can also specify the width of the bins by using the binwidth argument in the geom_histogram function. ggplot(data = weather, mapping = aes(x = temp)) + geom_histogram(binwidth = 10, color = &quot;white&quot;) Figure 4.9: Histogram of Hourly Temperature Recordings from NYC in 2013 - Binwidth = 10 Learning check (LC4.14) What does changing the number of bins from 30 to 60 tell us about the distribution of temperatures? (LC4.15) Would you classify the distribution of temperatures as symmetric or skewed? (LC4.16) What would you guess is the “center” value in this distribution? Why did you make that choice? (LC4.17) Is this data spread out greatly from the center or is it close? Why? 4.5.3 Summary Histograms, unlike scatter-plots and line-graphs, present information on only a single continuous variable. 
In particular they are visualizations of the (statistical) distribution of values. 4.6 Facets Before continuing the 5NG, we briefly introduce a new concept called faceting. Faceting is used when we’d like to create small multiples of the same plot over a different categorical variable. By default, all of the small multiples will have the same vertical axis. For example, suppose we were interested in looking at how the temperature histograms we saw in Section 4.5 varied by month. This is what is meant by “the distribution of a variable over another variable”: temp is one variable and month is the other variable. In order to look at histograms of temp for each month, we add a layer facet_wrap(~month). You can also specify how many rows you’d like the small multiple plots to be in using nrow inside of facet_wrap. ggplot(data = weather, aes(x = temp)) + geom_histogram(binwidth = 5, color = &quot;white&quot;) + facet_wrap(~ month, nrow = 4) Figure 4.10: Faceted histogram As we might expect, the temperature tends to increase as summer approaches and then decrease as winter approaches. Learning check (LC4.18) What other things do you notice about the faceted plot above? How does a faceted plot help us see relationships between two variables? (LC4.19) What do the numbers 1-12 correspond to in the plot above? What about 25, 50, 75, 100? (LC4.20) For which types of datasets would these types of faceted plots not work well in comparing relationships between variables? Give an example describing the variability of the variables and other important characteristics. (LC4.21) Does the temp variable in the weather data set have a lot of variability? Why do you say that? 4.7 5NG#4: Boxplots While using faceted histograms can provide a way to compare distributions of a continuous variable split by groups of a categorical variable as in Section 4.6, an alternative plot called a boxplot (also called a side-by-side boxplot) achieves the same task and is frequently preferred. 
The boxplot uses the information provided in the five-number summary referred to in Appendix A. It gives a way to compare this summary information across the different levels of a categorical variable. 4.7.1 Boxplots via geom_boxplot Let’s create a boxplot to compare the monthly temperatures as we did above with the faceted histograms. ggplot(data = weather, aes(x = month, y = temp)) + geom_boxplot() Figure 4.11: Invalid boxplot specification Note the first warning that is given here. (The second one corresponds to missing values in the data frame and it is turned off on subsequent plots.) Observe that this plot does not look like what we were expecting. We were expecting to see the distribution of temperatures for each month (so 12 different boxplots). This gives us the overall boxplot without any other groupings. We can get around this by introducing a new function for our x variable: ggplot(data = weather, mapping = aes(x = factor(month), y = temp)) + geom_boxplot() Figure 4.12: Month by temp boxplot We have introduced a new function called factor() here. One of the things this function does is to convert a discrete value like month (1, 2, …, 12) into a categorical variable. The “box” part of this plot represents the 25th percentile, the median (50th percentile), and the 75th percentile. The dots correspond to outliers. (The specific formulation for these outliers is discussed in Appendix A.) The lines show how the data varies that is not in the center 50% defined by the first and third quartiles. Longer lines correspond to more variability and shorter lines correspond to less variability. Learning check (LC4.22) What does the dot at the bottom of the plot for May correspond to? Explain what might have occurred in May to produce this point. (LC4.23) Which months have the highest variability in temperature? What do you think are the reasons for this? (LC4.24) We looked at the distribution of a continuous variable over a categorical variable here with this boxplot. 
Why can’t we look at the distribution of one continuous variable over the distribution of another continuous variable? Say, temperature across pressure, for example? (LC4.25) Boxplots provide a simple way to identify outliers. Why may outliers be easier to identify when looking at a boxplot instead of a faceted histogram? 4.7.2 Summary Boxplots provide a way to compare and contrast the distribution of one quantitative variable across multiple levels of one categorical variable. One can easily look to see where the median falls across the different groups by looking at the center line in the box. You can also see how spread out the variable is across the different groups by looking at the width of the box and also how far out the lines stretch from the box. If the lines stretch far from the box but the box has a small width, the variability of the values closer to the center is much smaller than the variability of the outer ends of the variable. Lastly, outliers are even more easily identified when looking at a boxplot than when looking at a histogram. 4.8 5NG#5: Barplots Both histograms and boxplots represent ways to visualize the variability of continuous variables. Another common task is to present the distribution of a categorical variable. This is a simpler task since we will be interested in how many elements from our data fall into the different categories of the categorical variable. 4.8.1 Barplots via geom_bar Frequently, the best way to visualize these different counts (also known as frequencies) is via a barplot. Consider the distribution of airlines that flew out of New York City in 2013. Here we explore the number of flights from each airline/carrier. 
This can be plotted by invoking the geom_bar function in ggplot2: ggplot(data = flights, mapping = aes(x = carrier)) + geom_bar() Figure 4.13: Number of flights departing NYC in 2013 by airline To get an understanding of what the names of these airlines are corresponding to these carrier codes, we can look at the airlines data frame in the nycflights13 package. Note the use of the kable function here in the knitr package, which produces a nicely-formatted table of the values in the airlines data frame. data(airlines) kable(airlines) carrier name 9E Endeavor Air Inc. AA American Airlines Inc. AS Alaska Airlines Inc. B6 JetBlue Airways DL Delta Air Lines Inc. EV ExpressJet Airlines Inc. F9 Frontier Airlines Inc. FL AirTran Airways Corporation HA Hawaiian Airlines Inc. MQ Envoy Air OO SkyWest Airlines Inc. UA United Air Lines Inc. US US Airways Inc. VX Virgin America WN Southwest Airlines Co. YV Mesa Airlines Inc. Going back to our barplot, we see that United Air Lines, JetBlue Airways, and ExpressJet Airlines had the most flights depart New York City in 2013. To get the actual number of flights by each airline we can use the count function in the dplyr package on the carrier variable in flights, which we will introduce formally in Chapter 5. flights_table &lt;- flights %&gt;% dplyr::count(carrier) knitr::kable(flights_table) carrier n 9E 18460 AA 32729 AS 714 B6 54635 DL 48110 EV 54173 F9 685 FL 3260 HA 342 MQ 26397 OO 32 UA 58665 US 20536 VX 5162 WN 12275 YV 601 Technical note: Refer to the use of :: in both lines of code above. This is another way of ensuring the correct function is called. A count exists in a couple different packages and sometimes you’ll receive strange errors when a different instance of a function is used. This is a great way of telling R that “I want this one!”. You specify the name of the package directly before the :: and then the name of the function immediately after ::. 
Learning check (LC4.26) Why are histograms inappropriate for visualizing categorical variables? (LC4.27) What is the difference between histograms and barplots? (LC4.28) How many Envoy Air flights departed NYC in 2013? (LC4.29) What was the seventh highest airline in terms of departed flights from NYC in 2013? How could we better present the table to get this answer quickly? 4.8.2 Must avoid pie charts! Unfortunately, one of the most common plots seen today for categorical data is the pie chart. While they may seem harmless enough, they actually present a problem in that humans are unable to judge angles well. As Naomi Robbins describes in her book “Creating More Effective Graphs” (Robbins 2013), we overestimate angles greater than 90 degrees and we underestimate angles less than 90 degrees. In other words, it is difficult for us to determine relative size of one piece of the pie compared to another. Let’s examine our previous barplot example on the number of flights departing NYC by airline. This time we will use a pie chart. As you review this chart, try to identify how much larger the portion of the pie is for ExpressJet Airlines (EV) compared to US Airways (US), what the third largest carrier is in terms of departing flights, and how many carriers have fewer flights than United Airlines (UA). Figure 4.14: The dreaded pie chart While it is quite easy to look back at the barplot to get the answer to these questions, it’s quite difficult to get the answers correct when looking at the pie graph. Barplots can always present the information in a way that is easier for the eye to determine relative position. There may be one exception from Nathan Yau at FlowingData.com but we will leave this for the reader to decide: Figure 4.15: The only good pie chart Learning check (LC4.30) Why should pie charts be avoided and replaced by barplots? (LC4.31) What is your opinion as to why pie charts continue to be used? 
4.8.3 Using barplots to compare two variables Barplots are the go-to way to visualize the frequency of different categories of a categorical variable. They make it easy to order the counts and to compare one group’s frequency to another. Another use of barplots (unfortunately, sometimes inappropriately and confusingly) is to compare two categorical variables together. Let’s examine the distribution of outgoing flights from NYC by carrier and airport. We begin by getting the names of the airports in NYC that were included in the flights dataset. Remember from Chapter 3 that this can be done by using the inner_join function (more in Chapter 5). flights_namedports &lt;- flights %&gt;% inner_join(airports, by = c(&quot;origin&quot; = &quot;faa&quot;)) After running View(flights_namedports), we see that name now corresponds to the name of the airport as referenced by the origin variable. We will now plot carrier as the horizontal variable. When we specify geom_bar, it will specify count as being the vertical variable. A new addition here is fill = name. Look over what was produced from the plot to get an idea of what this argument gives. Note that fill is an aesthetic just like x is an aesthetic. We need to map the name variable to this aesthetic. Any time you use a variable like this, you need to make sure it is wrapped inside the aes function. This is a common error! Make note of this now so you don’t fall into this problem later. ggplot(data = flights_namedports, mapping = aes(x = carrier, fill = name)) + geom_bar() Figure 4.16: Stacked barplot comparing the number of flights by carrier and airport This plot is what is known as a stacked barplot. While simple to make, it often leads to many problems. Learning check (LC4.32) What kinds of questions are not easily answered by looking at the above figure? (LC4.33) What can you say, if anything, about the relationship between airline and airport in NYC in 2013 in regards to the number of departing flights? 
Another variation on the stacked barplot is the side-by-side barplot. ggplot(data = flights_namedports, mapping = aes(x = carrier, fill = name)) + geom_bar(position = &quot;dodge&quot;) Figure 4.17: Side-by-side barplot comparing the number of flights by carrier and airport Learning check (LC4.34) Why might the side-by-side barplot be preferable to a stacked barplot in this case? (LC4.35) What are the disadvantages of using a side-by-side barplot, in general? Lastly, an often preferred type of barplot is the faceted barplot. We already saw this concept of faceting and small multiples in Section 4.6. This gives us a nicer way to compare the distributions across both carrier and airport/name. ggplot(data = flights_namedports, mapping = aes(x = carrier, fill = name)) + geom_bar() + facet_grid(name ~ .) Figure 4.18: Faceted barplot comparing the number of flights by carrier and airport Note how the facet_grid function arguments are written here. We are wanting the names of the airports vertically and the carrier listed horizontally. As you may have guessed, this argument and other formulas of this sort in R are in y ~ x order. We will see more examples of this in Chapter 9. Learning check (LC4.36) Why is the faceted barplot preferred to the side-by-side and stacked barplots in this case? (LC4.37) What information about the different carriers at different airports is more easily seen in the faceted barplot? 4.8.4 Summary Barplots are the preferred way of displaying categorical variables. They are easy-to-understand and to make comparisons across groups of a categorical variable. When dealing with more than one categorical variable, faceted barplots are frequently preferred over side-by-side or stacked barplots. Stacked barplots are sometimes nice to look at, but it is quite difficult to compare across the levels since the sizes of the bars are all of different sizes. 
Side-by-side barplots can provide an improvement on this, but the issue about comparing across groups still must be dealt with. 4.9 Conclusion 4.9.1 Resources An excellent resource as you begin to create plots using the ggplot2 package is a cheatsheet that RStudio has put together entitled “Data Visualization with ggplot2” available by clicking here or by clicking the RStudio Menu Bar -&gt; Help -&gt; Cheatsheets -&gt; “Data Visualization with ggplot2” This covers more than what we’ve discussed in this chapter but provides nice visual descriptions of what each function produces. In addition, we’ve created a mind map to help you remember which types of plots are most appropriate in a given situation by identifying the types of variables involved in the problem. It is available here and below. Figure 4.19: Mind map for Data Visualization 4.9.2 Script of R code An R script file of all R code used in this chapter is available here. 4.9.3 What’s to come? In Chapter 5, we’ll further explore data by grouping our data, creating summaries based on those groupings, filtering our data to match conditions, and other manipulations with our data including defining new columns/variables. These data manipulation procedures will go hand-in-hand with the data visualizations you’ve produced here. References "],
+["5-manip.html", "5 Data Manipulation via dplyr 5.1 The pipe %&gt;% 5.2 Five Main Verbs - The 5MV 5.3 Joining data frames 5.4 Optional: Other verbs 5.5 Conclusion", " 5 Data Manipulation via dplyr Let’s briefly recap where we have been so far and where we are headed. In Chapter 3, we discussed what it means for data to be tidy. We saw that this refers to observational units corresponding to rows and variables being stored in columns (one variable for every column). The entries in the data frame correspond to different combinations of observational units and variables. In the flights data frame, we saw that each row corresponds to a different flight leaving New York City. In other words, the observational unit of that tidy data frame is a flight. The variables are listed as columns and for flights they include both quantitative variables like dep_delay and distance but also categorical variables like carrier and origin. An entry in the table corresponds to a particular flight on a given day and a particular value of a given variable representing that flight. We saw in Chapter 4 that organizing data in this tidy way makes it easy for us to produce graphics. We can simply specify what variable/column we would like on one axis, what variable we’d like on the other axis, and what type of plot we’d like to make. We can also do things such as changing the color by another variable or change the size of our points by a fourth variable given this tidy data set. Furthermore, in Chapter 4, we hinted at some ways to summarize and manipulate data to suit your needs. This chapter expands on this by giving a variety of examples using what we call the Five Main Verbs in the dplyr package (Wickham and Francois 2016). There are more advanced operations than just these and you’ll see some examples of this near the end of the chapter. While at various points we specifically make mention to use the View() command to inspect a particular data frame, feel free to do so whenever. 
In fact, you should get into the habit of doing this for any data frame you work with. Needed packages Before we proceed with this chapter, let’s load all the necessary packages. library(dplyr) library(ggplot2) library(nycflights13) library(knitr) 5.1 The pipe %&gt;% Before we introduce the five main verbs, we first introduce the pipe operator (%&gt;%). Just as the + sign was used to add layers to a plot created using ggplot(), the pipe operator allows us to chain together dplyr data manipulation functions. The pipe operator can be read as “then”. The %&gt;% operator allows us to go from one step in dplyr to the next easily so we can, for example: filter our data frame to only focus on a few rows then group_by another variable to create groups then summarize this grouped data to calculate the mean for each level of the group. The piping syntax will be our major focus throughout the rest of this book and you’ll find that you’ll quickly be addicted to the chaining with some practice. If you’d like to see more examples on using dplyr, the 5MV (in addition to some other dplyr verbs), and %&gt;% with the nycflights13 data set, you can check out Chapter 5 of Hadley and Garrett’s book (Grolemund and Wickham 2016). 5.2 Five Main Verbs - The 5MV The d in dplyr stands for data frames, so the functions here work when you are working with objects of the data frame type. It’s most important for you to focus on the 5MV: the five most commonly used functions that help us manipulate and summarize data. 
A description of these verbs follows with each subsection devoted to seeing an example of that verb in play (or a combination of a few verbs): filter: Pick rows based on conditions about their values summarize: Create summary measures of variables either over the entire data frame or over groups of observations on variables using group_by mutate: Create a new variable in the data frame by mutating existing ones arrange: Arrange/sort the rows based on one or more variables Just as we had the 5NG (The Five Named Graphs in Chapter 4 using ggplot2) for data visualization, we also have the 5MV here (The Five Main Verbs in dplyr) for data manipulation. All of the 5MVs follow the same syntax with the argument before the pipe %&gt;% being the name of the data frame and then the name of the verb with other arguments specifying which criteria you’d like the verb to work with in parentheses. 5.2.1 5MV#1: Filter observations using filter Figure 5.1: Filter diagram from Data Wrangling with dplyr and tidyr cheatsheet The filter function here works much like the “Filter” option in Microsoft Excel; it allows you to specify criteria about values of a variable in your data set and then chooses only those rows that match those criteria. We begin by focusing only on flights from New York City to Portland, Oregon. The dest code (or airport code) for Portland, Oregon is &quot;PDX&quot;. Run the following and look at the resulting spreadsheet to ensure that only flights heading to Portland are chosen here: portland_flights &lt;- flights %&gt;% filter(dest == &quot;PDX&quot;) View(portland_flights) Note the following: The ordering of the commands: Take the data frame flights then filter the data frame so that only those where the dest equals &quot;PDX&quot; are included. The double equal sign == You are almost guaranteed to make the mistake at least once of only including one equals sign. 
Let’s see what happens when we make this error: portland_flights &lt;- flights %&gt;% filter(dest = &quot;PDX&quot;) Error: filter() takes unnamed arguments. Do you need `==`? You can combine multiple criteria together using operators that make comparisons: | corresponds to “or” &amp; corresponds to “and” We can often skip the use of &amp; and just separate our conditions with a comma. You’ll see this in the example below. In addition, you can use other mathematical checks (similar to ==): &gt; corresponds to “greater than” &lt; corresponds to “less than” &gt;= corresponds to “greater than or equal to” &lt;= corresponds to “less than or equal to” != corresponds to “not equal to” To see many of these in action, let’s select all flights that left JFK airport heading to Burlington, Vermont (&quot;BTV&quot;) or Seattle, Washington (&quot;SEA&quot;) in the months of October, November, or December. Run the following btv_sea_flights_fall &lt;- flights %&gt;% filter(origin == &quot;JFK&quot;, (dest == &quot;BTV&quot; | dest == &quot;SEA&quot;), month &gt;= 10) View(btv_sea_flights_fall) Note how even though colloquially speaking one might say “all flights leaving Burlington, Vermont and Seattle, Washington”, in terms of computer operations, we really mean “all flights leaving Burlington, Vermont or Seattle, Washington”, because for a given row in the data, dest can either be: “BTV”, “SEA”, or something else, but not “BTV” and “SEA” at the same time. Another example uses the ! to pick rows that DON’T match a condition. Here we are selecting rows corresponding to flights that didn’t go to Burlington, VT or Seattle, WA. not_BTV_SEA &lt;- flights %&gt;% filter(!(dest == &quot;BTV&quot; | dest == &quot;SEA&quot;)) View(not_BTV_SEA) As a final note we point out that filter() should often be the first verb you’ll apply to your data. 
This cleans your data set to only those rows you care about, or put differently, it narrows down the scope to just the observational units you care about. Learning check (LC5.1) What’s another way using ! we could filter only the rows that are not going to Burlington, VT nor Seattle, WA in the flights data frame? Test this out using the code above. 5.2.2 5MV#2: Summarize variables using summarize Figure 5.2: Summarize diagram from Data Wrangling with dplyr and tidyr cheatsheet Figure 5.3: Another summarize diagram from Data Wrangling with dplyr and tidyr cheatsheet We saw in Subsection ?? a way to calculate the standard deviation and mean of the temperature variable temp in the weather data frame of nycflights13. We can do so in one step using the summarize function in dplyr: summary_temp &lt;- weather %&gt;% summarize(mean = mean(temp), std_dev = sd(temp)) summary_temp ## # A tibble: 1 × 2 ## mean std_dev ## &lt;dbl&gt; &lt;dbl&gt; ## 1 NA NA We’ve created a small data frame here called summary_temp that includes both the mean and the std_dev of the temp variable in weather. Notice as shown in Figures 5.2 and 5.3, the data frame weather went from many rows to a single row of just the summary values in the data frame summary_temp. But why are the mean and standard deviation missing, i.e. NA? Remember that by default the mean and sd functions do not ignore missing values. We need to specify the argument na.rm=TRUE (rm is short for “remove”): summary_temp &lt;- weather %&gt;% summarize(mean = mean(temp, na.rm = TRUE), std_dev = sd(temp, na.rm = TRUE)) summary_temp ## # A tibble: 1 × 2 ## mean std_dev ## &lt;dbl&gt; &lt;dbl&gt; ## 1 55.2 17.78 If we’d like to access either of these values directly we can use the $ to specify a column in a data frame. For example: summary_temp$mean ## [1] 55.2 You’ll often encounter issues with missing values NA. In fact, an entire branch of the field of statistics deals with missing data. 
However, it is not good practice to include a na.rm = TRUE in your summary commands by default; you should attempt to run them without this argument. The idea being you should at the very least be alerted to the presence of missing values and consider what the impact on the analysis might be if you ignore these values. In other words, na.rm = TRUE should only be used when necessary. What other summary functions can we use inside the summarize() verb? Any function in R that takes a vector of values and returns just one. Here are just a few: min() and max(): the minimum and maximum values respectively IQR(): Interquartile range sum(): the sum n(): a count of the number of rows/observations in each group. This particular summary function will make more sense in the group_by chapter. Learning check (LC5.2) Say a doctor is studying the effect of smoking on lung cancer of a large number of patients who have records measured at five year intervals. He notices that a large number of patients have missing data points because the patient has died, so he chooses to ignore these patients in his analysis. What is wrong with this doctor’s approach? (LC5.3) Modify the above summarize function to use the n() summary function: summarize(count=n()). What does the returned value correspond to? (LC5.4) Why doesn’t the following code work? You may want to run the code line by line: summary_temp &lt;- weather %&gt;% summarize(mean = mean(temp, na.rm = TRUE)) %&gt;% summarize(std_dev = sd(temp, na.rm = TRUE)) 5.2.3 5MV#3: Group rows using group_by Figure 5.4: Group by and summarize diagram from Data Wrangling with dplyr and tidyr cheatsheet However, it’s often more useful to summarize a variable based on the groupings of another variable. Let’s say similarly to the previous section, we are interested in the mean and standard deviation of temperatures but grouped by month. 
This concept can equivalently be articulated as: we want the mean and standard deviation of temperatures split by month. sliced by month. aggregated by month. collapsed over month. We believe that you will be amazed at just how simple this is. Run the following code: summary_monthly_temp &lt;- weather %&gt;% group_by(month) %&gt;% summarize(mean = mean(temp, na.rm = TRUE), std_dev = sd(temp, na.rm = TRUE)) summary_monthly_temp ## # A tibble: 12 × 3 ## month mean std_dev ## &lt;dbl&gt; &lt;dbl&gt; &lt;dbl&gt; ## 1 1 35.64 10.185 ## 2 2 34.15 6.940 ## 3 3 39.81 6.225 ## 4 4 51.67 8.785 ## 5 5 61.59 9.609 ## 6 6 72.14 7.603 ## 7 7 80.01 7.148 ## 8 8 74.40 5.171 ## 9 9 67.43 8.476 ## 10 10 60.03 8.830 ## 11 11 45.11 10.502 ## 12 12 38.37 9.941 This code is identical to the previous code that created summary_temp, but there is an extra group_by(month) spliced in between. By simply grouping the weather data set by month first and then passing this new data frame into summarize we get a resulting data frame that shows the mean and standard deviation temperature for each month in New York City. Since each row in summary_monthly_temp represents a summary of different rows in weather, the observational units have changed. It is important to note that group_by doesn’t actually change the data frame. It simply sets meta-data (data about the data), specifically the group structure of the data. It is only after we apply the summarize function that the data frame actually changes. If we would like to remove this group structure meta-data, we can pipe a resulting data frame into the ungroup() function. We now revisit the n() counting summary function we introduced in the previous section. 
For example, suppose we’d like to get a sense for how many flights departed each of the three airports in New York City: by_origin &lt;- flights %&gt;% group_by(origin) %&gt;% summarize(count = n()) by_origin ## # A tibble: 3 × 2 ## origin count ## &lt;chr&gt; &lt;int&gt; ## 1 EWR 120835 ## 2 JFK 111279 ## 3 LGA 104662 We see that Newark (&quot;EWR&quot;) had the most flights departing in 2013 followed by &quot;JFK&quot; and lastly by LaGuardia (&quot;LGA&quot;). Note there is a subtle but important difference between sum() and n(). While sum() simply adds up a large set of numbers, the latter counts the number of times each of many different values occur. You are not limited to grouping by one variable! Say you wanted to know the number of flights leaving each of the three New York City airports for each month, we can also group by a second variable month: group_by(origin, month). Run the following: by_monthly_origin &lt;- flights %&gt;% group_by(origin, month) %&gt;% summarize(count = n()) View(by_monthly_origin) Learning check (LC5.5) Recall from Chapter 4 when we looked at plots of temperatures by months in NYC. What does the standard deviation column in the summary_monthly_temp data frame tell us about temperatures in New York City throughout the year? (LC5.6) What code would be required to get the mean and standard deviation temperature for each day in 2013 for NYC? (LC5.7) Recreate by_monthly_origin, but instead of grouping via group_by(origin, month), group variables in a different order group_by(month, origin). What differs in the resulting data set? (LC5.8) How could we identify how many flights left each of the three airports for each carrier? (LC5.9) How does the filter operation differ from a group_by followed by a summarize? 
5.2.4 5MV#4: Create new variables/change old variables using mutate Figure 5.5: Mutate diagram from Data Wrangling with dplyr and tidyr cheatsheet When looking at the flights data set, there are some clear additional variables that could be calculated based on the values of variables already in the data set. Passengers are often frustrated when their flight departs late, but change their mood a bit if pilots can make up some time during the flight to get them to their destination close to when they expected to land. This is commonly referred to as “gain” and we will create this variable using the mutate function. Note that we have also overwritten the flights data frame with what it was before as well as an additional variable gain here. flights &lt;- flights %&gt;% mutate(gain = arr_delay - dep_delay) Why did we overwrite flights instead of assigning the resulting data frame to a new object, like flights_with_gain? As a rough rule of thumb, as long as you are not losing information that you might need later, it’s acceptable practice to overwrite data frames. However, if you overwrite existing variables and/or change the observational units, recovering the original information might prove difficult. In this case, it might make sense to create a new data object. 
Let’s look at summary measures of this gain variable and even plot it in the form of a histogram: gain_summary &lt;- flights %&gt;% summarize( min = min(gain, na.rm = TRUE), q1 = quantile(gain, 0.25, na.rm = TRUE), median = quantile(gain, 0.5, na.rm = TRUE), q3 = quantile(gain, 0.75, na.rm = TRUE), max = max(gain, na.rm = TRUE), mean = mean(gain, na.rm = TRUE), sd = sd(gain, na.rm = TRUE), missing = sum(is.na(gain)) ) gain_summary ## # A tibble: 1 × 8 ## min q1 median q3 max mean sd missing ## &lt;dbl&gt; &lt;dbl&gt; &lt;dbl&gt; &lt;dbl&gt; &lt;dbl&gt; &lt;dbl&gt; &lt;dbl&gt; &lt;int&gt; ## 1 -109 -17 -7 3 196 -5.66 18.04 9430 We’ve recreated the summary function we saw in Chapter 4 here using the summarize function in dplyr. ggplot(data = flights, mapping = aes(x = gain)) + geom_histogram(color = &quot;white&quot;, bins = 20) Figure 5.6: Histogram of gain variable We can also create multiple columns at once and even refer to columns that were just created in a new column. Hadley produces one such example in Chapter 5 of “R for Data Science” (Grolemund and Wickham 2016): flights &lt;- flights %&gt;% mutate( gain = arr_delay - dep_delay, hours = air_time / 60, gain_per_hour = gain / hours ) Learning check (LC5.10) What do positive values of the gain variable in flights correspond to? What about negative values? And what about a zero value? (LC5.11) Could we create the dep_delay and arr_delay columns by simply subtracting dep_time from sched_dep_time and similarly for arrivals? Try the code out and explain any differences between the result and what actually appears in flights. (LC5.12) What can we say about the distribution of gain? Describe it in a few sentences using the plot and the gain_summary data frame values. 5.2.5 5MV#5: Reorder the data frame using arrange As you may have thought about with the data frames we’ve worked with so far in the book, one of the most common things you’d like to do is sort the data frames by a specific variable in a column. 
Have you ever been asked to calculate a median by hand? This requires you to put the data in order from smallest to highest in value. The dplyr package has a function called arrange that we will use to sort/reorder our data according to the values of the specified variable. This is often used after we have used the group_by and summarize functions as we will see. Let’s suppose we were interested in determining the most frequent destination airports from New York City in 2013: freq_dest &lt;- flights %&gt;% group_by(dest) %&gt;% summarize(num_flights = n()) freq_dest ## # A tibble: 105 × 2 ## dest num_flights ## &lt;chr&gt; &lt;int&gt; ## 1 ABQ 254 ## 2 ACK 265 ## 3 ALB 439 ## 4 ANC 8 ## 5 ATL 17215 ## 6 AUS 2439 ## 7 AVL 275 ## 8 BDL 443 ## 9 BGR 375 ## 10 BHM 297 ## # ... with 95 more rows You’ll see that by default the values of dest are displayed in alphabetical order here. We are interested in finding those airports that appear most: freq_dest %&gt;% arrange(num_flights) ## # A tibble: 105 × 2 ## dest num_flights ## &lt;chr&gt; &lt;int&gt; ## 1 LEX 1 ## 2 LGA 1 ## 3 ANC 8 ## 4 SBN 10 ## 5 HDN 15 ## 6 MTJ 15 ## 7 EYW 17 ## 8 PSP 19 ## 9 JAC 25 ## 10 BZN 36 ## # ... with 95 more rows This is actually giving us the opposite of what we are looking for. It tells us the least frequent destination airports first. To switch the ordering to be descending instead of ascending we use the desc function: freq_dest %&gt;% arrange(desc(num_flights)) ## # A tibble: 105 × 2 ## dest num_flights ## &lt;chr&gt; &lt;int&gt; ## 1 ORD 17283 ## 2 ATL 17215 ## 3 LAX 16174 ## 4 BOS 15508 ## 5 MCO 14082 ## 6 CLT 14064 ## 7 SFO 13331 ## 8 FLL 12055 ## 9 MIA 11728 ## 10 DCA 9705 ## # ... with 95 more rows 5.3 Joining data frames Another common task is joining/merging two different data sets. For example, in the flights data, the variable carrier lists the carrier code for the different flights. 
While &quot;UA&quot; and &quot;AA&quot; might be somewhat easy to guess for some (United and American Airlines), what are “VX”, “HA”, and “B6”? This information is provided in a separate data frame airlines. View(airlines) We see that in airlines, carrier is the carrier code while name is the full name of the airline. Using this table, we can see that “VX”, “HA”, and “B6” correspond to Virgin America, Hawaiian Airlines, and JetBlue respectively. However, will we have to continually look up the carrier’s name for each flight in the airlines data set? No! Instead of having to manually do this, we can have R automatically do this “looking up” for us. Note that the values in the variable carrier in flights match the values in the variable carrier in airlines. In this case, we can use the variable carrier as a key variable to join/merge/match the two data frames by. Hadley and Garrett (Grolemund and Wickham 2016) created the following diagram to help us understand how the different data sets are linked: Figure 5.7: Data relationships in nycflights13 from R for Data Science 5.3.1 Joining by Key Variables In both flights and airlines, the key variable we want to join/merge/match the two data frames with has the same name in both data sets: carrier. We make use of the inner_join() function to join by the variable carrier. flights_joined &lt;- flights %&gt;% inner_join(airlines, by=&quot;carrier&quot;) View(flights) View(flights_joined) We observed that the flights and flights_joined are identical except that flights_joined has an additional variable name whose values were drawn from airlines. A visual representation of the inner_join is given below (Grolemund and Wickham 2016): Figure 5.8: Diagram of inner join from R for Data Science There are more complex joins available, but the inner_join will solve nearly all of the problems you’ll face in our experience. 
5.3.2 Joining by Key Variables with Different Names Say instead, you are interested in all the destinations of flights from NYC in 2013 and ask yourself: “What cities are these airports in?” “Is &quot;ORD&quot; Orlando?” “Where is &quot;FLL&quot;?” The airports data frame contains airport codes: View(airports) However, looking at both the airports and flights and the visual representation of the relations between the data frames in Figure 5.7, we see that in: airports the airport code is in the variable faa flights the airport code is in the variable origin So to join these two data sets, our inner_join operation involves a by argument that accounts for the different names: flights %&gt;% inner_join(airports, by = c(&quot;dest&quot; = &quot;faa&quot;)) Let’s construct the sequence of commands that computes the number of flights from NYC to each destination but also includes information about each destination airport: named_dests &lt;- flights %&gt;% group_by(dest) %&gt;% summarize(num_flights = n()) %&gt;% arrange(desc(num_flights)) %&gt;% inner_join(airports, by = c(&quot;dest&quot; = &quot;faa&quot;)) %&gt;% rename(airport_name = name) View(named_dests) In case you didn’t know, &quot;ORD&quot; is the airport code of Chicago O’Hare airport and &quot;FLL&quot; is the main airport in Fort Lauderdale, Florida, which we can now see in our named_dests data frame. Learning check (LC5.13) Looking at Figure 5.7, when joining flights and weather, or in other words match the hourly weather values with each flight, why do we need to join by all of year, month, day, hour, and origin, and not just hour? (LC5.14) What surprises you about the top 10 destinations from NYC in 2013? 5.4 Optional: Other verbs 5.4.1 Select variables using select Figure 5.9: Select diagram from Data Wrangling with dplyr and tidyr cheatsheet We’ve seen that the flights data frame in the nycflights13 package contains many different variables. 
The names function gives a listing of all the columns in a data frame; in our case you would run names(flights). You can also identify these variables by running the glimpse function in the dplyr package: glimpse(flights) However, say you only want to consider two of these variables, say carrier and flight. You can select these: flights %&gt;% select(carrier, flight) Another one of these variables is year. If you remember the original description of the flights data frame (or by running ?flights), you’ll remember that this data correspond to flights in 2013 departing New York City. The year variable isn’t really a variable here in that it doesn’t vary… flights actually comes from a larger data set that covers many years. We may want to remove the year variable from our data set since it won’t be helpful for analysis in this case. We can deselect year by using the - sign: flights_no_year &lt;- flights %&gt;% select(-year) names(flights_no_year) Or we could specify a ranges of columns: flight_arr_times &lt;- flights %&gt;% select(month:day, arr_time:sched_arr_time) flight_arr_times The select function can also be used to reorder columns in combination with the everything helper function. Let’s suppose we’d like the hour, minute, and time_hour variables, which appear at the end of the flights data set, to actually appear immediately after the day variable: flights_reorder &lt;- flights %&gt;% select(month:day, hour:time_hour, everything()) names(flights_reorder) in this case everything() picks up all remaining variables. 
Lastly, the helper functions starts_with, ends_with, and contains can be used to choose column names that match those conditions: flights_begin_a &lt;- flights %&gt;% select(starts_with(&quot;a&quot;)) flights_begin_a flights_delays &lt;- flights %&gt;% select(ends_with(&quot;delay&quot;)) flights_delays flights_time &lt;- flights %&gt;% select(contains(&quot;time&quot;)) flights_time 5.4.2 Rename variables using rename Another useful function is rename, which as you may suspect renames one column to another name. Suppose we wanted dep_time and arr_time to be departure_time and arrival_time instead in the flights_time data frame: flights_time_new &lt;- flights %&gt;% select(contains(&quot;time&quot;)) %&gt;% rename(departure_time = dep_time, arrival_time = arr_time) names(flights_time_new) It’s easy to forget if the new name comes before or after the equals sign. I usually remember this as “New Before, Old After” or NBOA. You’ll receive an error if you try to do it the other way: Error: Unknown variables: departure_time, arrival_time. 5.4.3 Find the top number of values using top_n We can also use the top_n function which automatically tells us the most frequent num_flights. We specify the top 10 airports here: named_dests %&gt;% top_n(n = 10, wt = num_flights) We’ll still need to arrange this by num_flights though: named_dests %&gt;% top_n(n = 10, wt = num_flights) %&gt;% arrange(desc(num_flights)) Note: Remember that I didn’t pull the n and wt arguments out of thin air. They can be found by using the ? function on top_n. We can go one step further and tie together the group_by and summarize functions we used to find the most frequent flights: ten_freq_dests &lt;- flights %&gt;% group_by(dest) %&gt;% summarize(num_flights = n()) %&gt;% top_n(n = 10) %&gt;% arrange(desc(num_flights)) View(ten_freq_dests) Learning check (LC5.15) What are some ways to select all three of the dest, air_time, and distance variables from flights? 
Give the code showing how to do this in at least three different ways. (LC5.16) How could one use starts_with, ends_with, and contains to select columns from the flights data frame? Provide three different examples in total: one for starts_with, one for ends_with, and one for contains. (LC5.17) Why might we want to use the select function on a data frame? (LC5.18) Create a new data frame that shows the top 5 airports with the largest arrival delays from NYC in 2013. 5.5 Conclusion 5.5.1 Resources As we saw with the RStudio cheatsheet on data visualization, RStudio has also created a cheatsheet for data manipulation entitled “Data Wrangling with dplyr and tidyr” available by clicking here or by clicking the RStudio Menu Bar -&gt; Help -&gt; Cheatsheets -&gt; “Data Manipulation with dplyr, tidyr” We will focus only on the dplyr functions in this book, but you are encouraged to also explore tidyr if you are presented with data that is not in the tidy format that we have specified as the preferred option for our purposes. 5.5.2 Script of R code An R script file of all R code used in this chapter is available here. 5.5.3 What’s to come? This concludes the Data Exploration unit of this book. You should be pretty proficient in both plotting variables (or multiple variables together) in various data sets and manipulating data as we’ve done in this chapter. You are encouraged to step back through the code in earlier chapters and make changes as you see fit based on your updated knowledge. In Chapter 6, we’ll begin to build the pieces needed to understand how this unit of Data Exploration can tie into statistical inference in the Inference part of the book. Remember that the focus throughout is on data visualization and we’ll see that next when we discuss sampling, resampling, and bootstrapping. These ideas will lead us into hypothesis testing and confidence intervals. References "],
+["6-sim.html", "6 Simulating Randomness via mosaic 6.1 Random sampling 6.2 Visualizing sampling 6.3 Simulation 6.4 Review of mosaic simulation functions 6.5 Conclusion", " 6 Simulating Randomness via mosaic In this chapter we will introduce new concepts that will serve as the basis for the remainder of the text: sampling and resampling. We will see that the tools that you learned in the Data Exploration part of this book (tidy data, data visualization, and data manipulation) will also play an important role here. As mentioned before, the concepts throughout this text all build into a culmination allowing you to create better stories with data. We begin with some helpful definitions that will help us better understand why statistical inference exists and why it is needed. We will then progress with introducing the second of our main data sets (in addition to the nycflights13 data you’ve been working with) about OKCupid dating profiles to see how one can think of the distribution of a sample being an approximation of the distribution of the population. We will also focus on representative, random samples versus convenience samples in this context. We then shift to a famous example from statistics lore on a lady tasting tea. This section will focus on introducing concepts without a lot of statistical jargon. The chapter will conclude with a summary of the different functions introduced in the mosaic package in this chapter. Needed packages library(dplyr) library(ggplot2) library(okcupiddata) library(mosaic) library(knitr) 6.1 Random sampling Whenever you hear the phrases “random sampling” or just “sampling” (with regards to statistics), you should think about tasting soup. This likely sounds a little bonkers. Let’s dig into why tasting soup is such an excellent analogy to random sampling. 
6.1.1 Tasting soup Figure 6.1: A bowl of Indian chicken and vegetable soup Imagine that you have invited a group of friends over to try a new recipe for soup that you’ve never made before. As in the image above downloaded from here, you’d like to make a bowl of Indian chicken soup with lots of different kinds of vegetables included. You’ve carefully followed along with the recipe but you are concerned that you don’t have a lot of experience making foods from India. It is coming near the end of the prescribed time to cook given in the recipe. You begin to wonder: “Did I add too much curry spice?” “Are the carrots cooked enough?” “Does this actually taste good?” How can we answer these questions? Does it matter where we take a bite of soup from? Is there anything we should do to the soup before we taste? Is one taste enough? 6.1.2 Common terms The process of sampling brings with it many common terms that we define now. As you read over these definitions, think about how they each apply to the tasting soup example above. Definition: population The population is the (usually) large pool of observational units that we are interested in. Definition: sample A sample is a smaller collection of observational units that is selected from the population. Definition: sampling Sampling refers to the process of selecting observations from a population. There are both random and non-random ways this can be done. Definition: representative sample A sample is said to be a representative sample if the characteristics of observational units selected are a good approximation of the characteristics from the original population. Definition: bias Bias corresponds to a favoring of one group in a population over another group. Definition: generalizability Generalizability refers to the largest group in which it makes sense to make inferences about from the sample collected. This is directly related to how the sample was selected. 
Definition: parameter A parameter is a calculation based on one or more variables measured in the population. Parameters are almost always denoted symbolically using Greek letters such as \\(\\mu\\), \\(\\pi\\), \\(\\sigma\\), \\(\\rho\\), and \\(\\beta\\). Definition: statistic A statistic is a calculation based on one or more variables measured in the sample. Statistics are usually denoted by lower case Arabic letters with other symbols added sometimes. These include \\(\\bar{x}\\), \\(\\hat{p}\\), \\(s\\), \\(p\\), and \\(b\\). Learning check (LC6.1) Explain in your own words how tasting soup relates to the concepts of sampling covered here. (LC6.2) Describe a different scenario (not food or drink related) that is analogous to sampling concepts covered here. Let’s explore these terms for our tasting soup example: Population - the entire container of soup that we have cooked. Sample - any smaller portion of soup collected that isn’t the whole container of soup. We could say that each spoonful of soup represents one sample. Sampling - the process of selecting spoonfuls from the container of soup Representative sample - A sample we select will only be representative if it tastes like what the soup tastes like in general. If we only select a carrot in our spoonful, we might not have a representative sample. Bias - As we noted with the carrot selection example above, we may select a sample that is not representative. If you watch chefs cook or if you frequently cook, you’ll be sure to stir the soup before you taste it. Generalizability - If we stir our soup before we taste a spoonful (and if we make sure we don’t just pick our favorite item in the soup), results from our sample can be generalized (by and large) to the larger pot of soup. When we say “Yum! This is good!” after a couple spoonfuls, we can be pretty confident that each bowl of soup for our friends will taste good too. 
Parameter - An example here could be the proportion of curry entered into the entire pot of soup. A measurement of how salty the pot of soup is on average is also a parameter. How crunchy, on average, the carrots are in the pot of soup is one more example. Statistic - To convert a parameter to a statistic, you need only to think about the same measurement on a spoonful: The proportion of curry to non-curry in a spoonful of soup How salty the spoonful of soup is that we collected as our sample How crunchy the carrots are in our spoonful of soup Learning check (LC6.3) Why isn’t our population all bowls of soup? All bowls of Indian chicken soup? (LC6.4) Describe a way in which we could select a sample of flights from nycflights13 that is not representative. (LC6.5) If we treat all of the flights in nycflights13 as the population, give examples of three parameters we could calculate. (LC6.6) If we treat all of the flights in nycflights13 as the population, give examples of three statistics we could calculate. (LC6.7) What biases might we see if we only select flights to Boston when we are interested in looking at mean flight delays from NYC? 6.2 Visualizing sampling Let’s explore how sampling and these other terms relate to working with data and data visualization. Here we introduce the okcupiddata R package (Kim and Escobedo-Land 2016). Note that permission to use this data to create the R package was explicitly granted by OkCupid. More information about this package is available here. The profiles data frame in this R data package contains data about 59,946 OkCupid users who were living within 25 miles of San Francisco, had active profiles on June 26, 2012, were online in the previous year, and had at least one picture in their profile. We will be focusing on the height variable, which corresponds to a self-reported height for each individual on their profile. Note that this is measured in inches. 
library(okcupiddata) data(profiles) Let’s take a look at the distribution of height using a histogram and ggplot2: library(ggplot2) ggplot(data = profiles, mapping = aes(x = height)) + geom_histogram(bins = 20, color = &quot;white&quot;) We see here that this being self-reported data has led to the data being a little messy. Learning check (LC6.8) Why does the histogram go all the way back to 0 for height and all the way up to 100? To clean up the data a bit, let’s focus on just looking at heights between 55 inches and 85 inches. Remember that the filter function in dplyr allows us to focus on a subset of rows. The specific subset of rows we are interested in corresponds to the argument to the filter function. We will create a new data frame called profiles_subset that contains all rows with heights between 55 and 85 inclusive. library(dplyr) profiles_subset &lt;- profiles %&gt;% filter(between(height, 55, 85)) Next, let’s produce the same histogram as above but using the profiles_subset data frame instead. ggplot(data = profiles_subset, mapping = aes(x = height)) + geom_histogram(bins = 20, color = &quot;white&quot;) We can think of this data as representing the population of interest. Let’s now take a random sample of size 100 from this population and look to see if this sample represents the overall shape of the population. In other words, we are going to use data visualization as our guide to understand the representativeness of the sample selected. library(mosaic) set.seed(2017) profiles_sample1 &lt;- profiles_subset %&gt;% resample(size = 100, replace = FALSE) The set.seed function is used to ensure that all users get the same random sample when they run the code above. It is a way of interfacing with the pseudo-random number generation scheme that R uses to generate “random” numbers. If that command was not run, you’d obtain a different random sample than someone else if you ran the code above for the first time. 
We have introduced the resample function from the mosaic package here (Pruim, Kaplan, and Horton 2016). This function can be used for both sampling with and without replacement. Here we have chosen to sample without replacement. In other words, after the first row is chosen from the profiles_subset data frame at random it is kept out of the further 99 samples. Let’s now visualize the 100 values of the height variable in the profiles_sample1 data frame. To keep this visualization on the same horizontal scale as our original population presented in profiles_subset we can use the coord_cartesian function along with the c function to specify the limits on the horizontal axis. ggplot(data = profiles_sample1, mapping = aes(x = height)) + geom_histogram(bins = 20, color = &quot;white&quot;, fill = &quot;red&quot;) + coord_cartesian(xlim = c(55, 85)) Learning check (LC6.9) Does this random sample of height represent the population height variable well? Explain why or why not in a couple of sentences. We now repeat this process of sampling to look to see how another random sample of height compares to the original population distribution. profiles_sample2 &lt;- profiles_subset %&gt;% resample(size = 100, replace = FALSE) ggplot(data = profiles_sample2, mapping = aes(x = height)) + geom_histogram(bins = 20, color = &quot;black&quot;, fill = &quot;yellow&quot;) + coord_cartesian(xlim = c(55, 85)) Remember that a sample can never truly quantify all of the properties of a population since it contains less data and, thus, less information. We can use the overall shape as a good guess as to the representativeness of the sample in regards to the population though. We see that the above two random samples of size 100 have roughly the same shape as the original population height data. Let’s next explore what is known as a convenience sample and how its distribution compares to the population distribution. 
A convenience sample is a sample that is chosen conveniently by the person selecting the sample. While certainly less work, convenience samples are generally not representative of the population since they will exclude some (usually large) portion of the population. Let’s look at values of height in our profiles_subset population that are larger than 6 feet tall (72 inches) and have that be the sample we choose. profiles_sample3 &lt;- profiles_subset %&gt;% filter(height &gt;= 72) ggplot(data = profiles_sample3, mapping = aes(x = height)) + geom_histogram(bins = 20, color = &quot;white&quot;, fill = &quot;blue&quot;) + coord_cartesian(xlim = c(55, 85)) This is a clear example of a sample that is not representative of the population. The population height variable is roughly symmetric, whereas this distribution is right-skewed. Further, since it only selects large heights it has completely excluded the small and middle heights. We have seen here that data visualization provides an excellent tool in judging the representativeness of a sample. 6.2.1 Sampling distribution The representativeness of a sample plays an even larger role than just looking at the shapes of distributions. Let’s suppose we were interested in estimating the mean height of all profiles in the profiles_subset data frame. To do so, we could look at the mean of the height variable in the profiles_sample1 data frame: profiles_sample1 %&gt;% summarize(mean(height)) ## mean(height) ## 1 68.45 But, we could also use profiles_sample2: profiles_sample2 %&gt;% summarize(mean(height)) ## mean(height) ## 1 68.2 Or maybe even profiles_sample3: profiles_sample3 %&gt;% summarize(mean(height)) ## mean(height) ## 1 73.38 We see a clear difference here in looking at the mean of height in profiles_sample3 versus profiles_sample1 and profiles_sample2. This comes from the bias that is used in choosing only the top heights for profiles_sample3. 
If we had chosen to use this sample as our only sample, we would be quite a ways off from what the actual mean height in our population of profiles_subset is. We also see that even random samples produce means that aren’t exactly the same. This sampling variability can be shown via what is called a sampling distribution. This is defined as the behavior of a statistic under repeated sampling. To build this sampling distribution for this example, we’ve created an interactive app using the shiny R package below that is available at http://ismay.shinyapps.io/okcupidheights/. You can specify the sample size you’d like to work with (100 is chosen by default) and then generate a random sample. You then can see the mean of this generated sample plotted in the bottom visualization. Repeating this process many times, you can start to see the shape of the sampling distribution take form. Figure 6.2: Sampling distribution app 6.2.2 Repeated sampling via do We have looked at two random samples above, but using mosaic we can repeat this process over and over again with the do function. Below, we repeat this sampling process 10,000 times. We can then plot the different values of the sample means to get a sense for what a reasonable range of values for the population parameter mean height is in the profiles_subset data frame. sample_means &lt;- do(10000) * (profiles_subset %&gt;% resample(size = 100, replace = FALSE) %&gt;% summarize(mean_height = mean(height))) ggplot(data = sample_means, mapping = aes(x = mean_height)) + geom_histogram(color = &quot;white&quot;, bins = 20) Note how the range of sample mean height values is much more narrow than the original range of height in the profiles_subset data frame. We also see a characteristic shape to this distribution of mean_height: the normal curve. This idea is commonly associated with statistics and you hopefully have a good sense of how this distribution comes about. 
As before, if you aren’t quite sure of this yet, go back and explore the shiny app above a bit more. We see that many values for the sample mean appear near the center of the distribution and a few values appear out in the tails providing the bell-shaped distribution linked with the normal distribution. You’ll see more examples of this in the chapters to come and in Appendix B. Learning check (LC6.10) Why do the sample mean values have a much smaller spread than the original population data? You may want to play with the shiny app above a bit to understand why this is the case. (LC6.11) Why is random sampling so important here to create a distribution of sample means that provide a range of plausible values for the population mean height? 6.3 Simulation We next will introduce the ideas behind hypothesis testing that we will delve into more formally in the chapters to come. What follows is taken from a book entitled The Lady Tasting Tea (Salsburg 2001): It was a summer afternoon in Cambridge, England, in the late 1920s. A group of university dons, their wives, and some guests were sitting around an outdoor table for afternoon tea. One of the women was insisting that tea tasted different depending upon whether the tea was poured into the milk or whether the milk was poured into the tea. The scientific minds among the men scoffed at this as sheer nonsense. What could be the difference? They could not conceive of any difference in the chemistry of the mixtures that could exist. A thin, short man, with thick glasses and a Vandyke beard beginning to turn gray, pounced on the problem. “Let us test the proposition,” he said excitedly. He began to outline an experiment in which the lady who insisted there was a difference would be presented with a sequence of cups of tea, in some of which the milk had been poured into the tea and in others of which the tea had been poured into the milk… So it was that sunny summer afternoon in Cambridge. 
The lady might or might not have been correct about the tea infusion. The fun would be in finding a way to determine if she was right, and, under the direction of the man with the Vandyke beard, they began to discuss how they might make that determination. Enthusiastically, many of them joined with him in setting up the experiment. Within a few minutes, they were pouring different patterns of infusion in a place where the lady could not see which cup was which. Then, with an air of finality, the man with the Vandyke beard presented her with her first cup. She sipped for a minute and declared that it was one where the milk had been poured into the tea. He noted her response without comment and presented her with the second cup… The man with the Vandyke beard was Ronald Aylmer Fisher, who was in his late thirties at the time. He would later be knighted Sir Ronald Fisher. In 1935, he wrote a book entitled The Design of Experiments, and he described the experiment of the lady tasting tea in the second chapter of that book. In his book, Fisher discusses the lady and her belief as a hypothetical problem. He considers the various ways in which an experiment might be designed to determine if she could tell the difference. The problem in designing the experiment is that, if she is given a single cup of tea, she has a 50 percent chance of guessing correctly which infusion was used, even if she cannot tell the difference. If she is given two cups of tea, she still might guess correctly. In fact, if she knew that the two cups of tea were each made with a different infusion, one guess could be completely right (or completely wrong). Similarly, even if she could tell the difference, there is some chance that she might have made a mistake, that one of the cups was not mixed as well or that the infusion was made when the tea was not hot enough. She might be presented with a series of ten cups and correctly identify only nine of them, even if she could tell the difference. 
In his book, Fisher discusses the various possible outcomes of such an experiment. He describes how to decide how many cups should be presented and in what order and how much to tell the lady about the order of presentations. He works out the probabilities of different outcomes, depending upon whether the lady is or is not correct. Nowhere in this discussion does he indicate that such an experiment was ever run. Nor does he describe the outcome of an actual experiment. It’s amazing that there is no actual evidence that such an event actually took place. This problem is a great introduction into inference though and we can proceed by testing to see how likely it is for a person to guess correctly, say, 9 out of 10 times, assuming that person is just guessing. In other words, is the person just lucky or do we have reason to suspect that they can actually detect whether milk was put in first or not? We need to think about this problem from the standpoint of hypothesis testing. First, we’ll need to identify some important parts of a hypothesis test before we proceed with the analysis. Learning check (LC6.12) What does “by chance” mean in this context? (LC6.13) What is our observed statistic? (LC6.14) What is this statistic trying to estimate? (LC6.15) How could we test to see whether the person is just guessing or if they have some special talent of identifying milk before tea or vice-versa? Let’s begin with an experiment. I will flip a coin 10 times. Your job is to try to predict the sequence of my 10 flips. Write down 10 H’s and T’s corresponding to your predictions. We could compare your guesses with my actual flips and then we will note how many correct guesses you have. You may be asking yourself how this models a way to test whether the person was just guessing or not. All we are trying to do is see how likely it is to have 9 matches out of 10 if the person was truly guessing. 
When we say “truly guessing” we are assuming that we have a 50/50 chance of guessing correctly. This can be modeled using a coin flip and then seeing whether we guessed correctly for each of the coin flips. If we guessed correctly, we can think of that as a “success.” We often don’t have time to do the physical flipping over and over again and we’d like to be able to do more than just 20 different simulations or so. Luckily, we can use R to simulate this process many times. The mosaic package includes a function called rflip(), which can be used to flip one coin. Well, not exactly. It uses pseudo-random number generation to “flip” a virtual coin. In order for us all to get the same results here, we can set the seed of the pseudo-random number generator. Let’s see an example of this: (Remember to load the mosaic package!) library(mosaic) set.seed(2017) do(1) * rflip(1) ## n heads tails prop ## 1 1 1 0 1 This shows us the proportion of “successes” in one flip of a coin. The do function in the mosaic package will be useful and you can begin to understand what it does with another example. do(13) * rflip(10) ## n heads tails prop ## 1 10 4 6 0.4 ## 2 10 5 5 0.5 ## 3 10 5 5 0.5 ## 4 10 7 3 0.7 ## 5 10 5 5 0.5 ## 6 10 7 3 0.7 ## 7 10 5 5 0.5 ## 8 10 4 6 0.4 ## 9 10 7 3 0.7 ## 10 10 2 8 0.2 ## 11 10 4 6 0.4 ## 12 10 5 5 0.5 ## 13 10 4 6 0.4 We’ve now done a simulation of what actually happened when you flipped a coin ten times. We have 13 different simulations of flipping a coin 10 times. Note here that heads now corresponds to the number of correct guesses and tails corresponds to the number of incorrect guesses. (This can be tricky to understand at first since we’ve done a switch on what the meaning of “heads” and “tails” are.) If you look at the output above for our simulation of 13 student guesses, we can begin to get a sense for what an “expected” sample proportion of successes may be. Around five out of 10 seems to be the most likely value. 
What does this say about what we actually observed with a success rate of 9/10? To better answer this question, we can simulate 10,000 student guesses and then look at the distribution of the simulated sample proportion of successes, also known as the null distribution. library(dplyr) simGuesses &lt;- do(10000) * rflip(10) simGuesses %&gt;% group_by(heads) %&gt;% summarize(count = n()) ## # A tibble: 11 × 2 ## heads count ## &lt;dbl&gt; &lt;int&gt; ## 1 0 9 ## 2 1 98 ## 3 2 431 ## 4 3 1197 ## 5 4 2016 ## 6 5 2459 ## 7 6 2066 ## 8 7 1211 ## 9 8 408 ## 10 9 91 ## 11 10 14 We can see here that we have created a count of how many of each of the 10,000 sets of 10 flips resulted in 0, 1, 2, \\(\\ldots\\), up to 10 heads. Note the use of the group_by and summarize functions from Chapter 5 here. In addition, we can plot the distribution of these simulated heads using the ideas from Chapter 4. heads is a quantitative variable. Think about which type of plot is most appropriate here before reading further. We already have an idea as to an appropriate plot by the data summarization that we did in the chunk above. We’d like to see how many heads occurred in the 10,000 sets of 10 flips. In other words, we’d like to see how frequently 9 or more heads occurred in the 10 flips: library(ggplot2) simGuesses %&gt;% ggplot(aes(x = heads)) + geom_histogram(binwidth = 1, color = &quot;white&quot;) Figure 6.3: Histogram of number of heads in simulation - needs tweaking The horizontal axis labels are a little confusing here. What does 2.5 or 7.5 heads mean? In simGuesses, heads is a numerical variable. Thus, ggplot is expecting the values to be on a continuous scale. 
We can switch the scale to be discrete by invoking the factor function and using geom_bar instead of geom_histogram: library(ggplot2) simGuesses %&gt;% ggplot(aes(x = factor(heads))) + geom_bar() Figure 6.4: Barplot of number of heads in simulation You’ll frequently need to make this conversion to factor when making a barplot with quantitative variables. Remember from “Getting Used to R, RStudio, and R Markdown” (Ismay 2016), that a factor variable is useful when there is a natural ordering to the variable and it only takes on discrete values and not fractional values like 2.5. Our heads variable has a natural ordering: 0, 1, 2, \\(\\ldots\\), 10. Again, note that the shape of these number of heads follows what appears to be a normal distribution. We’ll see in a related example that if appropriate conditions/assumptions are met with the data that we can expect to see a normal distribution result. When these conditions aren’t met, the simulation methodology we’ve presented here still works well whereas the traditional normal-based methods start to fall apart. We will delve further into hypothesis testing in the next few chapters. This null distribution in combination with the sampling distribution concept covered earlier will be of utmost importance going forward. 6.4 Review of mosaic simulation functions In this chapter, we’ve discussed three functions in the mosaic package useful in understanding the stepping stones to statistical inference: do, rflip, and resample. We will also work with the shuffle function in later chapters and we summarize it here for your reference later. do: Its main use is in replicating a process many times. It has one argument n which specifies how many times to replicate the process. It then uses *, which can be read as “times”, and whatever follows immediately after it as the process. rflip: This is used to simulate the flipping of a coin many times. By default, it flips a fair coin one time giving an equal chance to heads and tails. 
It is frequently used with do() * to simulate many coin flips in multiple sets. resample: This is used to sample from a larger data set with or without replacement. When we are thinking about the concept of random sampling, we sample without replacement. We can also sample with replacement corresponding to the values being replaced into the pool to draw from with the possibility that they are drawn again in the resample. This will be of great importance when we discuss bootstrapping with confidence intervals. shuffle: Its main purpose is to permute the values of one variable across the values of another variable. This acts in much the same way as shuffling a deck of cards and then presenting the shuffled deck to two (or more) players. Learning check (LC6.16) Recreate rflip using only the resample function and specifying the appropriate arguments. (LC6.17) Recreate shuffle using only the resample function and specifying the appropriate arguments. 6.5 Conclusion 6.5.1 Script of R code An R script file of all R code used in this chapter is available here. 6.5.2 What’s to come? This chapter has served as an introduction into inferential techniques that will be discussed in greater detail in Chapter 7 for hypothesis testing and in Chapter 8 for confidence intervals. In these chapters, we will see how we can use a related concept of resampling when working with the distributions of two groups. All of these concepts will be further reinforced in Chapter 9 as well. References "],
+["7-hypo.html", "7 Hypothesis Testing 7.1 When Inference Is Not Needed 7.2 Basics of Hypothesis Testing 7.3 Criminal trial analogy 7.4 Types of Errors in Hypothesis Testing 7.5 Statistical Significance 7.6 EXAMPLE: Revisiting the Lady Tasting Tea 7.7 EXAMPLE: Comparing two means 7.8 Building theory-based methods using computation 7.9 Conclusion", " 7 Hypothesis Testing We saw some of the main concepts of hypothesis testing introduced in Chapter 6. We will expand further on these ideas here and also provide a framework for understanding hypothesis tests in general. Instead of presenting you with lots of different formulas and scenarios, we hope to build a way to think about all hypothesis tests. You can then adapt to different scenarios as needed down the road when you encounter different statistical situations. The same can be said for confidence intervals. There is one general framework that applies to all confidence intervals and we will elaborate on this further in Chapter 8. The specifics may change slightly for each variation, but the important idea is to understand the general framework so that you can apply it to more specific problems. We believe that this approach is much better in the long-term than teaching you specific tests and confidence intervals rigorously. You can find fully-worked out examples for five common hypothesis tests and their corresponding confidence intervals in Appendix B. We recommend that you carefully review these examples as they also cover how the general frameworks apply to traditional normal-based methodologies like the \\(t\\)-test and normal-theory confidence intervals. You’ll see there that these methods are just approximations for the general computational frameworks, but require conditions to be met for their results to be valid. 
The general frameworks using randomization, simulation, and bootstrapping do not hold the same sorts of restrictions and further advance computational thinking, which is one big reason for their emphasis throughout this textbook. Needed packages library(dplyr) library(ggplot2) library(mosaic) library(knitr) library(nycflights13) 7.1 When Inference Is Not Needed Before we delve into the two techniques of inference (hypothesis testing and confidence intervals), it’s good to remember that there are cases where you need not perform a rigorous statistical inference. An important and time-saving skill is to ALWAYS do exploratory data analysis using dplyr and ggplot2 before thinking about running a hypothesis test. Let’s look at such an example selecting a sample of flights traveling to Boston and to San Francisco from New York City in the flights data frame in the nycflights13 package. (We will remove flights with missing data first using na.omit and then sample 100 flights going to each of the two airports.) library(nycflights13) data(flights) bos_sfo &lt;- flights %&gt;% na.omit() %&gt;% filter(dest %in% c(&quot;BOS&quot;, &quot;SFO&quot;)) %&gt;% group_by(dest) %&gt;% sample_n(100) Suppose we were interested in seeing if the air_time to SFO in San Francisco was statistically greater than the air_time to BOS in Boston. As suggested, let’s begin with some exploratory data analysis to get a sense for how the two variables of air_time and dest relate for these two destination airports: library(dplyr) bos_sfo_summary &lt;- bos_sfo %&gt;% group_by(dest) %&gt;% summarize(mean_time = mean(air_time), sd_time = sd(air_time)) kable(bos_sfo_summary) dest mean_time sd_time BOS 38.35 5.727 SFO 345.61 15.355 Looking at these results, we can clearly see that SFO air_time is much larger than BOS air_time. The standard deviation is also extremely informative here. 
Learning check (LC7.1) Could we make the same type of immediate conclusion that SFO had a statistically greater air_time if, say, its corresponding standard deviation was 200 minutes? What about 100 minutes? Explain. To further understand just how different the air_time variable is for BOS and SFO, let’s look at a boxplot: library(ggplot2) ggplot(data = bos_sfo, mapping = aes(x = dest, y = air_time)) + geom_boxplot() Since there is no overlap at all, we can conclude that the air_time for San Francisco flights is statistically greater (at any level of significance) than the air_time for Boston flights. This is a clear example of not needing to do anything more than some simple descriptive statistics to get an appropriate inferential conclusion. This is one reason why you should ALWAYS investigate the sample data first using dplyr and ggplot2 via exploratory data analysis. As you get more and more practice with hypothesis testing, you’ll be better able to determine in many cases whether or not the results will be statistically significant. There are circumstances where it is difficult to tell, but you should always try to make a guess FIRST about significance after you have completed your data exploration and before you actually begin the inferential techniques. 7.2 Basics of Hypothesis Testing In a hypothesis test, we will use data from a sample to help us decide between two competing hypotheses about a population. We make these hypotheses more concrete by specifying them in terms of at least one population parameter of interest. We refer to the competing claims about the population as the null hypothesis, denoted by \\(H_0\\), and the alternative (or research) hypothesis, denoted by \\(H_a\\). The roles of these two hypotheses are NOT interchangeable. The claim for which we seek significant evidence is assigned to the alternative hypothesis. The alternative is usually what the experimenter or researcher wants to establish or find evidence for. 
Usually, the null hypothesis is a claim that there really is “no effect” or “no difference.” In many cases, the null hypothesis represents the status quo or that nothing interesting is happening. We assess the strength of evidence by assuming the null hypothesis is true and determining how unlikely it would be to see sample results/statistics as extreme (or more extreme) as those in the original sample. Hypothesis testing brings about many weird and incorrect notions in the scientific community and society at large. One reason for this is that statistics has traditionally been thought of as this magic box of algorithms and procedures to get to results and this has been readily apparent if you do a Google search of “flowchart statistics hypothesis tests”. There are so many different complex ways to determine which test is appropriate. You’ll see that we don’t need to rely on these complicated series of assumptions and procedures to conduct a hypothesis test any longer. These methods were introduced in a time when computers weren’t powerful. Your cellphone (in 2016) has more power than the computers that sent NASA astronauts to the moon after all. We’ll see that ALL hypothesis tests can be broken down into the following framework given by Allen Downey here: Figure 7.1: Hypothesis Testing Framework Before we hop into this framework, we will provide another way to think about hypothesis testing that may be useful. 7.3 Criminal trial analogy We can think of hypothesis testing in the same context as a criminal trial in the United States. A criminal trial in the United States is a familiar situation in which a choice between two contradictory claims must be made. The person accused of the crime must be judged either guilty or not guilty. Under the U.S. system of justice, the individual on trial is initially presumed not guilty. Only STRONG EVIDENCE to the contrary causes the not guilty claim to be rejected in favor of a guilty verdict. 
The phrase “beyond a reasonable doubt” is often used to set the cutoff value for when enough evidence has been given to convict. Theoretically, we should never say “The person is innocent.” but instead “There is not sufficient evidence to show that the person is guilty.” Now let’s compare that to how we look at a hypothesis test. The decision about the population parameter(s) must be judged to follow one of two hypotheses. We initially assume that \\(H_0\\) is true. The null hypothesis \\(H_0\\) will be rejected (in favor of \\(H_a\\)) only if the sample evidence strongly suggests that \\(H_0\\) is false. If the sample does not provide such evidence, \\(H_0\\) will not be rejected. The analogy to “beyond a reasonable doubt” in hypothesis testing is what is known as the significance level. This will be set before conducting the hypothesis test and is denoted as \\(\\alpha\\). Common values for \\(\\alpha\\) are 0.1, 0.01, and 0.05. 7.3.1 Two possible conclusions Therefore, we have two possible conclusions with hypothesis testing: Reject \\(H_0\\) Fail to reject \\(H_0\\) Gut instinct says that “Fail to reject \\(H_0\\)” should say “Accept \\(H_0\\)” but this technically is not correct. Accepting \\(H_0\\) is the same as saying that a person is innocent. We cannot show that a person is innocent; we can only say that there was not enough substantial evidence to find the person guilty. When you run a hypothesis test, you are the jury of the trial. You decide whether there is enough evidence to convince yourself that \\(H_a\\) is true (“the person is guilty”) or that there was not enough evidence to convince yourself \\(H_a\\) is true (“the person is not guilty”). You must convince yourself (using statistical arguments) which hypothesis is the correct one given the sample information. Important note: Therefore, DO NOT WRITE “Accept \\(H_0\\)” any time you conduct a hypothesis test. 
Instead write “Fail to reject \\(H_0\\).” 7.4 Types of Errors in Hypothesis Testing Unfortunately, just as a jury or a judge can make an incorrect decision in regards to a criminal trial by reaching the wrong verdict, there is some chance we will reach the wrong conclusion via a hypothesis test about a population parameter. As with criminal trials, this comes from the fact that we don’t have complete information, but rather a sample from which to try to infer about a population. The possible erroneous conclusions in a criminal trial are an innocent person is convicted (found guilty) or a guilty person is set free (found not guilty). The possible errors in a hypothesis test are rejecting \\(H_0\\) when in fact \\(H_0\\) is true (Type I Error) or failing to reject \\(H_0\\) when in fact \\(H_0\\) is false (Type II Error). The risk of error is the price researchers pay for basing an inference about a population on a sample. With any reasonable sample-based procedure, there is some chance that a Type I error will be made and some chance that a Type II error will occur. To help understand the concepts of Type I error and Type II error, observe the following table: Figure 7.2: Type I and Type II errors If we are using sample data to make inferences about a parameter, we run the risk of making a mistake. Obviously, we want to minimize our chance of error; we want a small probability of drawing an incorrect conclusion. The probability of a Type I Error occurring is denoted by \\(\\alpha\\) and is called the significance level of a hypothesis test The probability of a Type II Error is denoted by \\(\\beta\\). Formally, we can define \\(\\alpha\\) and \\(\\beta\\) in regards to the table above, but for hypothesis tests instead of a criminal trial. \\(\\alpha\\) corresponds to the probability of rejecting \\(H_0\\) when, in fact, \\(H_0\\) is true. \\(\\beta\\) corresponds to the probability of failing to reject \\(H_0\\) when, in fact, \\(H_0\\) is false. 
Ideally, we want \\(\\alpha = 0\\) and \\(\\beta = 0\\), meaning that the chance of making an error does not exist. When we have to use incomplete information (sample data), it is not possible to have both \\(\\alpha = 0\\) and \\(\\beta = 0\\). We will always have the possibility of at least one error existing when we use sample data. Usually, what is done is that \\(\\alpha\\) is set before the hypothesis test is conducted and then the evidence is judged against that significance level. Common values for \\(\\alpha\\) are 0.05, 0.01, and 0.10. If \\(\\alpha = 0.05\\), we are using a testing procedure that, used over and over with different samples, rejects a TRUE null hypothesis five percent of the time. So if we can set \\(\\alpha\\) to be whatever we want, why choose 0.05 instead of 0.01 or even better 0.0000000000000001? Well, a small \\(\\alpha\\) means the test procedure requires the evidence against \\(H_0\\) to be very strong before we can reject \\(H_0\\). This means we will almost never reject \\(H_0\\) if \\(\\alpha\\) is very small. If we almost never reject \\(H_0\\), the probability of a Type II Error – failing to reject \\(H_0\\) when we should – will increase! Thus, as \\(\\alpha\\) decreases, \\(\\beta\\) increases and as \\(\\alpha\\) increases, \\(\\beta\\) decreases. We, therefore, need to strike a balance in \\(\\alpha\\) and \\(\\beta\\) and the common values for \\(\\alpha\\) of 0.05, 0.01, and 0.10 usually lead to a nice balance. Learning check (LC7.2) Reproduce the table above about errors, but for a hypothesis test, instead of the one provided for a criminal trial. 7.4.1 Logic of Hypothesis Testing Take a random sample (or samples) from a population (or multiple populations) If the sample data are consistent with the null hypothesis, do not reject the null hypothesis. 
If the sample data are inconsistent with the null hypothesis (in the direction of the alternative hypothesis), reject the null hypothesis and conclude that there is evidence the alternative hypothesis is true (based on the particular sample collected). 7.5 Statistical Significance The idea that sample results are more extreme than we would reasonably expect to see by random chance if the null hypothesis were true is the fundamental idea behind statistical hypothesis tests. If data at least as extreme would be very unlikely if the null hypothesis were true, we say the data are statistically significant. Statistically significant data provide convincing evidence against the null hypothesis in favor of the alternative, and allow us to generalize our sample results to the claim about the population. Learning check (LC7.3) What is wrong about saying “The defendant is innocent.” based on the US system of criminal trials? (LC7.4) What is the purpose of hypothesis testing? (LC7.5) What are some flaws with hypothesis testing? How could we alleviate them? 7.6 EXAMPLE: Revisiting the Lady Tasting Tea Recall the “There is Only One Test” diagram from earlier: Figure 7.3: Hypothesis Testing Framework We will now walk through how each of the steps of the diagram applies to determining whether the lady tasting tea was actually better than chance at determining whether or not milk was added first. We will see that the process of creating a null distribution is a statistical way of quantifying surprise. 7.6.1 Data Let’s assume as we did in Chapter 6 that the lady is correct in determining whether milk was added first or not in 9 out of 10 trials. Our data, therefore, may look something like Correct Correct Correct Incorrect Correct Correct Correct Correct Correct Correct 7.6.2 Test Statistic \\(\\delta\\) We are interested in the number of Correct out of our 10 trials. We can denote this number of successes using the symbol \\(t\\), where \\(t\\) corresponds to total. 
This is our test statistic \\(\\delta\\) in this case. 7.6.3 Observed effect \\(\\delta^*\\) The actual observed value of the test statistic from our observed sample is \\(\\hat{t}_{obs} = 9\\). Thus, \\(\\delta^* = 9\\). 7.6.4 Model of \\(H_0\\) Our null hypothesis is that the lady is only as good as chance at guessing correctly. Hypotheses always correspond to parameters and are denoted with Greek letters. Thus, symbolically, we have \\(H_0: \\tau = 5\\). Since we are assuming chance and we have 10 flips with 0.5 probability of success of each flip, we have \\(\\tau = 10 \\times 0.5 = 5\\). 7.6.5 Simulated Data We now want to use this null hypothesis to simulate the test statistic assuming that the null hypothesis is true. Therefore, we want to figure out a way to simulate 10 trials, getting either the choice Correct or Incorrect, assuming that the probability of success (getting it Correct) in any given trial is 0.5. Tactile simulation When you are presented with a hypothesis testing problem, frequently the most challenging portion is setting up how to simulate the data assuming the null hypothesis is true. To facilitate this, setting up a tactile, hands-on experiment can help. In this case, flipping a fair coin is a great way to simulate this process. This simulates how the sample could be collected assuming the null hypothesis is true. To simulate 10 trials, we could flip the fair coin and record Heads as Correct and Tails as Incorrect. Some simulated data using this coin flipping procedure may look like the following. Note that this data frame is not tidy, but is a convenient way to look at the results of the simulation in this wide format. The numbers on the far left correspond to the number of the trial. 
Table 7.1: A table of three sets of 10 coin flips sample1 sample2 sample3 1 Correct Correct Correct 2 Correct Incorrect Incorrect 3 Incorrect Incorrect Correct 4 Incorrect Incorrect Correct 5 Correct Incorrect Incorrect 6 Correct Incorrect Correct 7 Incorrect Incorrect Correct 8 Incorrect Correct Incorrect 9 Incorrect Correct Incorrect 10 Incorrect Correct Incorrect We then use the formula for the Test Statistic to determine the simulated test statistic for each of these simulated samples. So in this case we have \\(t_1 = 4\\), \\(t_2 = 4\\), \\(t_3 = 5\\) 7.6.6 Distribution of \\(\\delta\\) under \\(H_0\\) We could continue this process, say, 10,000 times by flipping a coin in sets of 10 for 10,000 repetitions and counting and taking note of how many heads out of 10 we have for each set. It’s at this point that you surely realize that a computer can do this procedure much faster and more efficient than the tactile experiment with a coin. Recall that we’ve already created the distribution of 10,000 such coin flips and we’ve stored these values in the heads variable in the simGuesses data frame: library(ggplot2) ggplot(data = simGuesses, aes(x = factor(heads))) + geom_bar() 7.6.7 The p-value Definition: \\(p\\)-value: The p-value is the probability of observing a sample statistic as extreme or more extreme than what was observed, assuming that the null hypothesis of a by chance operation is true. This definition may be a little intimidating the first time you read it, but it’s important to come back to this “The Lady Tasting Tea” problem whenever you encounter \\(p\\)-values as you begin to learn about the concept. Here the \\(p\\)-value corresponds to how many times in our null distribution of heads 9 or more heads occurred. We can use another neat feature of R to calculate the \\(p\\)-value for this problem. 
Note that “more extreme” in this case corresponds to looking at values of 9 or greater since our alternative hypothesis invokes a right-tail test corresponding to a “greater than” hypothesis of \\(H_a: \\tau &gt; 5\\). In other words, we are looking to see how likely it is for the lady to pick 9 or more correct instead of 9 or less correct. We’d like to go in the right direction. pvalue_tea &lt;- simGuesses %&gt;% filter(heads &gt;= 9) %&gt;% nrow() / nrow(simGuesses) Let’s walk through each step of this calculation: First, pvalue_tea will be the name of our calculated \\(p\\)-value and the assignment operator &lt;- directs us to this naming. We are working with the simGuesses data frame here so that comes immediately before the pipe operator. We would like to only focus on the rows in our simGuesses data frame that have heads values of 9 or 10. This represents simulated statistics “as extreme or more extreme” than what we observed (9 correct guesses out of 10). To get a glimpse of what we have up to this point, run simGuesses %&gt;% filter(heads &gt;= 9) %&gt;% View(). Now that we have changed the focus to only those rows that have number of heads out of 10 flips corresponding to 9 or more, we count how many of those there are. The function nrow gives how many entries are in this filtered data frame and lastly we calculate the proportion that are at least as extreme as our observed value of 9 by dividing by the number of total simulations (10,000). We can see that the observed statistic of 9 correct guesses is not a likely outcome assuming the null hypothesis is true. Only around 1% of the outcomes in our 10,000 simulations fall at or above 9 successes. We have evidence supporting the conclusion that the person is actually better than just guessing at random at determining whether milk has been added first or not. 
To better visualize this we can also make use of blue shading on the histogram corresponding to the \\(p\\)-value: library(ggplot2) ggplot(data = simGuesses, aes(x = factor(heads), fill = (heads &gt;= 9))) + geom_bar() + labs(x = &quot;heads&quot;) Figure 7.4: Barplot of heads with p-value highlighted This helps us better see just how few of the values of heads are at our observed value or more extreme. This idea of a \\(p\\)-value can be extended to the more traditional methods using normal and \\(t\\) distributions in the traditional way that introductory statistics has been presented. These traditional methods were used because statisticians haven’t always been able to do 10,000 simulations on the computer within seconds. We’ll elaborate on this more in a few sections. Learning check (LC7.6) How could we make Table 7.1 into a tidy data frame? (LC7.7) What is meant by “pseudo-random number generation?” (LC7.8) How can simulation be used to help us address the question of whether or not an observed result is statistically significant? (LC7.9) In Chapter 4, we noted that barplots should be used when creating a plot of categorical variables. Why are we using barplots to make a plot of a numerical variable heads in this chapter? 7.7 EXAMPLE: Comparing two means 7.7.1 Randomization/Permutation We will now focus on building hypotheses looking at the difference between two population means in an example. We will denote population means using the Greek symbol \\(\\mu\\) (pronounced “mu”). Thus, we will be looking to see if one group “out-performs” another group. This is quite possibly the most common type of statistical inference and serves as a basis for many other types of analyses when comparing the relationship between two variables. Our null hypothesis will be of the form \\(H_0: \\mu_1 = \\mu_2\\), which can also be written as \\(H_0: \\mu_1 - \\mu_2 = 0\\). 
Our alternative hypothesis will be of the form \\(H_a: \\mu_1 \\star \\mu_2\\) (or \\(H_a: \\mu_1 - \\mu_2 \\, \\star \\, 0\\)) where \\(\\star\\) = \\(&lt;\\), \\(\\ne\\), or \\(&gt;\\) depending on the context of the problem. You needn’t focus on these new symbols too much at this point. It will just be a shortcut way for us to describe our hypotheses. As we saw earlier, simulation is a valuable tool when conducting inferences based on one population variable. We will see that the process of randomization (also known as permutation) will be valuable in conducting tests comparing quantitative values from two groups. 7.7.2 Comparing Action and Romance Movies The movies data set in the ggplot2movies package contains information on a large number of movies that have been rated by users of IMDB.com (Wickham 2015). We are interested in the question here of whether Action movies are rated higher on IMDB than Romance movies. We will first need to do a little bit of data manipulation using the ideas from Chapter 5 to get the data in the form that we would like: library(dplyr) library(ggplot2movies) (movies_trimmed &lt;- movies %&gt;% select(title, year, rating, Action, Romance)) ## # A tibble: 58,788 × 5 ## title year rating Action Romance ## &lt;chr&gt; &lt;int&gt; &lt;dbl&gt; &lt;int&gt; &lt;int&gt; ## 1 $ 1971 6.4 0 0 ## 2 $1000 a Touchdown 1939 6.0 0 0 ## 3 $21 a Day Once a Month 1941 8.2 0 0 ## 4 $40,000 1996 8.2 0 0 ## 5 $50,000 Climax Show, The 1975 3.4 0 0 ## 6 $pent 2000 4.3 0 0 ## 7 $windle 2002 5.3 1 0 ## 8 &#39;15&#39; 2002 6.7 0 0 ## 9 &#39;38 1987 6.6 0 0 ## 10 &#39;49-&#39;17 1917 6.0 0 0 ## # ... with 58,778 more rows Note that Action and Romance are binary variables here. 
To remove any overlap of movies (and potential confusion) that are both Action and Romance, we will remove them from our population: movies_trimmed &lt;- movies_trimmed %&gt;% filter(!(Action == 1 &amp; Romance == 1)) We will now create a new variable called genre that specifies whether a movie in our movies_trimmed data frame is an &quot;Action&quot; movie, a &quot;Romance&quot; movie, or &quot;Neither&quot;. We aren’t really interested in the &quot;Neither&quot; category here so we will exclude those rows as well. Lastly, the Action and Romance columns are not needed anymore since they are encoded in the genre column. movies_trimmed &lt;- movies_trimmed %&gt;% mutate(genre = ifelse(Action == 1, &quot;Action&quot;, ifelse(Romance == 1, &quot;Romance&quot;, &quot;Neither&quot;))) %&gt;% filter(genre != &quot;Neither&quot;) %&gt;% select(-Action, -Romance) We are left with 8878 movies in our population data set that focuses on only &quot;Action&quot; and &quot;Romance&quot; movies. Learning check (LC7.10) Why are the different genre variables stored as binary variables (1s and 0s) instead of just listing the genre as a column of values like “Action”, “Comedy”, etc.? (LC7.11) What complications could come about with us excluding action romance movies? Should we question the results of our hypothesis test? Explain. Let’s now visualize the distributions of rating across both levels of genre. Think about what type(s) of plot is/are appropriate here before you proceed: library(ggplot2) ggplot(data = movies_trimmed, aes(x = genre, y = rating)) + geom_boxplot() Figure 7.5: Rating vs genre in the population We can see that the middle 50% of ratings for &quot;Action&quot; movies is more spread out than that of &quot;Romance&quot; movies in the population. &quot;Romance&quot; has outliers at both the top and bottom of the scale though. 
We are initially interested in comparing the mean rating across these two groups so a faceted histogram may also be useful: ggplot(data = movies_trimmed, mapping = aes(x = rating)) + geom_histogram(binwidth = 1, color = &quot;white&quot;, fill = &quot;dodgerblue&quot;) + facet_grid(genre ~ .) Figure 7.6: Faceted histogram of genre vs rating Important note: Remember that we hardly ever have access to the population values as we do here. This example and the nycflights13 data set were used to create a common flow from chapter to chapter. In nearly all circumstances, we’ll be needing to use only a sample of the population to try to infer conclusions about the unknown population parameter values. These examples do show a nice relationship between statistics (where data is usually small and more focused on experimental settings) and data science (where data is frequently large and collected without experimental conditions). 7.7.3 Sampling \\(\\rightarrow\\) Randomization We can use hypothesis testing to investigate ways to determine, for example, whether a treatment has an effect over a control and other ways to statistically analyze if one group performs better than, worse than, or different than another. We will also use confidence intervals to determine the size of the effect, if it exists. You’ll see more on this in Chapter 8. We are interested here in seeing how we can use a random sample of action movies and a random sample of romance movies from movies to determine if a statistical difference exists in the mean ratings of each group. Learning check (LC7.12) Define the relevant parameters here in terms of the populations of movies. 7.7.4 Data Let’s select a random sample of 34 action movies and a random sample of 34 romance movies. (The number 34 was chosen somewhat arbitrarily here.) 
library(dplyr) library(mosaic) set.seed(2016) movies_genre_sample &lt;- movies_trimmed %&gt;% group_by(genre) %&gt;% sample_n(34) We can now observe the distributions of our two sample ratings for both groups. Remember that these plots should be rough approximations of our population distributions of movie ratings for &quot;Action&quot; and &quot;Romance&quot; in our population of all movies in the movies data frame. ggplot(data = movies_genre_sample, aes(x = genre, y = rating)) + geom_boxplot() Figure 7.7: Genre vs rating for our sample ggplot(data = movies_genre_sample, mapping = aes(x = rating)) + geom_histogram(binwidth = 1, color = &quot;white&quot;, fill = &quot;dodgerblue&quot;) + facet_grid(genre ~ .) Figure 7.8: Genre vs rating for our sample as faceted histogram Learning check (LC7.13) What single value could we change to improve the approximation using the sample distribution on the population distribution? Do we have reason to believe, based on the sample distributions of rating over the two groups of genre, that there is a significant difference between the mean rating for action movies compared to romance movies? It’s hard to say just based on the plots. The boxplot does show that the median sample rating is higher for romance movies, but the histogram isn’t as clear. The two groups have somewhat differently shaped distributions but they are both over similar values of rating. It’s often useful to calculate the mean and standard deviation as well, conditioned on the two levels. summary_ratings &lt;- movies_genre_sample %&gt;% group_by(genre) %&gt;% summarize(mean = mean(rating), std_dev = sd(rating), n = n()) summary_ratings %&gt;% kable() genre mean std_dev n Action 5.197 1.465 34 Romance 6.027 1.202 34 Learning check (LC7.14) Why did we not specify na.rm = TRUE here as we did in Chapter 5? We see that the sample mean rating for romance movies, \\(\\bar{x}_{r}\\), is greater than the similar measure for action movies, \\(\\bar{x}_a\\). 
But is it statistically significantly greater (thus, leading us to conclude that the means are statistically different)? The standard deviation can provide some insight here but with these standard deviations being so similar it’s still hard to say for sure. Learning check (LC7.15) Why might the standard deviation provide some insight about the means being statistically different or not? 7.7.5 Model of \\(H_0\\) The hypotheses we specified can also be written in another form to better give us an idea of what we will be simulating to create our null distribution. \\(H_0: \\mu_r - \\mu_a = 0\\) \\(H_a: \\mu_r - \\mu_a \\ne 0\\) 7.7.6 Test Statistic \\(\\delta\\) We are, therefore, interested in seeing whether the difference in the sample means, \\(\\bar{x}_r - \\bar{x}_a\\), is statistically different than 0. R has a built-in command that can calculate the difference in these two sample means. 7.7.7 Observed effect \\(\\delta^*\\) mean_ratings &lt;- movies_genre_sample %&gt;% group_by(genre) %&gt;% summarize(mean = mean(rating)) obs_diff &lt;- diff(mean_ratings$mean) We see here that the diff function calculates \\(\\bar{x}_r - \\bar{x}_a = 6.0265 - 5.1971 = 0.8294\\). We will now proceed similarly to how we conducted the hypothesis test above for the Lady Tasting Tea using simulation. Our goal is to figure out a random process with which to simulate the null hypothesis being true. Earlier in this chapter, we used flipping of a fair coin as the random process we were simulating with the null hypothesis being true (\\(H_0: \\tau = 5\\)). 7.7.8 Simulated Data Tactile simulation Here, with us assuming the two population means are equal (\\(H_0: \\mu_r - \\mu_a = 0\\)), we can look at this from a tactile point of view by using index cards. There are \\(n_r = 34\\) data elements corresponding to romance movies and \\(n_a = 34\\) for action movies. 
We can write the 34 ratings from our sample for romance movies on one set of 34 index cards and the 34 ratings for action movies on another set of 34 index cards. (Note that the sample sizes need not be the same.) The next step is to put the two stacks of index cards together, creating a new set of 68 cards. If we assume that the two population means are equal, we are saying that there is no association between ratings and genre (romance vs action). We can use the index cards to create two new stacks for romance and action movies. First, we must shuffle all the cards thoroughly. After doing so, in this case with equal values of sample sizes, we split the deck in half. We then calculate the new sample mean rating of the romance deck, and also the new sample mean rating of the action deck. This creates one simulation of the samples that were collected originally. We next want to calculate a statistic from these two samples. Instead of actually doing the calculation using index cards, we can use R as we have before to simulate this process. library(mosaic) shuffled_ratings &lt;- movies_trimmed %&gt;% mutate(rating = shuffle(rating)) %&gt;% group_by(genre) %&gt;% summarize(mean = mean(rating)) diff(shuffled_ratings$mean) ## [1] -0.02288 Learning check (LC7.16) How would the tactile shuffling of index cards change if we had different samples of say 20 action movies and 60 romance movies? Describe each step that would change. (LC7.17) Why are we taking the difference in the means of the cards in the new shuffled decks? 7.7.9 Distribution of \\(\\delta\\) under \\(H_0\\) The only new command here is shuffle from the mosaic package, which does what we would expect it to do. It simulates a shuffling of the ratings between the two levels of genre just as we could have done with index cards. 
We can now proceed in a similar way to what we have done previously with the Lady Tasting Tea example by repeating this process many times to create a null distribution of simulated differences in sample means. set.seed(2016) many_shuffles &lt;- do(10000) * (movies_trimmed %&gt;% mutate(rating = shuffle(rating)) %&gt;% group_by(genre) %&gt;% summarize(mean = mean(rating)) ) It is a good idea here to View the many_shuffles data frame via View(many_shuffles). We need to figure out a way to subtract the first value of mean from the second value of mean for each of the 10,000 simulations. This is a little tricky but the group_by function comes to our rescue here: rand_distn &lt;- many_shuffles %&gt;% group_by(.index) %&gt;% summarize(diffmean = diff(mean)) We can now plot the distribution of these simulated differences in means: ggplot(data = rand_distn, aes(x = diffmean)) + geom_histogram(color = &quot;white&quot;, bins = 20) Figure 7.9: Simulated differences in means histogram 7.7.10 The p-value Remember that we are interested in seeing where our observed sample mean difference of 0.8294 falls on this null/randomization distribution. We are interested in simply a difference here so “more extreme” corresponds to values in both tails on the distribution. Let’s shade our null distribution to show a visual representation of our \\(p\\)-value: ggplot(data = rand_distn, aes(x = diffmean, fill = (abs(diffmean) &gt;= obs_diff))) + geom_histogram(color = &quot;white&quot;, bins = 20) Figure 7.10: Shaded histogram to show p-value You may initially think there is an error here, but remember that the observed difference in means was 0.8294. It falls far outside the range of simulated differences. 
We can add a vertical line to represent both it and its negative (since this is a two-tailed test) instead: ggplot(data = rand_distn, aes(x = diffmean)) + geom_histogram(color = &quot;white&quot;, bins = 100) + geom_vline(xintercept = obs_diff, color = &quot;red&quot;) + geom_vline(xintercept = -obs_diff, color = &quot;red&quot;) Figure 7.11: Histogram with vertical lines corresponding to observed statistic Based on this plot, we have no values as extreme or more extreme than our observed effect in both directions so we have evidence supporting the conclusion that the mean rating for romance movies is different from that of action movies. (It doesn’t really matter what significance level was chosen in this case. Think about why.) The next important idea is to better understand just how much higher of a mean rating can we expect the romance movies to have compared to that of action movies. This can be addressed by creating a 95% confidence interval as we will explore in Chapter 8. Learning check (LC7.18) Conduct the same analysis comparing action movies versus romantic movies using the median rating instead of the mean rating? Make sure to use the %&gt;% as much as possible. What was different and what was the same? (LC7.19) What conclusions can you make from viewing the faceted histogram looking at rating versus genre that you couldn’t see when looking at the boxplot? (LC7.20) Describe in a paragraph how we used Allen Downey’s diagram to conclude if a statistical difference existed between mean movie ratings for action and romance movies. (LC7.21) Why are we relatively confident that the distributions of the sample ratings will be good approximations of the population distributions of ratings for the two genres? (LC7.22) Using the definition of “\\(p\\)-value”, write in words what the \\(p\\)-value represents for the hypothesis test above comparing the mean rating of romance to action movies. 
(LC7.23) What is the value of the \\(p\\)-value for the hypothesis test comparing the mean rating of romance to action movies? (LC7.24) Do the results of the hypothesis test match up with the original plots we made looking at the population of movies? Why or why not? 7.7.11 Summary To review, these are the steps one would take whenever you’d like to do a hypothesis test comparing values from the distributions of two groups: Simulate many samples using a random process that matches the way the original data were collected and that assumes the null hypothesis is true. Collect the values of a sample statistic for each sample created using this random process to build a randomization distribution. Assess the significance of the original sample by determining where its sample statistic lies in the randomization distribution. If the proportion of values as extreme or more extreme than the observed statistic in the randomization distribution is smaller than the pre-determined significance level \\(\\alpha\\), we reject \\(H_0\\). Otherwise, we fail to reject \\(H_0\\). (If no significance level is given, one can assume \\(\\alpha = 0.05\\).) 7.8 Building theory-based methods using computation As a point of reference, we will now discuss the traditional theory-based way to conduct the hypothesis test for determining if there is a statistically significant difference in the sample mean rating of Action movies versus Romance movies. This method and ones like it work very well when the assumptions are met in order to run the test. They are based on probability models and distributions such as the normal and \\(t\\)-distributions. These traditional methods have been used for many decades back to the time when researchers didn’t have access to computers that could run 10,000 simulations in under a minute. They had to base their methods on probability theory instead. 
Many fields and researchers continue to use these methods and that is the biggest reason for their inclusion here. It’s important to remember that a \\(t\\)-test or a \\(z\\)-test is really just an approximation of what you have seen in this chapter already using simulation and randomization. The focus here is on understanding how the shape of the \\(t\\)-curve comes about without digging deep into the mathematical underpinnings. 7.8.1 EXAMPLE: \\(t\\)-test for two independent samples What is commonly done in statistics is the process of normalization. What this entails is calculating the mean and standard deviation of a variable. Then you subtract the mean from each value of your variable and divide by the standard deviation. The most common normalization is known as the \\(z\\)-score. The formula for a \\(z\\)-score is \\[Z = \\frac{x - \\mu}{\\sigma},\\] where \\(x\\) represents the value of a variable, \\(\\mu\\) represents the mean of the variable, and \\(\\sigma\\) represents the standard deviation of the variable. Thus, if your variable has 10 elements, each one has a corresponding \\(z\\)-score that gives how many standard deviations away that value is from its mean. \\(z\\)-scores are normally distributed with mean 0 and standard deviation 1. They have the common, bell-shaped pattern seen below. Recall that we hardly ever know the mean and standard deviation of the population of interest. This is almost always the case when considering the means of two independent groups. To help account for us not knowing the population parameter values, we can use the sample statistics instead, but this comes with a bit of a price in terms of complexity. Another form of normalization occurs when we need to use the sample standard deviations as estimates for the unknown population standard deviations. This normalization is often called the \\(t\\)-score. 
For the two independent samples case like what we have for comparing action movies to romance movies, the formula is \\[T =\\dfrac{ (\\bar{x}_1 - \\bar{x}_2) - (\\mu_1 - \\mu_2)}{ \\sqrt{\\dfrac{{s_1}^2}{n_1} + \\dfrac{{s_2}^2}{n_2}} }\\] There is a lot to try to unpack here. \\(\\bar{x}_1\\) is the sample mean response of the first group \\(\\bar{x}_2\\) is the sample mean response of the second group \\(\\mu_1\\) is the population mean response of the first group \\(\\mu_2\\) is the population mean response of the second group \\(s_1\\) is the sample standard deviation of the response of the first group \\(s_2\\) is the sample standard deviation of the response of the second group \\(n_1\\) is the sample size of the first group \\(n_2\\) is the sample size of the second group Assuming that the null hypothesis is true (\\(H_0: \\mu_1 - \\mu_2 = 0\\)), \\(T\\) is said to be distributed following a \\(t\\) distribution with degrees of freedom equal to the smaller value of \\(n_1 - 1\\) and \\(n_2 - 1\\). The “degrees of freedom” can be thought of as measuring how different the \\(t\\) distribution will be as compared to a normal distribution. Small sample sizes lead to small degrees of freedom and, thus, \\(t\\) distributions that have more values in the tails of their distributions. Large sample sizes lead to large degrees of freedom and, thus, \\(t\\) distributions that closely align with the standard normal, bell-shaped curve. So, assuming \\(H_0\\) is true, our formula simplifies a bit: \\[T =\\dfrac{ \\bar{x}_1 - \\bar{x}_2}{ \\sqrt{\\dfrac{{s_1}^2}{n_1} + \\dfrac{{s_2}^2}{n_2}} }.\\] We have already built an approximation for what we think the distribution of \\(\\delta = \\bar{x}_1 - \\bar{x}_2\\) looks like using randomization above. 
Recall this distribution: ggplot(data = rand_distn, aes(x = diffmean)) + geom_histogram(color = &quot;white&quot;, bins = 20) Figure 7.12: Simulated differences in means histogram If we’d like to have a guess as to what the distribution of \\(T\\) might look like instead, we need only to divide every value in rand_distn by \\[\\sqrt{\\dfrac{{s_1}^2}{n_1} + \\dfrac{{s_2}^2}{n_2}}.\\] As we did before, we will assign Romance to be group 1 and Action to be group 2. (This was done since Romance comes second alphabetically and the reason why we have a number mismatch below with 1 and 2.) Remember that we’ve already calculated these values: kable(summary_ratings) genre mean std_dev n Action 5.197 1.465 34 Romance 6.027 1.202 34 We will create some shortcuts here so you can see the value being calculated for the denominator of \\(T\\). s1 &lt;- summary_ratings$std_dev[2] s2 &lt;- summary_ratings$std_dev[1] n1 &lt;- summary_ratings$n[2] n2 &lt;- summary_ratings$n[1] Here, we have \\(s_1 = 1.2021\\), \\(s_2 = 1.4648\\), \\(n_1 = 34\\), and \\(n_2 = 34\\). We can calculate the denominator via (denom_T &lt;- sqrt( (s1^2 / n1) + (s2^2 / n2) )) ## [1] 0.325 Now if we divide all of the values of diffmean in rand_distn by denom_T we can have a simulated distribution of \\(T\\) test statistics instead: rand_distn &lt;- rand_distn %&gt;% mutate(t_stat = diffmean / denom_T) ggplot(data = rand_distn, aes(x = t_stat)) + geom_histogram(color = &quot;white&quot;, bins = 20) Figure 7.13: Simulated T statistics histogram We see that the shape of this distribution is the same as that of diffmean. The scale has changed though with t_stat having more spread than diffmean, since we divided by a value smaller than 1. A traditional \\(t\\)-test doesn’t look at this simulated distribution, but instead it looks at the \\(t\\)-curve with degrees of freedom equal to 33 (the minimum of \\(n_1 = 34 - 1 = 33\\) and \\(n_2 = 34 - 1 = 33\\)). 
This curve is frequently called a density curve and this is the reason why we specify the use of y = ..density.. here in the geom_histogram. We now overlay what this \\(t\\)-curve looks like on top of the histogram showing the simulated \\(T\\) statistics. ggplot(data = rand_distn, mapping = aes(x = t_stat)) + geom_histogram(aes(y = ..density..), color = &quot;white&quot;, binwidth = 0.3) + stat_function(fun = dt, args = list(df = min(n1 - 1, n2 - 1)), color = &quot;royalblue&quot;, size = 2) We can see that the curve does a good job of approximating the randomization distribution here. (More on when to expect for this to be the case when we discuss conditions for the \\(t\\)-test in a bit.) To calculate the \\(p\\)-value in this case, we need to figure out how much of the total area under the \\(t\\)-curve is at our observed \\(T\\)-statistic or more, plus also adding the area under the curve at the negative value of the observed \\(T\\)-statistic or below. (Remember this is a two-tailed test so we are looking for a difference–values in the tails of either direction.) Just as we converted all of the simulated values to \\(T\\)-statistics, we must also do so for our observed effect \\(\\delta^*\\): (t_obs &lt;- obs_diff / denom_T) ## [1] 2.552 So graphically we are interested in finding the percentage of values that are at or above 2.5522 or at or below -2.5522. ggplot(data = rand_distn, mapping = aes(x = t_stat)) + stat_function(fun = dt, args = list(df = min(n1 - 1, n2 - 1)), color = &quot;royalblue&quot;, size = 2) + geom_vline(xintercept = t_obs, color = &quot;red&quot;) + geom_vline(xintercept = -t_obs, color = &quot;red&quot;) At this point, you should make a guess as to what a reasonable value may be for the \\(p\\)-value. Let’s say the \\(p\\)-value is 0.01 or so. To actually perform this calculation by hand, you’d need to do some calculus. Let’s have R do it for us instead using the pt function. 
pt(t_obs, df = min(n1 - 1, n2 - 1), lower.tail = FALSE) + pt(-t_obs, df = min(n1 - 1, n2 - 1), lower.tail = TRUE) ## [1] 0.01552 7.8.2 Conditions for t-test In order for the results of the \\(t\\)-test to be valid, three conditions must be met: Independent observations in both samples Nearly normal populations OR large sample sizes (\\(n \\ge 30\\)) Independently selected samples Condition 1: This is met since we sampled at random using R from our population. Condition 2: Recall from Figure 7.6, that we know how the populations are distributed. Both of them are close to normally distributed. If we are a little concerned about this assumption, we also do have samples of size larger than 30 (\\(n_1 = n_2 = 34\\)). Condition 3: This is met since there is no natural pairing of a movie in the Action group to a movie in the Romance group. Since all three conditions are met, we can be reasonably certain that the theory-based test will match the results of the randomization-based test using shuffling. Remember that theory-based tests can produce some incorrect results if these assumptions are not carefully checked. The only assumption for randomization and computational-based methods is that the sample is selected at random. They are our preference and we strongly believe they should be yours as well, but it’s important to also see how the theory-based tests can be done and used as an approximation for the computational techniques until at least more researchers are using these techniques that utilize the power of computers. 7.9 Conclusion 7.9.1 Script of R code An R script file of all R code used in this chapter is available here. 7.9.2 What’s to come? This chapter examined the basics of hypothesis testing with terminology and also an example of how to apply the “There is Only One Test” diagram to the Lady Tasting Tea example presented in Chapter 6 and to an example on comparing the IMDB ratings of action movies and romance movies. 
We’ll see in Chapter 8 how we can provide a range of possible values for an unknown population parameter instead of just running a Yes/No decision from a hypothesis test. We will see in Chapter 9 many of the same ideas we have seen with hypothesis testing and confidence intervals in the last two chapters. Regression is frequently associated both correctly and incorrectly with statistics and data analysis, so you’ll need to make sure you understand when it is appropriate and when it is not. References "],
+["8-ci.html", "8 Confidence Intervals 8.1 Bootstrapping 8.2 Relation to hypothesis testing 8.3 Effect size 8.4 Conclusion", " 8 Confidence Intervals Definition: Confidence Interval A confidence interval gives a range of plausible values for a parameter. It depends on a specified confidence level with higher confidence levels corresponding to wider confidence intervals and lower confidence levels corresponding to narrower confidence intervals. Common confidence levels include 90%, 95%, and 99%. Usually we don’t just begin chapters with a definition, but confidence intervals are simple to define and play an important role in the sciences and any field that uses data. You can think of a confidence interval as playing the role of a net when fishing. Instead of just trying to catch a fish with a single spear (estimating an unknown parameter by using a single point estimate/statistic), we can use a net to try to provide a range of possible locations for the fish (use a range of possible values based around our statistic to make a plausible guess as to the location of the parameter). Needed packages library(dplyr) library(ggplot2) library(mosaic) library(knitr) 8.1 Bootstrapping Just as we did in Chapter 7 with the Lady Tasting Tea when making hypotheses about a population total with which we would like to test which one is more plausible, we can also use computation to infer conclusions about a population quantitative statistic such as the mean. In this case, we will focus on constructing confidence intervals to produce plausible values for a population mean. (We can do a similar analysis for a population median or other summary measure as well.) Traditionally, the way to construct confidence intervals for a mean is to assume a normal distribution for the population or to invoke the Central Limit Theorem and get, what often appears to be magic, results. (This is similar to what was done in Section 7.8.) 
These methods are often not intuitive, especially for those that lack a strong mathematical background. They also come with their fair share of assumptions and often turn Statistics, a field that is full of tons of useful applications to many different fields and disciplines, into a robotic procedural-based topic. It doesn’t have to be that way! In this section, we will introduce the concept of bootstrapping. It will be a useful tool that will allow us to estimate the variability of our statistic from sample to sample. One neat feature of bootstrapping is that it enables us to approximate the sampling distribution and estimate the distribution’s standard deviation using ONLY the information in the one selected (original) sample. It sounds just as plagued with the magical type qualities of traditional theory-based inference on initial glance but we will see that it provides an intuitive and useful way to make inferences, especially when the samples are of medium to large size. To introduce the concept of bootstrapping, we again will use the movies data set in the ggplot2movies package. Remember that we load this data frame into R in much the same way as we loaded flights and weather from the nycflights13 package. library(ggplot2movies) data(movies, package = &quot;ggplot2movies&quot;) Recall that you can also glance at this data frame using the View function and look at the help documentation for movies using the ? function. We will explore many other features of this data set in the chapters to come, but here we will be focusing on the rating variable corresponding to the average IMDB user rating. You may notice that this data set is quite large: 58,788 movies have data collected about them here. This will correspond to our population of ALL movies. Remember from Chapter 6 that our population is rarely known. We use this data set as our population here to show you the power of bootstrapping in estimating population parameters. 
We’ll see how confidence intervals built using the bootstrap distribution perform at including our population parameter of interest. Here we can actually calculate these values since our population is known, but remember that in general this isn’t the case. Let’s take a look at what the distribution of our population ratings looks like. We’ll see that we will use the distribution of our sample(s) as an estimate of this population histogram. movies %&gt;% ggplot(aes(x = rating)) + geom_histogram(color = &quot;white&quot;, bins = 20) Figure 8.1: Population ratings histogram Learning check (LC8.1) Why was a histogram chosen as the plot to make for the rating variable above? (LC8.2) What does the shape of the rating histogram tell us about how IMDB users rate movies? What stands out about the plot? It’s important to think about what our goal is here. We would like to produce a confidence interval for the population mean rating. We will have to pretend for a moment that we don’t have all 58,788 movies. Let’s say that we only have a random sample of 50 movies from this data set instead. In order to get a random sample, we can use the resample function in the mosaic package with replace = FALSE. We could also use the sample_n function from dplyr. set.seed(2017) library(mosaic) library(dplyr) movies_sample &lt;- movies %&gt;% resample(size = 50, replace = FALSE) The resample function has filtered the data frame movies “at random” to choose only 50 rows from the larger movies data frame. We store information on these 50 movies in the movies_sample data frame. Let’s now explore what the rating variable looks like for these 50 movies: movies_sample %&gt;% ggplot(aes(x = rating)) + geom_histogram(color = &quot;white&quot;, bins = 20) Figure 8.2: Sample ratings histogram Remember that we can think of this histogram as an estimate of our population distribution histogram that we saw above. 
We are interested in the population mean rating and trying to find a range of plausible values for that value. A good start in guessing the population mean is to use the mean of our sample rating from the movies_sample data: (movies_sample_mean &lt;- movies_sample %&gt;% summarize(mean = mean(rating))) ## # A tibble: 1 × 1 ## mean ## &lt;dbl&gt; ## 1 5.894 Note the use of the ( ) at the beginning and the end of this creation of the movies_sample_mean object. If you’d like to print out your newly created object, you can enclose it in the parentheses as we have here. This value of 5.894 is just one guess at the population mean. The idea behind bootstrapping is to sample with replacement from the original sample to create new resamples of the same size as our original sample. Returning to our example, let’s investigate what one such resample of the movies_sample data set accomplishes. We can create one resample/bootstrap sample by using the resample function in the mosaic package. boot1 &lt;- resample(movies_sample) %&gt;% arrange(orig.id) The important thing to note here is the original row numbers from the movies_sample data frame in the far right column called orig.id. Since we are sampling with replacement, there is a strong likelihood that some of the 50 observational units are going to be selected again. You may be asking yourself what does this mean and how does this lead us to creating a distribution for the sample mean. Recall that the original sample mean of our data was calculated using the summarize function above. Learning check (LC8.3) What happens if we change the seed to our pseudo-random generation? Try it above when we used resample to describe the resulting movies_sample. (LC8.4) Why is sampling at random important from the movies data frame? Why don’t we just pick Action movies and do bootstrapping with this Action movies subset? (LC8.5) What was the purpose of assuming we didn’t have access to the full movies data set here? 
Before we had a calculated mean in our original sample of 5.894. Let’s calculate the mean of ratings in our bootstrapped sample: (movies_boot1_mean &lt;- boot1 %&gt;% summarize(mean = mean(rating))) ## # A tibble: 1 × 1 ## mean ## &lt;dbl&gt; ## 1 5.686 More than likely the calculated bootstrap sample mean is different than the original sample mean. This is what was meant earlier by the sample means having some variability. What we are trying to do is replicate many different samples being taken from a larger population. Our best guess at what the population looks like is multiple copies of the sample we collected. We then can sample from that larger “created” population by generating bootstrap samples. Similar to what we did in the previous section, we can repeat this process using the do function followed by an asterisk. Let’s look at 10 different bootstrap means for ratings from movies_sample. Note the use of the resample function here. do(10) * (resample(movies_sample) %&gt;% summarize(mean = mean(rating))) ## mean ## 1 5.942 ## 2 5.572 ## 3 5.828 ## 4 6.292 ## 5 6.032 ## 6 5.920 ## 7 5.996 ## 8 5.852 ## 9 6.098 ## 10 5.608 You should see some variability begin to tease its way out here. Many of the simulated means will be close to our original sample mean but many will stray pretty far away. This occurs because outliers may have been selected a couple of times in the resampling or small values were selected more than larger. There are myriad reasons why this might be the case. So what’s the next step now? Just as we repeated the repetitions thousands of times with the “Lady Tasting Tea” example, we can do a similar thing here: trials &lt;- do(10000) * summarize(resample(movies_sample), mean = mean(rating)) ggplot(data = trials, mapping = aes(x = mean)) + geom_histogram(bins = 30, color = &quot;white&quot;) Figure 8.3: Bootstrapped means histogram The shape of this resulting distribution may look familiar to you. 
It resembles the well-known normal (bell-shaped) curve. At this point, we can easily calculate a confidence interval. In fact, we have a couple different options. We will first use the percentiles of the distribution we just created to isolate the middle 95% of values. This will correspond to our 95% confidence interval for the population mean rating, denoted by \\(\\mu\\). (ciq_mean_rating &lt;- confint(trials, level = 0.95, method = &quot;quantile&quot;)) ## name lower upper level method estimate ## 1 mean 5.456 6.296 0.95 percentile 5.894 It’s always important at this point to interpret the results of this confidence interval calculation. In this context, we can say something like the following: Based on the sample data and bootstrapping techniques, we can be 95% confident that the true mean rating of ALL IMDB ratings is between 5.456 and 6.296. This statement may seem a little confusing to you. Another way to think about this is that this confidence interval was constructed using the sample data by a procedure that is 95% reliable. We will get invalid results 5% of the time. Just as we had a trade-off with \\(\\alpha\\) and \\(\\beta\\) with hypothesis tests, we have a similar trade-off here with setting the confidence level. To further reiterate this point, the graphic below from Diez, Barr, and Çetinkaya-Rundel (2014) shows us that if we repeated a confidence interval process 25 times with 25 different samples, we would expect about 95% of them to actually contain the population parameter of interest. This parameter is marked with a dotted vertical line. We can see that only one confidence interval does not overlap with this value. (The one marked in red.) Therefore 24 in 25 (96%), which is quite close to our 95% reliability, do include the population parameter. Figure 8.4: Confidence interval coverage plot from OpenIntro Remember that we are pretending like we don’t know what the mean IMDB rating for ALL movies is. 
Our population here is all of the movies listed in the movies data frame from ggplot2movies. So does our bootstrapped confidence interval here contain the actual mean value? movies %&gt;% summarize(mean_rating = mean(rating)) ## # A tibble: 1 × 1 ## mean_rating ## &lt;dbl&gt; ## 1 5.933 We see here that the population mean does fall in our range of plausible values generated from the bootstrapped samples. We can also get an idea of how the theory-based inference techniques would have approximated this confidence interval by using the formula \\[\\bar{x} \\pm (2 * SE),\\] where \\(\\bar{x}\\) is our original sample mean and \\(SE\\) stands for standard error and corresponds to the standard deviation of the bootstrap distribution. The value of 2 here corresponds to it being a 95% confidence interval. (95% of the values in a normal distribution fall within 2 standard deviations of the mean.) This formula assumes that the bootstrap distribution is symmetric and bell-shaped. This is often the case with bootstrap distributions, especially those in which the original distribution of the sample is not highly skewed. Definition: standard error The standard error is the standard deviation of the sampling distribution. The sampling distribution may be approximated by the bootstrap distribution or the null distribution depending on the context. Traditional theory-based methodologies for inference also have formulas for standard errors, assuming some conditions are met. To compute this type of confidence interval, we only need to make a slight modification to the confint function seen above. (The expression after the \\(\\pm\\) sign is known as the margin of error.) 
(cise_mean_rating &lt;- confint(trials, level = 0.95, method = &quot;stderr&quot;)) ## name lower upper level method estimate margin.of.error ## 1 mean 5.468 6.314 0.95 stderr 5.894 0.4229 Based on the sample data and bootstrapping techniques, we can be 95% confident that the true mean rating of ALL IMDB ratings is between 5.4685 and 6.3143. Learning check (LC8.6) Reproduce the bootstrapping above using a sample of size 25 instead of 50. What changes do you see? (LC8.7) Reproduce the bootstrapping above using a sample of size 5 instead of 50. What changes do you see? (LC8.8) How does the sample size affect the analysis above? (LC8.9) Why must bootstrap samples be the same size as the original sample? 8.1.1 Review of Bootstrapping We can summarize the process to generate a bootstrap distribution here in a series of steps that clearly identify the terminology we will use (R. Lock et al. 2012). Generate bootstrap samples by sampling with replacement from the original sample, using the same sample size. Compute the statistic of interest, called a bootstrap statistic, for each of the bootstrap samples. Collect the statistics for many bootstrap samples to create a bootstrap distribution. Visually, we can represent this process in the following diagram. Figure 8.5: Bootstrapping diagram from Lock5 textbook 8.2 Relation to hypothesis testing Recall that we found a statistically significant difference in the sample mean of romance movie ratings compared to the sample mean of action movie ratings. We concluded Chapter 7 by attempting to understand just how much greater we could expect the population mean romance movie rating to be compared to the population mean action movie rating. In order to do so, we will calculate a confidence interval for the difference \\(\\mu_r - \\mu_a\\). We’ll then go back to our population parameter values and see if our confidence interval contains our parameter value. 
We could use bootstrapping in a way similar to that done above, except now on a difference in sample means, to create a distribution and then use the confint function with the option of quantile to determine a confidence interval for the plausible values of the difference in population means. This is an excellent programming activity and the reader is urged to try to do so. Recall what the randomization/null distribution looked like for our simulated shuffled sample means: library(ggplot2) library(dplyr) ggplot(data = rand_distn, mapping = aes(x = diffmean)) + geom_histogram(color = &quot;white&quot;, bins = 20) Figure 8.6: Simulated shuffled sample means histogram With this null distribution being quite symmetric and bell-shaped, the standard error method introduced above likely provides a good estimate of a range of plausible values for \\(\\mu_r - \\mu_a\\). Another nice option here is that we can use the standard deviation of the null/randomization distribution we just found with our hypothesis test. (std_err &lt;- rand_distn %&gt;% summarize(se = sd(diffmean))) ## # A tibble: 1 × 1 ## se ## &lt;dbl&gt; ## 1 0.03182 We can use the general formula of \\(statistic \\pm (2 * SE)\\) for a confidence interval to obtain the following result for plausible values of the difference in population means at the 95% level. (lower &lt;- obs_diff - (2 * std_err)) ## se ## 1 0.7658 (upper &lt;- obs_diff + (2 * std_err)) ## se ## 1 0.8931 We can, therefore, say that we are 95% confident that the population mean rating for romance movies is between 0.766 and 0.893 points higher than for that of action movies. The important thing to check here is whether 0 is contained in the confidence interval. If it is, it is plausible that the difference in the two population means between the two groups is 0. This means that the null hypothesis is plausible. The results of the hypothesis test and the confidence interval should match as they do here. 
We rejected the null hypothesis with hypothesis testing and we have evidence here that the mean rating for romance movies is higher than for action movies. 8.3 Effect size The phrase effect size has been thrown around recently as an alternative to \\(p\\)-values. In combination with the confidence interval, it can be often more valuable than just looking at the results of a hypothesis test. It depends on the scientific discipline exactly what is meant by “effect size” but, in general, it refers to the magnitude of the difference between group measurements. For our two sample problem involving movies, it is the observed difference in sample means obs_diff. It’s worthy of mention here that confidence intervals are always centered at the observed statistic. In other words, if you are looking at a confidence interval and someone asks you what the “effect size” is you can simply find the midpoint of the stated confidence interval. Learning check (LC8.10) Check to see whether the difference in population mean ratings for the two genres falls in the confidence interval we found here. Are we guaranteed that it will fall in the range of plausible values? (LC8.11) Why do you think many scientific fields are shifting to preferring inclusion of confidence intervals in articles over just \\(p\\)-values and hypothesis tests? (LC8.12) Why is 95% related to a value of 2 in the margin of error? What would approximate values be for 90% and for 99%? (LC8.13) Why is a 95% confidence interval wider than a 90% confidence interval? Explain by using a concrete example from everyday life about what is meant by “confidence.” (LC8.14) How would confidence intervals correspond to one-sided hypothesis tests? (LC8.15) There is a relationship between the significance level and the confidence level. What do you think it is? 
(LC8.16) The moment the phrase “standard error” is mentioned, there seems to be someone that says “The standard error is \\(s\\) divided by the square root of \\(n\\).” This standard error formula is used in the theory-based procedure for an inference on one mean. But… does it always work? For samp1, samp2, and samp3 below, do the following: produce a bootstrap distribution based on the sample calculate the standard deviation of the bootstrap distribution compare this value of the standard error to what you obtain when you calculate the standard deviation of the sample \\(s\\) divided by \\(\\sqrt{n}\\). df1 &lt;- data_frame(samp1 = rexp(50)) df2 &lt;- data_frame(samp2 = rnorm(100)) df3 &lt;- data_frame(samp3 = rbeta(20,5,5)) Describe how \\(s / \\sqrt{n}\\) does in approximating the standard error for these three samples and their corresponding bootstrap distributions. 8.4 Conclusion 8.4.1 Script of R code An R script file of all R code used in this chapter is available here. 8.4.2 What’s to come? We will see in Chapter 9 many of the same ideas we have seen with hypothesis testing and confidence intervals in the last two chapters. Regression is frequently associated both correctly and incorrectly with statistics and data analysis, so you’ll need to make sure you understand when it is appropriate and when it is not. References "],
+["9-regress.html", "9 Regression via broom 9.1 EXAMPLE: Alaskan Airlines delays 9.2 Correlation 9.3 Linear regression 9.4 Inference for regression 9.5 Residual analysis 9.6 Conditions for regression 9.7 Conclusion", " 9 Regression via broom One of the most commonly used statistical procedures is regression. Regression, in its simplest form, focuses on trying to predict values of one numerical variable based on the values of another numerical variable using a straight line fit to data. We saw in Chapters 7 and 8 an example of analyses using a categorical predictor (movie genre–action or romance) and a numerical response (movie rating). In this chapter, we will focus on going back to the flights data frame in the nycflights13 package to look at the relationship between departure delay and arrival delay. We will also discuss the concept of correlation and how it is frequently incorrectly implied to also lead to causation. This chapter also introduces the broom package, which is a useful tool in summarizing the results of model fits in tidy format. You will see examples of the tidy, glance, and augment functions with linear regression. Needed packages library(mosaic) library(dplyr) library(ggplot2) library(knitr) library(broom) library(nycflights13) 9.1 EXAMPLE: Alaskan Airlines delays We’ll next explore the relationship/association of departure delays and arrival delays for a sample of 50 flights departing from New York City in 2013 with Alaskan Airlines. 
library(nycflights13) data(flights) set.seed(2017) # Load Alaska data, deleting rows that have missing departure delay # or arrival delay data alaska_flights &lt;- flights %&gt;% filter(carrier == &quot;AS&quot;) %&gt;% filter(!is.na(dep_delay) &amp; !is.na(arr_delay)) %&gt;% resample(size = 50, replace = FALSE) ggplot(data = alaska_flights, mapping = aes(x = dep_delay, y = arr_delay)) + geom_point() Figure 9.1: Departure and Arrival Flight Delays for a sample of 50 Alaskan flights from NYC Learning check (LC9.1) Does there appear to be a linear relationship with arrival delay and departure delay? In other words, could you fit a line to the data and have it explain well how arr_delay increases as dep_delay increases? (LC9.2) Is there only one possible line that fits the data “well”? How could you decide on which one is best if there are multiple options? 9.2 Correlation One way to measure the linearity between two numerical variables is by using correlation. In fact, the correlation coefficient is defined as just that. Definition: Correlation Coefficient The correlation coefficient measures the strength of linear association between two variables. Properties of the correlation coefficient: It is always between -1 and 1, inclusive, where -1 indicates perfect negative relationship 0 indicates no relationship +1 indicates perfect positive relationship Learning check (LC9.3) Make a guess as to the value of the correlation coefficient between arr_delay and dep_delay in the alaska_flights data frame. (LC9.4) Do you think that the correlation coefficient between arr_delay and dep_delay is the same as the correlation coefficient between dep_delay and arr_delay? Explain. We can look at a variety of different data sets and their corresponding correlation coefficients in the following plot. 
Figure 9.2: Different Correlation Coefficients We can calculate the correlation coefficient for our example of flight delays via alaska_flights %&gt;% summarize(correl = cor(dep_delay, arr_delay)) ## # A tibble: 1 × 1 ## correl ## &lt;dbl&gt; ## 1 0.7908 The sample correlation coefficient is denoted by \\(r\\). In this case, \\(r = 0.7908\\). Learning check (LC9.5) Would you quantify the value of correl calculated above as being strongly positively linear, weakly positively linear, not linear, weakly negatively linear, or strongly negatively linear? Discuss your choice. If you’d like a little more practice in determining the linear relationship between two variables by quantifying a correlation coefficient, you should check out the Guess the Correlation game online. 9.2.1 Correlation does not imply causation Just because arrival delays are related to departure delays in a somewhat linear fashion, we can’t say with certainty that arrival delays are caused entirely by departure delays. Certainly it appears that as one increases, the other tends to increase, but that might not always be the case. Causation is a tricky problem and frequently takes carefully designed experiments. These experiments remove confounding variables and only focus on the behavior of one variable in the presence of the levels of the other variable(s). Be careful as you read studies to make sure that the writers aren’t falling into this fallacy of correlation implying causation. If you spot one, you may want to send them a link to Spurious Correlations. Learning check (LC9.6) What are some other confounding variables besides departure delay that could contribute to an increase in arrival delays? Remember that a variable is something that has to vary! 9.3 Linear regression So we see above that there is a strong positive association between these delay variables. 
Let’s say that we are waiting for our flight to leave New York City on Alaskan and we are told that our flight is going to be delayed 25 minutes. What could we predict for our arrival delay based on the plot in Figure 9.1? It may be hard to pick a particular value here, especially after just going over confidence intervals in Chapter 8. One way to do this would be to fit a line that fits the data best and then use the predicted arr_delay value from that line for dep_delay = 25 as our prediction. But what is meant by “fits the data best”? The least squares/best fitting/linear regression line has been fit to the data below. Figure 9.3: Regression line fit on delays Here lm corresponds to “linear model” and we’ll see its use again in a bit when we find the values that define this line. 9.3.1 Understanding linear regression basics Let’s choose an arbitrary point on the graph and label it the color blue. Now consider this point’s deviation from the regression line. Do this for another point. And for another point. We could repeat this process for each of the points in our sample. The pattern that emerges here is that the regression line minimizes the sum of the squared arrow lengths (i.e., the least squares) for all of the points. As you look at these points you might think that a different line could fit the data better based on this criterion. That isn’t the case though and it can be shown via calculus (omitted here) that this line minimizes the sum of the squared residuals for these 50 points. 9.3.2 The equation of the line We can use R and the lm function to retrieve the equation of the line of best fit here in red. A simple linear regression such as this will produce two coefficients: one for the \\(y\\)-intercept and one for the slope. We can use the tidy function in the broom package to extract these coefficients from the model fit. 
delay_fit &lt;- lm(formula = arr_delay ~ dep_delay, data = alaska_flights) tidy(delay_fit) %&gt;% kable() term estimate std.error statistic p.value (Intercept) -14.155 2.809 -5.038 0 dep_delay 1.218 0.136 8.951 0 In general, the equation of the line of best fit for a sample is \\[\\hat{y} = b_0 + b_1 x.\\] Thus, our equation is \\(\\hat{y} = -14.155 + 1.2177 \\, x.\\) It is usually preferred to actually write the names of the variables instead of \\(x\\) and \\(y\\): \\[\\widehat{arr\\_delay} = -14.155 + 1.2177 \\, dep\\_delay.\\] We can also extract the coefficients by using the coef function: coef(delay_fit) ## (Intercept) dep_delay ## -14.155 1.218 9.3.3 Interpreting the slope After you have determined your line of best fit, it is good practice to interpret the results to see if they make sense. Slope is defined as rise over run or the change in \\(y\\) for every one unit increase in \\(x\\). For our specific example, we can say that for every one minute increase in the departure delay of Alaskan Airlines flights from NYC, we can expect the corresponding arrival delay to be 1.22 minutes more. This estimate does make some practical sense. It would be strange if arrival delays went down as departure delays increased. We also expect that the longer a flight is delayed on departure, the more likely the longer a flight is delayed on arrival. Remember that we are also using data here to make a guess as to how the population of all Alaskan flights might behave with regards to departure delays and arrival delays, so just as with other sampling procedures there is also variability in the sample estimates for the regression line. 
9.3.4 Predicting values Getting back to our hypothetical flight that has been delayed 25 minutes, we can use the augment function in the broom package to get the fitted arrival delay value: delay_fit %&gt;% augment(newdata = data_frame(dep_delay = 25)) ## dep_delay .fitted .se.fit ## 1 25 16.29 3.967 Note the use of the data_frame function here, which must be used since newdata is expecting a data frame as its argument. We must also specify that we are plugging in 25 for the value of dep_delay here. We can see that the line predicted an arrival delay of 16.29 minutes based on our 25 minute departure delay. This also does make some sense since flights that aren’t delayed greatly from the beginning tend to make up time in the air to compensate. Important note: The correlation coefficient and the slope of the regression line are not the same thing. They will always share the same sign (positive correlation coefficients correspond to positive slope coefficients and the same holds true for negative values), but you can’t make any more conclusions about them than that. For example, say we have 3 groups of points: Their regression lines have different slopes, but \\(r = 1\\) for all 3. In other words, all three groups of points have a perfect (positive) linear relationship. 9.4 Inference for regression The population least squares line is defined by the formula \\(y = \\beta_0 + \\beta_1 x + \\epsilon\\). Here \\(\\epsilon\\) represents the error term. It corresponds to the part of the response variable \\(y\\) that remains unexplained after considering the predictor variable \\(x\\). Often it is standard practice to assume that this error term follows a normal distribution. We will focus on checking whether that assumption is valid in Section 9.5. In the population least squares line \\(y = \\beta_0 + \\beta_1 x + \\epsilon\\), we can see that if \\(\\beta_1 = 0\\) there is no relationship between \\(x\\) and \\(y\\). 
If \\(\\beta_1 = 0\\), \\(y = \\beta_0 + \\epsilon\\). Therefore, \\(y\\) does not depend on \\(x\\) at all in the equation. A hypothesis test is frequently conducted to check whether a relationship exists between two numerical variables \\(x\\) and \\(y\\). We can also use the concept of shuffling to determine the standard error of our null distribution and conduct a hypothesis test for a population slope. Let’s go back to our example on Alaskan flights that represent a sample of all Alaskan flights departing NYC in 2013. Let’s test to see if we have evidence that a positive relationship exists between the departure delay and arrival delay for Alaskan flights. We will set up this hypothesis testing process as we have each before via the “There is Only One Test” diagram in Figure 7.1. 9.4.1 Data Our data is stored in alaska_flights and we are focused on the 50 measurements of dep_delay and arr_delay there. 9.4.2 Test Statistic \\(\\delta\\) Our test statistic here is the sample slope coefficient that we denote with \\(b_1\\). 9.4.3 Observed effect \\(\\delta^*\\) (b1_obs &lt;- tidy(delay_fit)$estimate[2]) ## [1] 1.218 The calculated slope value from our observed sample is \\(b_1 = 1.2177\\). 9.4.4 Model of \\(H_0\\) We are looking to see if a positive relationship exists so \\(H_a: \\beta_1 &gt; 0\\). Our null hypothesis is always in terms of equality so we have \\(H_0: \\beta_1 = 0\\). 9.4.5 Simulated Data Now to simulate the null hypothesis being true and recreating how our sample was created, we need to think about what it means for \\(\\beta_1\\) to be zero. If \\(\\beta_1 = 0\\), we said above that there is no relationship between the departure delay and arrival delay. If there is no relationship, then any one of the arrival delay values could have just as likely occurred with any of the other departure delay values instead of the one that it actually did fall with. We, therefore, have another example of shuffling in our simulating of data. 
Tactile simulation We could use a deck of 100 note cards to create a tactile simulation of this shuffling process. We would write the 50 different values of departure delays on each of the 50 cards, one per card. We would then do the same thing for the 50 arrival delays putting them on one per card. Next, we would lay out each of the 50 departure delay cards and we would shuffle the arrival delay deck. Then, after shuffling the deck well, we would disperse the cards one per each one of the departure delay cards. We would then enter these new values in for arrival delay and compute a sample slope based on this shuffling. We could repeat this process many times, keeping track of our sample slope after each shuffle. 9.4.6 Distribution of \\(\\delta\\) under \\(H_0\\) We can build our randomization distribution in much the same way we did before using the do and shuffle functions. Here we will take advantage of the coef function we saw earlier to extract the slope and intercept coefficients. (Our focus will be on the slope here though.) rand_distn &lt;- mosaic::do(10000) * (lm(formula = shuffle(arr_delay) ~ dep_delay, data = alaska_flights) %&gt;% coef()) names(rand_distn) ## [1] &quot;Intercept&quot; &quot;dep_delay&quot; We see that the names of our columns are Intercept and dep_delay. We want to look at dep_delay since that corresponds to the slope coefficients. ggplot(data = rand_distn, mapping = aes(x = dep_delay)) + geom_histogram(color = &quot;white&quot;, bins = 20) 9.4.7 The p-value Recall that we want to see where our observed sample slope \\(\\delta^* = 1.2177\\) falls on this distribution and then count all of the values to the right of it corresponding to \\(H_a: \\beta_1 &gt; 0\\). To get a sense for where our value falls, we can shade all values at least as big as \\(\\delta^*\\). 
ggplot(data = rand_distn, aes(x = dep_delay, fill = (dep_delay &gt;= b1_obs))) + geom_histogram(color = &quot;white&quot;, bins = 20) Figure 9.4: Shaded histogram to show p-value Since 1.2177 falls far to the right of this plot, we can say that we have a \\(p\\)-value of 0. We, thus, have evidence to reject the null hypothesis in support of there being a positive association between the departure delay and arrival delay of all Alaskan flights from NYC in 2013. Learning check (LC9.7) Repeat the inference above but this time for the correlation coefficient instead of the slope. (LC9.8) Use bootstrapping (of points) to determine a range of possible values for the population slope comparing departure delays to arrival delays for Alaskan flights in 2013 from NYC. 9.5 Residual analysis The following diagram will help you to keep track of what is meant by a residual. Here, \\(y_i\\) is an observed value of the arr_delay variable. \\(i\\) ranges from 1 to 50. For this example, it is the vertical component of the blue dot. \\(\\hat{y}_i\\) is the fitted value–the arr_delay value that is being pointed to on the red line. The residual is \\[\\hat{\\epsilon}_i = y_i - \\hat{y}_i.\\] Note the order here! You start at the non-pointy end of the arrow (\\(y_i\\)) and then subtract away what comes at the point (\\(\\hat{y_i}\\)). 9.6 Conditions for regression In order for regression to be valid, we have three conditions to check: Equal variances across explanatory variable (Check residual plot for fan-shaped patterns.) Independent observations, errors, and predictor variables (Check residual plot for no time series-like patterns.) Nearly normal residuals (Check quantile-quantile plot of standardized residuals.) As you can see from the things to check after the conditions residuals will play a large role in determining whether the conditions are met. 
Residuals are estimates for the error term \\(\\epsilon\\) we discussed earlier, and this is a big reason why they play an important role in validating regression assumptions. Residual plot To construct a residual plot we will analyze data from the augment function in broom. Specifically, we are interested in the .fitted and .resid variables there: fits &lt;- augment(delay_fit) ggplot(data = fits, mapping = aes(x = .fitted, y = .resid)) + geom_point() + geom_abline(intercept = 0, slope = 0, color = &quot;blue&quot;) Quantile-quantile plot ggplot(data = fits, mapping = aes(sample = .resid)) + stat_qq() Checking conditions: We are looking to see if the points are scattered about the blue line at 0 relatively evenly as we look from left to right. We have some reason for concern here as the large lump of values on the left are much more dispersed than those on the right. The second condition is invalidated if there is a trigonometric pattern of up and down throughout the residual plot. That is not the case here. We look at the quantile-quantile plot (Q-Q plot for short) for the third condition. We are looking to see if the residuals fall on a straight line with what we would expect if they were normally distributed. We see some curvature here as well. We should begin to wonder if regression was valid here with both condition 1 and condition 3 in question. We have reason to doubt whether a linear regression is valid here. Unfortunately, all too frequently regressions are run without checking these assumptions carefully. While small deviations from the assumptions can be OK, larger violations can completely invalidate the results and make any inferences improbable and questionable. 9.7 Conclusion 9.7.1 Script of R code An R script file of all R code used in this chapter is available here. 9.7.2 What’s to come? 
In the last chapter of the textbook, we’ll summarize the purpose of this book as well as present an excellent example of what goes into making an effective story via data. "],
+["10-effective-data-storytelling.html", "10 Effective Data Storytelling Concluding Remarks", " 10 Effective Data Storytelling As we’ve progressed throughout this book, you’ve seen how to work with data in a variety of ways. You’ve learned effective strategies for plotting data by understanding which types of plots work best for which combinations of variable types. You’ve summarized data in table form and calculated summary statistics for a variety of different variables. Further, you’ve seen the value of inference as a process to come to conclusions about a population by using a random sample. Lastly, you’ve explored how to use linear regression and the importance of checking the conditions required to make it a valid procedure. All throughout, you’ve learned many computational techniques and focused on reproducible research in writing R code and keeping track of your work in R Markdown. All of these steps go into making a great story using data. As the textbook comes to a close, we thought it best that you explore what stellar work is being produced by data journalists throughout the world that specialize in effective data storytelling. We recommend you read and analyze this article by Walt Hickey entitled The Dollar-And-Cents Case Against Hollywood’s Exclusion of Women. As you read over it, think carefully about how Walt is using his data, his graphics, and his analyses to paint the picture for the reader of what the story is he wants to tell. In the spirit of reproducibility, the members of FiveThirtyEight have also shared the data that they used to create this story and some R code here. A vignette showing how to reproduce one of the plots at the end of the article using dplyr, ggplot2, and other packages in Hadley’s tidyverse is available here. Great data stories don’t mislead the reader, but rather engulf them in understanding the importance that data plays in our lives through the captivation of storytelling. 
Concluding Remarks If you’ve come to this point in the book, I’d suspect that you know a thing or two about how to work with data in R. You’ve also gained a lot of knowledge about how to use simulation techniques to determine statistical significance and how these techniques build an intuition about traditional inferential methods like the \\(t\\)-test. The hope is that you’ve come to appreciate data manipulation, tidy data sets, and the power of data visualization. Actually, the data visualization part may be the most important thing here. If you can create truly beautiful graphics that display information in ways that the reader can clearly decipher, you’ve picked up a great skill. Let’s hope that that skill keeps you creating great stories with data into the near and far distant future. Thanks for coming along for the ride as we dove into modern data analysis using R! "],
 ["A-appendixA.html", "A Statistical Background A.1 Basic statistical terms", " A Statistical Background A.1 Basic statistical terms A.1.1 Mean The mean is the most commonly reported measure of center. It is commonly called the “average” though this term can be a little ambiguous. The mean is the sum of all of the data elements divided by how many elements there are. If we have \\(n\\) data points, the mean is given by: \\[Mean = \\frac{x_1 + x_2 + \\cdots + x_n}{n}\\] A.1.2 Median The median is calculated by first sorting a variable’s data from smallest to largest. After sorting the data, the middle element in the list is the median. If the middle falls between two values, then the median is the mean of those two values. A.1.3 Standard deviation We will next discuss the standard deviation of a sample data set pertaining to one variable. The formula can be a little intimidating at first but it is important to remember that it is essentially a measure of how far to expect a given data value is from its mean: \\[Standard \\, deviation = \\sqrt{\\frac{(x_1 - Mean)^2 + (x_2 - Mean)^2 + \\cdots + (x_n - Mean)^2}{n - 1}}\\] A.1.4 Five-number summary The five-number summary consists of five values: minimum, first quantile (25th percentile), median (50th percentile), third quantile (75th percentile), and maximum. The quantiles are calculated as first quantile (\\(Q_1\\)): the median of the first half of the sorted data third quantile (\\(Q_3\\)): the median of the second half of the sorted data The interquartile range is defined as \\(Q_3 - Q_1\\) and is a measure of how spread out the middle 50% of values is. The five-number summary is not influenced by the presence of outliers in the ways that the mean and standard deviation are. It is, thus, recommended for skewed data sets. A.1.5 Distribution The distribution of a variable/data set corresponds to generalizing patterns in the data set. It often shows how frequently elements in the data set appear. 
It shows how the data varies and gives some information about where a typical element in the data might fall. Distributions are most easily seen through data visualization. A.1.6 Outliers Outliers correspond to values in the data set that fall far outside the range of “ordinary” values. In regards to a boxplot (by default), they correspond to values below \\(Q_1 - (1.5 * IQR)\\) or above \\(Q_3 + (1.5 * IQR)\\). Note that these terms (aside from Distribution) only apply to quantitative variables. "],
-["B-appendixB.html", "B Inference Examples B.1 Needed packages B.2 One Mean B.3 One Proportion B.4 Two Proportions B.5 Two Means (Independent Samples) B.6 Two Means (Paired Samples)", " B Inference Examples This appendix is designed to provide you with example of the five basic hypothesis tests and their corresponding confidence intervals. Traditional theory-based methods as well as computational-based methods are presented. B.1 Needed packages library(dplyr) library(ggplot2) library(mosaic) library(knitr) library(readr) B.2 One Mean B.2.1 Problem Statement The National Survey of Family Growth conducted by the Centers for Disease Control gathers information on family life, marriage and divorce, pregnancy, infertility, use of contraception, and men’s and women’s health. One of the variables collected on this survey is the age at first marriage. 5,534 randomly sampled US women between 2006 and 2010 completed the survey. The women sampled here had been married at least once. Do we have evidence that the mean age of first marriage for all US women from 2006 to 2010 is greater than 23 years? (Tweaked a bit from Diez, Barr, and Çetinkaya-Rundel 2014 [Chapter 4]) B.2.2 Competing Hypotheses B.2.2.1 In words Null hypothesis: The mean age of first marriage for all US women from 2006 to 2010 is equal to 23 years. Alternative hypothesis: The mean age of first marriage for all US women from 2006 to 2010 is greater than 23 years. B.2.2.2 In symbols (with annotations) \\(H_0: \\mu = \\mu_{0}\\), where \\(\\mu\\) represents the mean age of first marriage for all US women from 2006 to 2010 and \\(\\mu_0\\) is 23. \\(H_A: \\mu &gt; 23\\) B.2.2.3 Set \\(\\alpha\\) It’s important to set the significance level before starting the testing using the data. Let’s set the significance level at 5% here. 
B.2.3 Exploring the sample data #download.file(&quot;http://ismayc.github.io/teaching/sample_problems/ageAtMar.csv&quot;, # destfile = &quot;data/ageAtMar.csv&quot;, # method = &quot;curl&quot;) ageAtMar &lt;- read_csv(&quot;data/ageAtMar.csv&quot;) ## Parsed with column specification: ## cols( ## age = col_integer() ## ) age_summ &lt;- ageAtMar %&gt;% summarize(sample_size = n(), mean = mean(age), sd = sd(age), minimum = min(age), lower_quartile = quantile(age, 0.25), median = median(age), upper_quartile = quantile(age, 0.75), max = max(age)) kable(age_summ) sample_size mean sd minimum lower_quartile median upper_quartile max 5534 23.44019 4.721365 10 20 23 26 43 The histogram below also shows the distribution of age. ageAtMar %&gt;% ggplot(aes(x = age)) + geom_histogram(binwidth = 3, color = &quot;white&quot;) B.2.3.1 Guess about statistical significance We are looking to see if the observed sample mean of 23.4401879 is statistically greater than \\(\\mu_0 = 23\\). They seem to be quite close, but we have a large sample size here. Let’s guess that the large sample size will lead us to reject this practically small difference. B.2.4 Non-traditional methods B.2.4.1 Bootstrapping for Hypothesis Test In order to look to see if the observed sample mean of 23.4401879 is statistically greater than \\(\\mu_0 = 23\\), we need to account for the sample size. We also need to determine a process that replicates how the original sample of size 5534 was selected. We can use the idea of bootstrapping to simulate the population from which the sample came and then generate samples from that simulated population to account for sampling variability. 
Recall how bootstrapping would apply in this context: Sample with replacement from our original sample of 5534 women and repeat this process 10,000 times, calculate the mean for each of the 10,000 bootstrap samples created in Step 1., combine all of these bootstrap statistics calculated in Step 2 into a boot_distn object, and shift the center of this distribution over to the null value of 23. (This is needed since it will be centered at 23.4401879 via the process of bootstrapping.) set.seed(2016) mu0 &lt;- 23 shift &lt;- mu0 - age_summ$mean null_distn &lt;- do(10000) * resample(ageAtMar, replace = TRUE) %&gt;% mutate(age = age + shift) %&gt;% summarize(mean_age = mean(age)) null_distn %&gt;% ggplot(aes(x = mean_age)) + geom_histogram(bins = 30, color = &quot;white&quot;) We can next use this distribution to observe our \\(p\\)-value. Recall this is a right-tailed test so we will be looking for values that are greater than or equal to 23.4401879 for our \\(p\\)-value. obs_mean &lt;- age_summ$mean null_distn %&gt;% ggplot(aes(x = mean_age)) + geom_histogram(bins = 30, color = &quot;white&quot;) + geom_vline(color = &quot;red&quot;, xintercept = obs_mean) B.2.4.1.1 Calculate \\(p\\)-value pvalue &lt;- null_distn %&gt;% filter( mean_age &gt;= obs_mean ) %&gt;% nrow() / nrow(null_distn) pvalue ## [1] 0 So our \\(p\\)-value is 0 and we reject the null hypothesis at the 5% level. You can also see this from the histogram above that we are far into the tail of the null distribution. B.2.4.2 Bootstrapping for Confidence Interval We can also create a confidence interval for the unknown population parameter \\(\\mu\\) using our sample data using bootstrapping. Note that we don’t need to shift this distribution since we want the center of our confidence interval to be our point estimate \\(\\bar{x}_{obs} = 23.4401879\\). 
boot_distn &lt;- do(10000) * resample(ageAtMar, replace = TRUE) %&gt;% summarize(mean_age = mean(age)) boot_distn %&gt;% ggplot(aes(x = mean_age)) + geom_histogram(bins = 30, color = &quot;white&quot;) boot_distn %&gt;% summarize(lower = quantile(mean_age, probs = 0.025), upper = quantile(mean_age, probs = 0.975)) ## lower upper ## 1 23.31821 23.56361 We see that 23 is not contained in this confidence interval as a plausible value of \\(\\mu\\) (the unknown population mean) and the entire interval is larger than 23. This matches with our hypothesis test results of rejecting the null hypothesis in favor of the alternative (\\(\\mu &gt; 23\\)). Interpretation: We are 95% confident the true mean age of first marriage for all US women from 2006 to 2010 is between and . B.2.5 Traditional methods B.2.5.1 Check conditions Remember that in order to use the shortcut (formula-based, theoretical) approach, we need to check that some conditions are met. Independent observations: The observations are collected independently. The cases are selected independently through random sampling so this condition is met. Approximately normal: The distribution of the response variable should be normal or the sample size should be at least 30. The histogram for the sample above does show some skew. The Q-Q plot below also shows some skew. ggplot(data = ageAtMar, mapping = aes(sample = age)) + stat_qq() The sample size here is quite large though (\\(n = 5534\\)) so both conditions are met. B.2.5.2 Test statistic The test statistic is a random variable based on the sample data. Here, we want to look at a way to estimate the population mean \\(\\mu\\). A good guess is the sample mean \\(\\bar{X}\\). Recall that this sample mean is actually a random variable that will vary as different samples are (theoretically, would be) collected. 
We are looking to see how likely is it for us to have observed a sample mean of \\(\\bar{x}_{obs} = 23.4401879\\) or larger assuming that the population mean is 23 (assuming the null hypothesis is true). If the conditions are met and assuming \\(H_0\\) is true, we can “standardize” this original test statistic of \\(\\bar{X}\\) into a \\(T\\) statistic that follows a \\(t\\) distribution with degrees of freedom equal to \\(df = n - 1\\): \\[ T =\\dfrac{ \\bar{X} - \\mu_0}{ S / \\sqrt{n} } \\sim t (df = n - 1) \\] where \\(S\\) represents the standard deviation of the sample and \\(n\\) is the sample size. B.2.5.2.1 Observed test statistic While one could compute this observed test statistic by “hand”, the focus here is on the set-up of the problem and in understanding which formula for the test statistic applies. We can use the t.test function to perform this analysis for us. t.test(x = ageAtMar$age, alternative = &quot;greater&quot;, mu = 23) ## ## One Sample t-test ## ## data: ageAtMar$age ## t = 6.9357, df = 5533, p-value = 0.000000000002252 ## alternative hypothesis: true mean is greater than 23 ## 95 percent confidence interval: ## 23.33578 Inf ## sample estimates: ## mean of x ## 23.44019 We see here that the \\(t_{obs}\\) value is around 6.94. Recall that for large sample sizes the \\(t\\) distribution is essentially the standard normal distribution and this is why the statistic is reported as Z. B.2.5.3 Compute \\(p\\)-value The \\(p\\)-value—the probability of observing a \\(t_{obs}\\) value of 6.94 or more in our null distribution of a \\(t\\) with 5533 degrees of freedom—is essentially 0. This can also be calculated in R directly: pt(6.936, df = nrow(ageAtMar) - 1, lower.tail = FALSE) ## [1] 0.000000000002247382 We can also use the \\(N(0, 1)\\) distribution here: pnorm(6.936, lower.tail = FALSE) ## [1] 0.000000000002016788 B.2.5.4 State conclusion We, therefore, have sufficient evidence to reject the null hypothesis. 
Our initial guess that our observed sample mean was statistically greater than the hypothesized mean has supporting evidence here. Based on this sample, we have evidence that the mean age of first marriage for all US women from 2006 to 2010 is greater than 23 years. B.2.5.5 Confidence interval The confidence interval reported above with t.test is known as a one-sided confidence interval and gives the lowest value one could expect \\(\\mu\\) to be with 95% confidence. We usually want a range of values so we can use alternative = &quot;two.sided&quot; to get the similar values compared to the bootstrapping process: t.test(x = ageAtMar$age, alternative = &quot;two.sided&quot;, mu = 23)$conf ## [1] 23.31577 23.56461 ## attr(,&quot;conf.level&quot;) ## [1] 0.95 B.2.6 Comparing results Observing the bootstrap distribution that were created, it makes quite a bit of sense that the results are so similar for traditional and non-traditional methods in terms of the \\(p\\)-value and the confidence interval since these distributions look very similar to normal distributions. The conditions also being met (the large sample size was the driver here) leads us to better guess that using any of the methods whether they are traditional (formula-based) or non-traditional (computational-based) will lead to similar results. B.3 One Proportion B.3.1 Problem Statement The CEO of a large electric utility claims that 80 percent of his 1,000,000 customers are satisfied with the service they receive. To test this claim, the local newspaper surveyed 100 customers, using simple random sampling. 73 were satisfied and the remaining were unsatisfied. Based on these findings from the sample, can we reject the CEO’s hypothesis that 80% of the customers are satisfied? 
[Tweaked a bit from http://stattrek.com/hypothesis-test/proportion.aspx?Tutorial=AP] B.3.2 Competing Hypotheses B.3.2.1 In words Null hypothesis: The proportion of all customers of the large electric utility satisfied with service they receive is equal 0.80. Alternative hypothesis: The proportion of all customers of the large electric utility satisfied with service they receive is different from 0.80. B.3.2.2 In symbols (with annotations) \\(H_0: \\pi = p_{0}\\), where \\(\\pi\\) represents the proportion of all customers of the large electric utility satisfied with service they receive and \\(p_0\\) is 0.8. \\(H_A: \\pi \\ne 0.8\\) B.3.2.3 Set \\(\\alpha\\) It’s important to set the significance level before starting the testing using the data. Let’s set the significance level at 5% here. B.3.3 Exploring the sample data elec &lt;- c(rep(&quot;satisfied&quot;, 73), rep(&quot;unsatisfied&quot;, 27)) %&gt;% as_data_frame() %&gt;% rename(&quot;satisfy&quot; = value) The bar graph below also shows the distribution of satisfy. ggplot(data = elec, aes(x = satisfy)) + geom_bar() B.3.3.1 Guess about statistical significance We are looking to see if the sample proportion of 0.73 is statistically different from \\(p_0 = 0.8\\) based on this sample. They seem to be quite close, and our sample size is not huge here (\\(n = 100\\)). Let’s guess that we do not have evidence to reject the null hypothesis. B.3.4 Non-traditional methods B.3.4.1 Simulation for Hypothesis Test In order to look to see if 0.73 is statistically different from 0.8, we need to account for the sample size. We also need to determine a process that replicates how the original sample of size 100 was selected. We can use the idea of an unfair coin to simulate this process. We will simulate flipping an unfair coin (with probability of success 0.8 matching the null hypothesis) 100 times. Then we will keep track of how many heads come up in those 100 flips. 
Our simulated statistic matches with how we calculated the original statistic \\(\\hat{p}\\): the number of heads (satisfied) out of our total sample of 100. We then repeat this process many times (say 10,000) to create the null distribution looking at the simulated proportions of successes: set.seed(2016) null_distn &lt;- do(10000) * rflip(100, prob = 0.8) null_distn %&gt;% ggplot(aes(x = prop)) + geom_histogram(bins = 30, color = &quot;white&quot;) We can next use this distribution to observe our \\(p\\)-value. Recall this is a two-tailed test so we will be looking for values that are 0.8 - 0.73 = 0.07 away from 0.8 in BOTH directions for our \\(p\\)-value: p_hat &lt;- 73/100 dist &lt;- 0.8 - p_hat null_distn %&gt;% ggplot(aes(x = prop)) + geom_histogram(bins = 30, color = &quot;white&quot;) + geom_vline(color = &quot;red&quot;, xintercept = 0.8 + dist) + geom_vline(color = &quot;red&quot;, xintercept = p_hat) B.3.4.1.1 Calculate \\(p\\)-value pvalue &lt;- null_distn %&gt;% filter( (prop &gt;= 0.8 + dist) | (prop &lt;= p_hat) ) %&gt;% nrow() / nrow(null_distn) pvalue ## [1] 0.081 So our \\(p\\)-value is 0.081 and we fail to reject the null hypothesis at the 5% level. B.3.4.2 Bootstrapping for Confidence Interval We can also create a confidence interval for the unknown population parameter \\(\\pi\\) using our sample data. To do so, we use bootstrapping, which involves sampling with replacement from our original sample of 100 survey respondents and repeating this process 10,000 times, calculating the proportion of successes for each of the 10,000 bootstrap samples created in Step 1., combining all of these bootstrap statistics calculated in Step 2 into a boot_distn object, identifying the 2.5th and 97.5th percentiles of this distribution (corresponding to the 5% significance level chosen) to find a 95% confidence interval for \\(\\pi\\), and interpret this confidence interval in the context of the problem. 
boot_distn &lt;- do(10000) * elec %&gt;% resample(size = 100, replace = TRUE) %&gt;% summarize(success_rate = mean(satisfy == &quot;satisfied&quot;)) Just as we use the mean function for calculating the mean over a numerical variable, we can also use it to compute the proportion of successes for a categorical variable where we specify what we are calling a “success” after the ==. (Think about the formula for calculating a mean and how R handles logical statements such as satisfy == &quot;satisfied&quot; for why this must be true.) boot_distn %&gt;% ggplot(aes(x = success_rate)) + geom_histogram(bins = 30, color = &quot;white&quot;) boot_distn %&gt;% summarize(lower = quantile(success_rate, probs = 0.025), upper = quantile(success_rate, probs = 0.975)) ## lower upper ## 1 0.64 0.81 We see that 0.80 is contained in this confidence interval as a plausible value of \\(\\pi\\) (the unknown population proportion). This matches with our hypothesis test results of failing to reject the null hypothesis. Interpretation: We are 95% confident the true proportion of customers who are satisfied with the service they receive is between and . Note: You could also use the null distribution with a shift to have its center at \\(\\hat{p} = 0.73\\) instead of at \\(p_0 = 0.8\\) and calculate its percentiles. The confidence interval produced via this method should be comparable to the one done using bootstrapping above. B.3.5 Traditional methods B.3.5.1 Check conditions Remember that in order to use the shortcut (formula-based, theoretical) approach, we need to check that some conditions are met. Independent observations: The observations are collected independently. The cases are selected independently through random sampling so this condition is met. Approximately normal: The number of expected successes and expected failures is at least 10. This condition is met since 73 and 27 are both greater than 10. 
B.3.5.2 Test statistic The test statistic is a random variable based on the sample data. Here, we want to look at a way to estimate the population proportion \\(\\pi\\). A good guess is the sample proportion \\(\\hat{P}\\). Recall that this sample proportion is actually a random variable that will vary as different samples are (theoretically, would be) collected. We are looking to see how likely is it for us to have observed a sample proportion of \\(\\hat{p}_{obs} = 0.73\\) or larger assuming that the population proportion is 0.80 (assuming the null hypothesis is true). If the conditions are met and assuming \\(H_0\\) is true, we can standardize this original test statistic of \\(\\hat{P}\\) into a \\(Z\\) statistic that follows a \\(N(0, 1)\\) distribution. \\[ Z =\\dfrac{ \\hat{P} - p_0}{\\sqrt{\\dfrac{p_0(1 - p_0)}{n} }} \\sim N(0, 1) \\] B.3.5.2.1 Observed test statistic While one could compute this observed test statistic by “hand” by plugging the observed values into the formula, the focus here is on the set-up of the problem and in understanding which formula for the test statistic applies. The calculation has been done in R below for completeness though: p_hat &lt;- 0.73 p0 &lt;- 0.8 n &lt;- 100 (z_obs &lt;- (p_hat - p0) / sqrt( (p0 * (1 - p0)) / n)) ## [1] -1.75 We see here that the \\(z_{obs}\\) value is around -1.75. Our observed sample proportion of 0.73 is 1.75 standard errors below the hypothesized parameter value of 0.8. B.3.5.3 Compute \\(p\\)-value 2 * pnorm(z_obs) ## [1] 0.08011831 The \\(p\\)-value—the probability of observing an \\(z_{obs}\\) value of -1.75 or more extreme (in both directions) in our null distribution—is around 8%. Note that we could also do this test directly using the prop.test function. 
stats::prop.test(x = table(elec$satisfy), n = length(elec$satisfy), alternative = &quot;two.sided&quot;, p = 0.8, correct = FALSE) ## ## 1-sample proportions test without continuity correction ## ## data: table(elec$satisfy), null probability 0.8 ## X-squared = 3.0625, df = 1, p-value = 0.08012 ## alternative hypothesis: true p is not equal to 0.8 ## 95 percent confidence interval: ## 0.6356788 0.8073042 ## sample estimates: ## p ## 0.73 prop.test does a \\(\\chi^2\\) test here but this matches up exactly with what we would expect: \\(\\chi^2_{obs} = 3.06 = (-1.75)^2 = (z_{obs})^2\\) and the \\(p\\)-values are the same because we are focusing on a two-tailed test. Note that the 95 percent confidence interval given above matches well with the one calculated using bootstrapping. B.3.5.4 State conclusion We, therefore, do not have sufficient evidence to reject the null hypothesis. Our initial guess that our observed sample proportion was not statistically greater than the hypothesized proportion has not been invalidated. Based on this sample, we do not have evidence that the proportion of all customers of the large electric utility satisfied with the service they receive is different from 0.80, at the 5% level. B.3.6 Comparing results Observing the bootstrap distribution and the null distribution that were created, it makes quite a bit of sense that the results are so similar for traditional and non-traditional methods in terms of the \\(p\\)-value and the confidence interval since these distributions look very similar to normal distributions. The conditions also being met leads us to better guess that using any of the methods whether they are traditional (formula-based) or non-traditional (computational-based) will lead to similar results. B.4 Two Proportions B.4.1 Problem Statement A 2010 survey asked 827 randomly sampled registered voters in California “Do you support? Or do you oppose? Drilling for oil and natural gas off the Coast of California? 
Or do you not know enough to say?” Conduct a hypothesis test to determine if the data provide strong evidence that the proportion of college graduates who do not have an opinion on this issue is different than that of non-college graduates. (Tweaked a bit from Diez, Barr, and Çetinkaya-Rundel 2014 [Chapter 6]) B.4.2 Competing Hypotheses B.4.2.1 In words Null hypothesis: There is no association between having an opinion on drilling and having a college degree for all registered California voters in 2010. Alternative hypothesis: There is an association between having an opinion on drilling and having a college degree for all registered California voters in 2010. B.4.2.2 Another way in words Null hypothesis: The probability that a Californian voter in 2010 having no opinion on drilling and is a college graduate is the same as that of a non-college graduate. Alternative hypothesis: These parameter probabilities are different. B.4.2.3 In symbols (with annotations) \\(H_0: \\pi_{college} = \\pi_{no\\_college}\\) or \\(H_0: \\pi_{college} - \\pi_{no\\_college} = 0\\), where \\(\\pi\\) represents the probability of not having an opinion on drilling. \\(H_A: \\pi_{college} - \\pi_{no\\_college} \\ne 0\\) B.4.2.4 Set \\(\\alpha\\) It’s important to set the significance level before starting the testing using the data. Let’s set the significance level at 5% here. 
B.4.3 Exploring the sample data #download.file(&quot;http://ismayc.github.io/teaching/sample_problems/offshore.csv&quot;, # destfile = &quot;data/offshore.csv&quot;, # method = &quot;curl&quot;) offshore &lt;- read_csv(&quot;data/offshore.csv&quot;) table(offshore$college_grad, offshore$response) ## ## no opinion opinion ## no 131 258 ## yes 104 334 (off_summ &lt;- offshore %&gt;% group_by(college_grad) %&gt;% summarize(prop_no_opinion = mean(response == &quot;no opinion&quot;), sample_size = n()) ) ## # A tibble: 2 × 3 ## college_grad prop_no_opinion sample_size ## &lt;chr&gt; &lt;dbl&gt; &lt;int&gt; ## 1 no 0.3367609 389 ## 2 yes 0.2374429 438 offshore %&gt;% ggplot(aes(x = college_grad, fill = response)) + geom_bar(position = &quot;fill&quot;) + coord_flip() B.4.3.1 Guess about statistical significance We are looking to see if a difference exists in the heights of the bars corresponding to no opinion for the plot. Based solely on the plot, we have little reason to believe that a difference exists since the bars seem to be about the same height, BUT…it’s important to use statistics to see if that difference is actually statistically significant! B.4.4 Non-traditional methods B.4.4.1 Collecting summary info Next we will assign some key values to variable names in R: phat_nograd &lt;- off_summ$prop_no_opinion[1] phat_grad &lt;- off_summ$prop_no_opinion[2] obs_diff &lt;- phat_grad - phat_nograd n_nograd &lt;- off_summ$sample_size[1] n_grad &lt;- off_summ$sample_size[2] B.4.4.2 Randomization for Hypothesis Test In order to look to see if the observed sample proportion of no opinion for non-college graduates of 0.3367609 is statistically different than that for college graduates of 0.2374429, we need to account for the sample sizes. Note that this is the same as looking to see if \\(\\hat{p}_{grad} - \\hat{p}_{nograd}\\) is statistically different than 0. We also need to determine a process that replicates how the original group sizes of 389 and 438 were selected. 
We can use the idea of randomization testing (also known as permutation testing) to simulate the population from which the sample came (with two groups of different sizes) and then generate samples using shuffling from that simulated population to account for sampling variability. set.seed(2016) many_shuffles &lt;- do(10000) * (offshore %&gt;% mutate(response = shuffle(response)) %&gt;% group_by(college_grad) %&gt;% summarize(prop_no_opinion = mean(response == &quot;no opinion&quot;)) ) null_distn &lt;- many_shuffles %&gt;% group_by(.index) %&gt;% summarize(diffprop = diff(prop_no_opinion)) null_distn %&gt;% ggplot(aes(x = diffprop)) + geom_histogram(bins = 25, color = &quot;white&quot;) We can next use this distribution to observe our \\(p\\)-value. Recall this is a two-tailed test so we will be looking for values that are less than or equal to -0.099318 or greater than or equal to 0.099318 for our \\(p\\)-value. null_distn %&gt;% ggplot(aes(x = diffprop)) + geom_histogram(bins = 20, color = &quot;white&quot;) + geom_vline(color = &quot;red&quot;, xintercept = obs_diff) + geom_vline(color = &quot;red&quot;, xintercept = -obs_diff) B.4.4.2.1 Calculate \\(p\\)-value pvalue &lt;- null_distn %&gt;% filter( (diffprop &lt;= obs_diff) | (diffprop &gt;= -obs_diff) ) %&gt;% nrow() / nrow(null_distn) pvalue ## [1] 0.0025 So our \\(p\\)-value is 0.0025 and we reject the null hypothesis at the 5% level. You can also see this from the histogram above that we are far into the tails of the null distribution. B.4.4.3 Bootstrapping for Confidence Interval We can also create a confidence interval for the unknown population parameter \\(\\pi_{college} - \\pi_{no\\_college}\\) using our sample data with bootstrapping. Here we will bootstrap each of the groups with replacement instead of shuffling. 
This is done using the groups argument in the resample function to fix the size of each group to be the same as the original group sizes of 389 for non-college graduates and 438 for college graduates. boot_props &lt;- do(10000) * offshore %&gt;% resample(replace = TRUE, groups = college_grad) %&gt;% group_by(college_grad) %&gt;% summarize(prop_no_opinion = mean(response == &quot;no opinion&quot;)) Next, we calculate the difference in sample proportions for each of the 10,000 replications: boot_distn &lt;- boot_props %&gt;% group_by(.index) %&gt;% summarize(diffprop = diff(prop_no_opinion)) boot_distn %&gt;% ggplot(aes(x = diffprop)) + geom_histogram(bins = 30, color = &quot;white&quot;) (ci_boot &lt;- boot_distn %&gt;% summarize(lower = quantile(diffprop, probs = 0.025), upper = quantile(diffprop, probs = 0.975))) ## # A tibble: 1 × 2 ## lower upper ## &lt;dbl&gt; &lt;dbl&gt; ## 1 -0.1595767 -0.03791979 We see that 0 is not contained in this confidence interval as a plausible value of \\(\\pi_{college} - \\pi_{no\\_college}\\) (the unknown population parameter). This matches with our hypothesis test results of rejecting the null hypothesis. Since zero is not a plausible value of the population parameter, we have evidence that the proportion of college graduates in California with no opinion on drilling is different than that of non-college graduates. Interpretation: We are 95% confident the true proportion of college graduates with no opinion on offshore drilling in California is between 0.16 smaller and 0.04 smaller than that of non-college graduates. Note: You could also use the null distribution based on randomization with a shift to have its center at \\(\\hat{p}_{college} - \\hat{p}_{no\\_college} = -0.1\\) instead of at 0 and calculate its percentiles. The confidence interval produced via this method should be comparable to the one done using bootstrapping above. 
B.4.5 Traditional methods B.4.6 Check conditions Remember that in order to use the short-cut (formula-based, theoretical) approach, we need to check that some conditions are met. Independent observations: Each case that was selected must be independent of all the other cases selected. This condition is met since cases were selected at random to observe. Sample size: The number of pooled successes and pooled failures must be at least 10 for each group. We need to first figure out the pooled success rate: \\[\\hat{p}_{obs} = \\dfrac{131 + 104}{827} = 0.28.\\] We now determine expected (pooled) success and failure counts: \\(0.28 \\cdot (131 + 258) = 108.92\\), \\(0.72 \\cdot (131 + 258) = 280.08\\) \\(0.28 \\cdot (104 + 334) = 122.64\\), \\(0.72 \\cdot (104 + 334) = 315.36\\) Independent selection of samples: The cases are not paired in any meaningful way. We have no reason to suspect that a college graduate selected would have any relationship to a non-college graduate selected. B.4.7 Test statistic The test statistic is a random variable based on the sample data. Here, we are interested in seeing if our observed difference in sample proportions corresponding to no opinion on drilling (\\(\\hat{p}_{college, obs} - \\hat{p}_{no\\_college, obs}\\) = -0.099318) is statistically different than 0. 
Assuming that conditions are met and the null hypothesis is true, we can use the standard normal distribution to standardize the difference in sample proportions (\\(\\hat{P}_{college} - \\hat{P}_{no\\_college}\\)) using the standard error of \\(\\hat{P}_{college} - \\hat{P}_{no\\_college}\\) and the pooled estimate: \\[ Z =\\dfrac{ (\\hat{P}_1 - \\hat{P}_2) - 0}{\\sqrt{\\dfrac{\\hat{P}(1 - \\hat{P})}{n_1} + \\dfrac{\\hat{P}(1 - \\hat{P})}{n_2} }} \\sim N(0, 1) \\] where \\(\\hat{P} = \\dfrac{\\text{total number of successes} }{ \\text{total number of cases}}.\\) B.4.7.1 Observed test statistic While one could compute this observed test statistic by “hand”, the focus here is on the set-up of the problem and in understanding which formula for the test statistic applies. We can use the prop.test function to perform this analysis for us. stats::prop.test(x = table(offshore$college_grad, offshore$response), n = nrow(offshore), alternative = &quot;two.sided&quot;, correct = FALSE) ## ## 2-sample test for equality of proportions without continuity ## correction ## ## data: table(offshore$college_grad, offshore$response) ## X-squared = 9.9907, df = 1, p-value = 0.001573 ## alternative hypothesis: two.sided ## 95 percent confidence interval: ## 0.03772522 0.16091078 ## sample estimates: ## prop 1 prop 2 ## 0.3367609 0.2374429 prop.test does a \\(\\chi^2\\) test here but this matches up exactly with what we would expect from the test statistic above since \\(Z^2 = \\chi^2\\) so \\(\\sqrt{9.99} = 3.16 = z_{obs}\\): The \\(p\\)-values are the same because we are focusing on a two-tailed test. The observed difference in sample proportions is 3.16 standard deviations larger than 0. The \\(p\\)-value—the probability of observing a \\(Z\\) value of 3.16 or more extreme in our null distribution—is 0.0016. This can also be calculated in R directly: 2 * pnorm(3.16, lower.tail = FALSE) ## [1] 0.001577691 The 95% confidence interval is also stated above in the prop.test results. 
B.4.8 State conclusion We, therefore, have sufficient evidence to reject the null hypothesis. Our initial guess that a statistically significant difference did not exist in the proportions of no opinion on offshore drilling between college educated and non-college educated Californians was not validated. We do have evidence to suggest that there is a dependency between college graduation and position on offshore drilling for Californians. B.4.9 Comparing results Observing the bootstrap distribution and the null distribution that were created, it makes quite a bit of sense that the results are so similar for traditional and non-traditional methods in terms of the \\(p\\)-value and the confidence interval since these distributions look very similar to normal distributions. The conditions also being met leads us to better guess that using any of the methods whether they are traditional (formula-based) or non-traditional (computational-based) will lead to similar results. B.5 Two Means (Independent Samples) B.5.1 Problem Statement Average income varies from one region of the country to another, and it often reflects both lifestyles and regional living expenses. Suppose a new graduate is considering a job in two locations, Cleveland, OH and Sacramento, CA, and he wants to see whether the average income in one of these cities is higher than the other. He would like to conduct a hypothesis test based on two randomly selected samples from the 2000 Census. (Tweaked a bit from Diez, Barr, and Çetinkaya-Rundel 2014 [Chapter 5]) B.5.2 Competing Hypotheses B.5.2.1 In words Null hypothesis: There is no association between income and location (Cleveland, OH and Sacramento, CA). Alternative hypothesis: There is an association between income and location (Cleveland, OH and Sacramento, CA). B.5.2.2 Another way in words Null hypothesis: The mean income is the same for both cities. 
Alternative hypothesis: The mean income is different for the two cities. B.5.2.3 In symbols (with annotations) \\(H_0: \\mu_{sac} = \\mu_{cle}\\) or \\(H_0: \\mu_{sac} - \\mu_{cle} = 0\\), where \\(\\mu\\) represents the average income. \\(H_A: \\mu_{sac} - \\mu_{cle} \\ne 0\\) B.5.2.4 Set \\(\\alpha\\) It’s important to set the significance level before starting the testing using the data. Let’s set the significance level at 5% here. B.5.3 Exploring the sample data inc_summ &lt;- cleSac %&gt;% group_by(metro_area) %&gt;% summarize(sample_size = n(), mean = mean(income), sd = sd(income), minimum = min(income), lower_quartile = quantile(income, 0.25), median = median(income), upper_quartile = quantile(income, 0.75), max = max(income)) kable(inc_summ) metro_area sample_size mean sd minimum lower_quartile median upper_quartile max Cleveland_ OH 212 27467.07 27680.68 0 8475 21000 35275 152400 Sacramento_ CA 175 32427.54 35773.63 0 8050 20000 49350 206900 The boxplot below also shows the mean for each group highlighted by the red dots. cleSac %&gt;% ggplot(aes(x = metro_area, y = income)) + geom_boxplot() + stat_summary(fun.y = &quot;mean&quot;, geom = &quot;point&quot;, color = &quot;red&quot;) B.5.3.1 Guess about statistical significance We are looking to see if a difference exists in the mean income of the two levels of the explanatory variable. Based solely on the boxplot, we have reason to believe that no difference exists. The distributions of income seem similar and the means fall in roughly the same place. 
B.5.4 Non-traditional methods B.5.4.1 Collecting summary info Next we will assign some key values to variable names in R: xbar_cle &lt;- inc_summ$mean[1] xbar_sac &lt;- inc_summ$mean[2] obs_diff &lt;- xbar_sac - xbar_cle n_cle &lt;- inc_summ$sample_size[1] n_sac &lt;- inc_summ$sample_size[2] B.5.4.2 Randomization for Hypothesis Test In order to look to see if the observed sample mean for Cleveland of 27467.0660377 is statistically different than that for Sacramento of 32427.5428571, we need to account for the sample sizes. Note that this is the same as looking to see if \\(\\bar{x}_{sac} - \\bar{x}_{cle}\\) is statistically different than 0. We also need to determine a process that replicates how the original group sizes of 212 and 175 were selected. We can use the idea of randomization testing (also known as permutation testing) to simulate the population from which the sample came (with two groups of different sizes) and then generate samples using shuffling from that simulated population to account for sampling variability. set.seed(2016) many_shuffles &lt;- do(10000) * (cleSac %&gt;% mutate(income = shuffle(income)) %&gt;% group_by(metro_area) %&gt;% summarize(mean_inc = mean(income)) ) null_distn &lt;- many_shuffles %&gt;% group_by(.index) %&gt;% summarize(diffmean = diff(mean_inc)) null_distn %&gt;% ggplot(aes(x = diffmean)) + geom_histogram(bins = 30, color = &quot;white&quot;) We can next use this distribution to observe our \\(p\\)-value. Recall this is a two-tailed test so we will be looking for values that are greater than or equal to 4960.4768194 or less than or equal to -4960.4768194 for our \\(p\\)-value. 
null_distn %&gt;% ggplot(aes(x = diffmean)) + geom_histogram(bins = 30, color = &quot;white&quot;) + geom_vline(color = &quot;red&quot;, xintercept = obs_diff) + geom_vline(color = &quot;red&quot;, xintercept = -obs_diff) B.5.4.2.1 Calculate \\(p\\)-value pvalue &lt;- null_distn %&gt;% filter( (diffmean &gt;= obs_diff) | (diffmean &lt;= -obs_diff) ) %&gt;% nrow() / nrow(null_distn) pvalue ## [1] 0.1225 So our \\(p\\)-value is 0.1225 and we fail to reject the null hypothesis at the 5% level. You can also see this from the histogram above that we are not very far into the tail of the null distribution. B.5.4.3 Bootstrapping for Confidence Interval We can also create a confidence interval for the unknown population parameter \\(\\mu_{sac} - \\mu_{cle}\\) using our sample data with bootstrapping. Here we will bootstrap each of the groups with replacement instead of shuffling. This is done using the groups argument in the resample function to fix the size of each group to be the same as the original group sizes of 175 for Sacramento and 212 for Cleveland. boot_means &lt;- do(10000) * cleSac %&gt;% resample(replace = TRUE, groups = metro_area) %&gt;% group_by(metro_area) %&gt;% summarize(mean_inc = mean(income)) Next, we calculate the difference in sample means for each of the 10,000 replications: boot_distn &lt;- boot_means %&gt;% group_by(.index) %&gt;% summarize(diffmean = diff(mean_inc)) boot_distn %&gt;% ggplot(aes(x = diffmean)) + geom_histogram(bins = 30, color = &quot;white&quot;) (ci_boot &lt;- boot_distn %&gt;% summarize(lower = quantile(diffmean, probs = 0.025), upper = quantile(diffmean, probs = 0.975))) ## # A tibble: 1 × 2 ## lower upper ## &lt;dbl&gt; &lt;dbl&gt; ## 1 -1512.59 11458.85 We see that 0 is contained in this confidence interval as a plausible value of \\(\\mu_{sac} - \\mu_{cle}\\) (the unknown population parameter). This matches with our hypothesis test results of failing to reject the null hypothesis. 
Since zero is a plausible value of the population parameter, we do not have evidence that Sacramento incomes are different than Cleveland incomes. Interpretation: We are 95% confident the true mean yearly income for those living in Sacramento is between 1512.59 dollars smaller to 11458.85 dollars higher than for Cleveland. Note: You could also use the null distribution based on randomization with a shift to have its center at \\(\\bar{x}_{sac} - \\bar{x}_{cle} = \\$4960.48\\) instead of at 0 and calculate its percentiles. The confidence interval produced via this method should be comparable to the one done using bootstrapping above. B.5.5 Traditional methods B.5.5.0.1 Check conditions Remember that in order to use the short-cut (formula-based, theoretical) approach, we need to check that some conditions are met. Independent observations: The observations are independent in both groups. This condition is met since the cases are randomly selected from each city. Approximately normal: The distribution of the response for each group should be normal or the sample sizes should be at least 30. cleSac %&gt;% ggplot(aes(x = income)) + geom_histogram(color = &quot;white&quot;, binwidth = 20000) + facet_wrap(~ metro_area) We have some reason to doubt the normality assumption here since both the histograms show deviation from a normal model fitting the data well for each group. The sample sizes for each group are greater than 100 though so the assumptions should still apply. Independent samples: The samples should be collected without any natural pairing. There is no mention of there being a relationship between those selected in Cleveland and in Sacramento. B.5.6 Test statistic The test statistic is a random variable based on the sample data. Here, we are interested in seeing if our observed difference in sample means (\\(\\bar{x}_{sac, obs} - \\bar{x}_{cle, obs}\\) = 4960.4768194) is statistically different than 0. 
Assuming that conditions are met and the null hypothesis is true, we can use the \\(t\\) distribution to standardize the difference in sample means (\\(\\bar{X}_{sac} - \\bar{X}_{cle}\\)) using the approximate standard error of \\(\\bar{X}_{sac} - \\bar{X}_{cle}\\) (invoking \\(S_{sac}\\) and \\(S_{cle}\\) as estimates of unknown \\(\\sigma_{sac}\\) and \\(\\sigma_{cle}\\)). \\[ T =\\dfrac{ (\\bar{X}_1 - \\bar{X}_2) - 0}{ \\sqrt{\\dfrac{S_1^2}{n_1} + \\dfrac{S_2^2}{n_2}} } \\sim t (df = min(n_1 - 1, n_2 - 1)) \\] where 1 = Sacramento and 2 = Cleveland with \\(S_1^2\\) and \\(S_2^2\\) the sample variance of the incomes of both cities, respectively, and \\(n_1 = 175\\) for Sacramento and \\(n_2 = 212\\) for Cleveland. B.5.6.1 Observed test statistic Note that we could also do (ALMOST) this test directly using the t.test function. The x and y arguments are expected to both be numeric vectors here so we’ll need to appropriately filter our data sets. cleveland &lt;- cleSac %&gt;% filter(metro_area == &quot;Cleveland_ OH&quot;) sacramento &lt;- cleSac %&gt;% filter(metro_area != &quot;Cleveland_ OH&quot;) t.test(y = cleveland$income, x = sacramento$income, alternative = &quot;two.sided&quot;) ## ## Welch Two Sample t-test ## ## data: sacramento$income and cleveland$income ## t = 1.5006, df = 323.36, p-value = 0.1344 ## alternative hypothesis: true difference in means is not equal to 0 ## 95 percent confidence interval: ## -1542.758 11463.712 ## sample estimates: ## mean of x mean of y ## 32427.54 27467.07 Note that the degrees of freedom reported above are different than what we used above in specifying the Test Statistic. The degrees of freedom used here is also known as the Satterthwaite approximation and involves a quite complicated formula. For most problems, the much simpler smaller of sample sizes minus one will suffice. 
While one could compute this observed test statistic by “hand”, the focus here is on the set-up of the problem and in understanding which formula for the test statistic applies. We see here that the observed test statistic value is around 1.5 with \\(df = min(212 - 1, 175 - 1) = 174\\). Recall that for large degrees of freedom, the \\(t\\) distribution is roughly equal to the standard normal curve so our difference in df for the Satterthwaite and “min” variations doesn’t really matter. B.5.7 Compute \\(p\\)-value The \\(p\\)-value—the probability of observing a \\(t_{174}\\) value of 1.501 or more extreme (in both directions) in our null distribution—is 0.13. This can also be calculated in R directly: 2 * pt(-1.501, df = min(212 - 1, 175 - 1), lower.tail = TRUE) ## [1] 0.135168 We can also approximate by using the standard normal curve: 2 * pnorm(-1.501) ## [1] 0.1333556 Note that the 95 percent confidence interval given above matches well with the one calculated using bootstrapping. B.5.8 State conclusion We, therefore, do not have sufficient evidence to reject the null hypothesis. Our initial guess that a statistically significant difference did not exist in the means was backed by this statistical analysis. We do not have evidence to suggest that the true mean income differs between Cleveland, OH and Sacramento, CA based on this data. B.5.9 Comparing results Observing the bootstrap distribution and the null distribution that were created, it makes quite a bit of sense that the results are so similar for traditional and non-traditional methods in terms of the \\(p\\)-value and the confidence interval since these distributions look very similar to normal distributions. The conditions also being met leads us to better guess that using any of the methods whether they are traditional (formula-based) or non-traditional (computational-based) will lead to similar results. 
B.6 Two Means (Paired Samples) B.6.0.1 Problem Statement Trace metals in drinking water affect the flavor and an unusually high concentration can pose a health hazard. Ten pairs of data were taken measuring zinc concentration in bottom water and surface water at 10 randomly selected locations on a stretch of river. Do the data suggest that the true average concentration in the surface water is smaller than that of bottom water? (Note that units are not given.) [Tweaked a bit from https://onlinecourses.science.psu.edu/stat500/node/51] B.6.1 Competing Hypotheses B.6.1.1 In words Null hypothesis: The mean concentration in the bottom water is the same as that of the surface water at different paired locations. Alternative hypothesis: The mean concentration in the surface water is smaller than that of the bottom water at different paired locations. B.6.1.2 In symbols (with annotations) \\(H_0: \\mu_{diff} = 0\\), where \\(\\mu_{diff}\\) represents the mean difference in concentration for surface water minus bottom water. \\(H_A: \\mu_{diff} &lt; 0\\) B.6.1.3 Set \\(\\alpha\\) It’s important to set the significance level before starting the testing using the data. Let’s set the significance level at 5% here. 
B.6.2 Exploring the sample data #download.file(&quot;http://ismayc.github.io/teaching/sample_problems/zinc_tidy.csv&quot;, # destfile = &quot;data/zinc_tidy.csv&quot;, # method = &quot;curl&quot;) zinc_tidy &lt;- read_csv(&quot;data/zinc_tidy.csv&quot;) We want to look at the differences in surface - bottom for each location: zinc_diff &lt;- zinc_tidy %&gt;% group_by(loc_id) %&gt;% summarize(pair_diff = diff(concentration)) zinc_summ &lt;- zinc_diff %&gt;% summarize(sample_size = n(), mean = mean(pair_diff), sd = sd(pair_diff), minimum = min(pair_diff), lower_quartile = quantile(pair_diff, 0.25), median = median(pair_diff), upper_quartile = quantile(pair_diff, 0.75), max = max(pair_diff)) kable(zinc_summ) sample_size mean sd minimum lower_quartile median upper_quartile max 10 -0.0804 0.0522732 -0.177 -0.11 -0.084 -0.0355 -0.015 The histogram below also shows the distribution of pair_diff. zinc_diff %&gt;% ggplot(aes(x = pair_diff)) + geom_histogram(binwidth = 0.04, color = &quot;white&quot;) B.6.2.1 Guess about statistical significance We are looking to see if the sample paired mean difference of -0.0804 is statistically less than 0. They seem to be quite close, but we have a small number of pairs here. Let’s guess that we will fail to reject the null hypothesis. B.6.3 Non-traditional methods B.6.3.1 Collecting summary info Next we will assign some key values to variable names in R: obs_diff &lt;- zinc_summ$mean n_pairs &lt;- zinc_summ$sample_size B.6.3.2 Randomization for Hypothesis Test In order to look to see if the observed sample mean difference \\(\\bar{x}_{diff} = -0.0804\\) is statistically less than 0, we need to account for the number of pairs. We also need to determine a process that replicates how the paired data was selected in a way similar to how we calculated our original difference in sample means. 
We can use the idea of randomization testing (also known as permutation testing) to simulate the population from which the sample came and then generate samples using shuffling from that simulated population to account for sampling variability. In this case, we will shuffle along each paired location. So values that were on the bottom of location 1 may now be switched to be on the surface or vice versa. library(mosaic) set.seed(2016) many_shuffles &lt;- do(10000) * (zinc_tidy %&gt;% mutate(concentration = shuffle(concentration, groups = loc_id)) %&gt;% group_by(loc_id) %&gt;% summarize(pair_diff = diff(concentration)) ) null_distn &lt;- many_shuffles %&gt;% group_by(.index) %&gt;% summarize(mean_diff = mean(pair_diff)) null_distn %&gt;% ggplot(aes(x = mean_diff)) + geom_histogram(bins = 30, color = &quot;white&quot;) We can next use this distribution to observe our \\(p\\)-value. Recall this is a left-tailed test so we will be looking for values that are less than or equal to -0.0804 for our \\(p\\)-value. null_distn %&gt;% ggplot(aes(x = mean_diff)) + geom_histogram(bins = 30, color = &quot;white&quot;) + geom_vline(color = &quot;red&quot;, xintercept = obs_diff) B.6.3.2.1 Calculate \\(p\\)-value pvalue &lt;- null_distn %&gt;% filter(mean_diff &lt;= obs_diff) %&gt;% nrow() / nrow(null_distn) pvalue ## [1] 0.0009 So our \\(p\\)-value is essentially 0.0009 and we reject the null hypothesis at the 5% level. You can also see this from the histogram above that we are far into the left tail of the null distribution. B.6.3.3 Bootstrapping for Confidence Interval We can also create a confidence interval for the unknown population parameter \\(\\mu_{diff}\\) using our sample data (the calculated differences) with bootstrapping. This is similar to the bootstrapping done in a one sample mean case, except now our data is differences instead of raw numerical data. 
boot_distn &lt;- do(10000) * resample(zinc_diff, replace = TRUE) %&gt;% summarize(mean_diff = mean(pair_diff)) boot_distn %&gt;% ggplot(aes(x = mean_diff)) + geom_histogram(bins = 30, color = &quot;white&quot;) (ci_boot &lt;- boot_distn %&gt;% summarize(lower = quantile(mean_diff, probs = 0.025), upper = quantile(mean_diff, probs = 0.975))) ## lower upper ## 1 -0.1114 -0.0504975 We see that 0 is not contained in this confidence interval as a plausible value of \\(\\mu_{diff}\\) (the unknown population parameter). This matches with our hypothesis test results of rejecting the null hypothesis. Since zero is not a plausible value of the population parameter and since the entire confidence interval falls below zero, we have evidence that surface zinc concentration levels are lower, on average, than bottom level zinc concentrations. Interpretation: We are 95% confident the true mean zinc concentration on the surface is between 0.11 units smaller to 0.05 units smaller than on the bottom. Note: You could also use the null distribution based on randomization with a shift to have its center at \\(\\bar{x}_{diff} = -0.08\\) instead of at 0 and calculate its percentiles. The confidence interval produced via this method should be comparable to the one done using bootstrapping above. B.6.4 Traditional methods B.6.4.1 Check conditions Remember that in order to use the shortcut (formula-based, theoretical) approach, we need to check that some conditions are met. Independent observations: The observations among pairs are independent. The locations are selected independently through random sampling so this condition is met. Approximately normal: The distribution of population of differences is normal or the number of pairs is at least 30. The histogram above does show some skew so we have reason to doubt the population being normal based on this sample. We also only have 10 pairs which is fewer than the 30 needed. A theory-based test may not be valid here. 
B.6.4.2 Test statistic The test statistic is a random variable based on the sample data. Here, we want to look at a way to estimate the population mean difference \\(\\mu_{diff}\\). A good guess is the sample mean difference \\(\\bar{X}_{diff}\\). Recall that this sample mean is actually a random variable that will vary as different samples are (theoretically, would be) collected. We are looking to see how likely it is for us to have observed a sample mean of \\(\\bar{x}_{diff, obs} = -0.0804\\) or smaller assuming that the population mean difference is 0 (assuming the null hypothesis is true). If the conditions are met and assuming \\(H_0\\) is true, we can “standardize” this original test statistic of \\(\\bar{X}_{diff}\\) into a \\(T\\) statistic that follows a \\(t\\) distribution with degrees of freedom equal to \\(df = n - 1\\): \\[ T =\\dfrac{ \\bar{X}_{diff} - 0}{ S_{diff} / \\sqrt{n} } \\sim t (df = n - 1) \\] where \\(S_{diff}\\) represents the standard deviation of the sample differences and \\(n\\) is the number of pairs. B.6.4.2.1 Observed test statistic While one could compute this observed test statistic by “hand”, the focus here is on the set-up of the problem and in understanding which formula for the test statistic applies. We can use the t.test function on the differences to perform this analysis for us. stats::t.test(x = zinc_diff$pair_diff, alternative = &quot;less&quot;, mu = 0) ## ## One Sample t-test ## ## data: zinc_diff$pair_diff ## t = -4.8638, df = 9, p-value = 0.0004456 ## alternative hypothesis: true mean is less than 0 ## 95 percent confidence interval: ## -Inf -0.0500982 ## sample estimates: ## mean of x ## -0.0804 We see here that the \\(t_{obs}\\) value is around -5. B.6.4.3 Compute \\(p\\)-value The \\(p\\)-value—the probability of observing a \\(t_{obs}\\) value of -5 or less in our null distribution of a \\(t\\) with 9 degrees of freedom—is 0.0004. 
This can also be calculated in R directly: pt(-5, df = nrow(zinc_diff) - 1, lower.tail = TRUE) ## [1] 0.000369484 B.6.4.4 State conclusion We, therefore, have sufficient evidence to reject the null hypothesis. Our initial guess that our observed sample mean difference was not statistically less than the hypothesized mean of 0 has been invalidated here. Based on this sample, we have evidence that the mean concentration in the bottom water is greater than that of the surface water at different paired locations. B.6.5 Comparing results Observing the bootstrap distribution and the null distribution that were created, it makes quite a bit of sense that the results are so similar for traditional and non-traditional methods in terms of the \\(p\\)-value and the confidence interval since these distributions look very similar to normal distributions. The conditions were not met since the number of pairs was small, but the sample data was not highly skewed. Using any of the methods whether they are traditional (formula-based) or non-traditional (computational-based) lead to similar results. References "],
-["C-appendixC.html", "C Reach for the Starts C.1 Sorted barplots C.2 Interactive graphics", " C Reach for the Starts C.1 Sorted barplots Building upon the example in Section ??: library(nycflights13) library(ggplot2) library(dplyr) flights_table &lt;- table(flights$carrier) flights_table ## ## 9E AA AS B6 DL EV F9 FL HA MQ OO UA ## 18460 32729 714 54635 48110 54173 685 3260 342 26397 32 58665 ## US VX WN YV ## 20536 5162 12275 601 We can sort this table from highest to lowest counts by using the sort function: sorted_flights &lt;- sort(flights_table, decreasing = TRUE) names(sorted_flights) ## [1] &quot;UA&quot; &quot;B6&quot; &quot;EV&quot; &quot;DL&quot; &quot;AA&quot; &quot;MQ&quot; &quot;US&quot; &quot;9E&quot; &quot;WN&quot; &quot;VX&quot; &quot;FL&quot; &quot;AS&quot; &quot;F9&quot; ## [14] &quot;YV&quot; &quot;HA&quot; &quot;OO&quot; It is often preferred for barplots to be ordered corresponding to the heights of the bars. This allows the reader to more easily compare the ordering of different airlines in terms of departed flights (Robbins 2013). We can also much more easily answer questions like “How many airlines have more departing flights than Southwest Airlines?”. We can use the sorted table giving the number of flights defined as sorted_flights to reorder the carrier. ggplot(data = flights, mapping = aes(x = carrier)) + geom_bar() + scale_x_discrete(limits = names(sorted_flights)) Figure C.1: Number of flights departing NYC in 2013 by airline - Descending numbers The last addition here specifies the values of the horizontal x axis on a discrete scale to correspond to those given by the entries of sorted_flights. C.2 Interactive graphics C.2.1 Interactive line-graphs Another useful tool for viewing line-graphs such as this is the dygraph function in the dygraphs package in combination with the dyRangeSelector function. 
This allows us to zoom in on a selected range and get an interactive plot for us to work with: library(dygraphs) flights_day &lt;- mutate(flights, date = as.Date(time_hour)) flights_summarized &lt;- flights_day %&gt;% group_by(date) %&gt;% summarize(median_arr_delay = median(arr_delay, na.rm = TRUE)) rownames(flights_summarized) &lt;- flights_summarized$date flights_summarized &lt;- select(flights_summarized, -date) dyRangeSelector(dygraph(flights_summarized)) The syntax here is a little different than what we have covered so far. The dygraph function is expecting for the dates to be given as the rownames of the object. We then remove the date variable from the flights_summarized dataframe since it is accounted for in the rownames. Lastly, we run the dygraph function on the new dataframe that only contains the median arrival delay as a column and then provide the ability to have a selector to zoom in on the interactive plot via dyRangeSelector. (Note that this plot will only be interactive in the HTML version of this book.) References "],
+["B-appendixB.html", "B Inference Examples Needed packages B.1 Inference Mind Map B.2 One Mean B.3 One Proportion B.4 Two Proportions B.5 Two Means (Independent Samples) B.6 Two Means (Paired Samples)", " B Inference Examples This appendix is designed to provide you with examples of the five basic hypothesis tests and their corresponding confidence intervals. Traditional theory-based methods as well as computational-based methods are presented. Needed packages library(dplyr) library(ggplot2) library(mosaic) library(knitr) library(readr) B.1 Inference Mind Map To help you better navigate and choose the appropriate analysis, we’ve created a mind map on http://coggle.it available here and below. Figure B.1: Mind map for Inference B.2 One Mean B.2.1 Problem Statement The National Survey of Family Growth conducted by the Centers for Disease Control gathers information on family life, marriage and divorce, pregnancy, infertility, use of contraception, and men’s and women’s health. One of the variables collected on this survey is the age at first marriage. 5,534 randomly sampled US women between 2006 and 2010 completed the survey. The women sampled here had been married at least once. Do we have evidence that the mean age of first marriage for all US women from 2006 to 2010 is greater than 23 years? (Tweaked a bit from Diez, Barr, and Çetinkaya-Rundel 2014 [Chapter 4]) B.2.2 Competing Hypotheses B.2.2.1 In words Null hypothesis: The mean age of first marriage for all US women from 2006 to 2010 is equal to 23 years. Alternative hypothesis: The mean age of first marriage for all US women from 2006 to 2010 is greater than 23 years. B.2.2.2 In symbols (with annotations) \\(H_0: \\mu = \\mu_{0}\\), where \\(\\mu\\) represents the mean age of first marriage for all US women from 2006 to 2010 and \\(\\mu_0\\) is 23. \\(H_A: \\mu &gt; 23\\) B.2.2.3 Set \\(\\alpha\\) It’s important to set the significance level before starting the testing using the data. 
Let’s set the significance level at 5% here. B.2.3 Exploring the sample data #download.file(&quot;http://ismayc.github.io/teaching/sample_problems/ageAtMar.csv&quot;, # destfile = &quot;data/ageAtMar.csv&quot;, # method = &quot;curl&quot;) ageAtMar &lt;- read_csv(&quot;data/ageAtMar.csv&quot;) age_summ &lt;- ageAtMar %&gt;% summarize(sample_size = n(), mean = mean(age), sd = sd(age), minimum = min(age), lower_quartile = quantile(age, 0.25), median = median(age), upper_quartile = quantile(age, 0.75), max = max(age)) kable(age_summ) sample_size mean sd minimum lower_quartile median upper_quartile max 5534 23.44 4.721 10 20 23 26 43 The histogram below also shows the distribution of age. ageAtMar %&gt;% ggplot(aes(x = age)) + geom_histogram(binwidth = 3, color = &quot;white&quot;) B.2.3.1 Guess about statistical significance We are looking to see if the observed sample mean of 23.4402 is statistically greater than \\(\\mu_0 = 23\\). They seem to be quite close, but we have a large sample size here. Let’s guess that the large sample size will lead us to reject this practically small difference. B.2.4 Non-traditional methods B.2.4.1 Bootstrapping for Hypothesis Test In order to look to see if the observed sample mean of 23.4402 is statistically greater than \\(\\mu_0 = 23\\), we need to account for the sample size. We also need to determine a process that replicates how the original sample of size 5534 was selected. We can use the idea of bootstrapping to simulate the population from which the sample came and then generate samples from that simulated population to account for sampling variability. 
Recall how bootstrapping would apply in this context: Sample with replacement from our original sample of 5534 women and repeat this process 10,000 times, calculate the mean for each of the 10,000 bootstrap samples created in Step 1., combine all of these bootstrap statistics calculated in Step 2 into a boot_distn object, and shift the center of this distribution over to the null value of 23. (This is needed since it will be centered at 23.4402 via the process of bootstrapping.) set.seed(2016) mu0 &lt;- 23 shift &lt;- mu0 - age_summ$mean null_distn &lt;- do(10000) * resample(ageAtMar, replace = TRUE) %&gt;% mutate(age = age + shift) %&gt;% summarize(mean_age = mean(age)) null_distn %&gt;% ggplot(aes(x = mean_age)) + geom_histogram(bins = 30, color = &quot;white&quot;) We can next use this distribution to observe our \\(p\\)-value. Recall this is a right-tailed test so we will be looking for values that are greater than or equal to 23.4402 for our \\(p\\)-value. obs_mean &lt;- age_summ$mean null_distn %&gt;% ggplot(aes(x = mean_age)) + geom_histogram(bins = 30, color = &quot;white&quot;) + geom_vline(color = &quot;red&quot;, xintercept = obs_mean) B.2.4.1.1 Calculate \\(p\\)-value pvalue &lt;- null_distn %&gt;% filter( mean_age &gt;= obs_mean ) %&gt;% nrow() / nrow(null_distn) pvalue ## [1] 0 So our \\(p\\)-value is 0 and we reject the null hypothesis at the 5% level. You can also see this from the histogram above that we are far into the tail of the null distribution. B.2.4.2 Bootstrapping for Confidence Interval We can also create a confidence interval for the unknown population parameter \\(\\mu\\) using our sample data using bootstrapping. Note that we don’t need to shift this distribution since we want the center of our confidence interval to be our point estimate \\(\\bar{x}_{obs} = 23.4402\\). 
boot_distn &lt;- do(10000) * resample(ageAtMar, replace = TRUE) %&gt;% summarize(mean_age = mean(age)) boot_distn %&gt;% ggplot(aes(x = mean_age)) + geom_histogram(bins = 30, color = &quot;white&quot;) boot_distn %&gt;% summarize(lower = quantile(mean_age, probs = 0.025), upper = quantile(mean_age, probs = 0.975)) ## lower upper ## 1 23.32 23.56 We see that 23 is not contained in this confidence interval as a plausible value of \\(\\mu\\) (the unknown population mean) and the entire interval is larger than 23. This matches with our hypothesis test results of rejecting the null hypothesis in favor of the alternative (\\(\\mu &gt; 23\\)). Interpretation: We are 95% confident the true mean age of first marriage for all US women from 2006 to 2010 is between 23.32 and 23.56 years. B.2.5 Traditional methods B.2.5.1 Check conditions Remember that in order to use the shortcut (formula-based, theoretical) approach, we need to check that some conditions are met. Independent observations: The observations are collected independently. The cases are selected independently through random sampling so this condition is met. Approximately normal: The distribution of the response variable should be normal or the sample size should be at least 30. The histogram for the sample above does show some skew. The Q-Q plot below also shows some skew. ggplot(data = ageAtMar, mapping = aes(sample = age)) + stat_qq() The sample size here is quite large though (\\(n = 5534\\)) so both conditions are met. B.2.5.2 Test statistic The test statistic is a random variable based on the sample data. Here, we want to look at a way to estimate the population mean \\(\\mu\\). A good guess is the sample mean \\(\\bar{X}\\). Recall that this sample mean is actually a random variable that will vary as different samples are (theoretically, would be) collected. 
We are looking to see how likely it is for us to have observed a sample mean of \\(\\bar{x}_{obs} = 23.4402\\) or larger assuming that the population mean is 23 (assuming the null hypothesis is true). If the conditions are met and assuming \\(H_0\\) is true, we can “standardize” this original test statistic of \\(\\bar{X}\\) into a \\(T\\) statistic that follows a \\(t\\) distribution with degrees of freedom equal to \\(df = n - 1\\): \\[ T =\\dfrac{ \\bar{X} - \\mu_0}{ S / \\sqrt{n} } \\sim t (df = n - 1) \\] where \\(S\\) represents the standard deviation of the sample and \\(n\\) is the sample size. B.2.5.2.1 Observed test statistic While one could compute this observed test statistic by “hand”, the focus here is on the set-up of the problem and in understanding which formula for the test statistic applies. We can use the t.test function to perform this analysis for us. t.test(x = ageAtMar$age, alternative = &quot;greater&quot;, mu = 23) ## ## One Sample t-test ## ## data: ageAtMar$age ## t = 6.9, df = 5500, p-value = 0.000000000002 ## alternative hypothesis: true mean is greater than 23 ## 95 percent confidence interval: ## 23.34 Inf ## sample estimates: ## mean of x ## 23.44 We see here that the \\(t_{obs}\\) value is around 6.94. Recall that for large sample sizes the \\(t\\) distribution is essentially the standard normal distribution and this is why the statistic is reported as Z. B.2.5.3 Compute \\(p\\)-value The \\(p\\)-value—the probability of observing a \\(t_{obs}\\) value of 6.94 or more in our null distribution of a \\(t\\) with 5533 degrees of freedom—is essentially 0. This can also be calculated in R directly: pt(6.936, df = nrow(ageAtMar) - 1, lower.tail = FALSE) ## [1] 0.000000000002247 We can also use the \\(N(0, 1)\\) distribution here: pnorm(6.936, lower.tail = FALSE) ## [1] 0.000000000002017 B.2.5.4 State conclusion We, therefore, have sufficient evidence to reject the null hypothesis. 
Our initial guess that our observed sample mean was statistically greater than the hypothesized mean has supporting evidence here. Based on this sample, we have evidence that the mean age of first marriage for all US women from 2006 to 2010 is greater than 23 years. B.2.5.5 Confidence interval The confidence interval reported above with t.test is known as a one-sided confidence interval and gives the lowest value one could expect \\(\\mu\\) to be with 95% confidence. We usually want a range of values so we can use alternative = &quot;two.sided&quot; to get values similar to those from the bootstrapping process: t.test(x = ageAtMar$age, alternative = &quot;two.sided&quot;, mu = 23)$conf ## [1] 23.32 23.56 ## attr(,&quot;conf.level&quot;) ## [1] 0.95 B.2.6 Comparing results Observing the bootstrap distribution that was created, it makes quite a bit of sense that the results are so similar for traditional and non-traditional methods in terms of the \\(p\\)-value and the confidence interval since these distributions look very similar to normal distributions. The conditions also being met (the large sample size was the driver here) leads us to better guess that using any of the methods whether they are traditional (formula-based) or non-traditional (computational-based) will lead to similar results. B.3 One Proportion B.3.1 Problem Statement The CEO of a large electric utility claims that 80 percent of his 1,000,000 customers are satisfied with the service they receive. To test this claim, the local newspaper surveyed 100 customers, using simple random sampling. 73 were satisfied and the remaining were unsatisfied. Based on these findings from the sample, can we reject the CEO’s hypothesis that 80% of the customers are satisfied? 
[Tweaked a bit from http://stattrek.com/hypothesis-test/proportion.aspx?Tutorial=AP] B.3.2 Competing Hypotheses B.3.2.1 In words Null hypothesis: The proportion of all customers of the large electric utility satisfied with service they receive is equal to 0.80. Alternative hypothesis: The proportion of all customers of the large electric utility satisfied with service they receive is different from 0.80. B.3.2.2 In symbols (with annotations) \\(H_0: \\pi = p_{0}\\), where \\(\\pi\\) represents the proportion of all customers of the large electric utility satisfied with service they receive and \\(p_0\\) is 0.8. \\(H_A: \\pi \\ne 0.8\\) B.3.2.3 Set \\(\\alpha\\) It’s important to set the significance level before starting the testing using the data. Let’s set the significance level at 5% here. B.3.3 Exploring the sample data elec &lt;- c(rep(&quot;satisfied&quot;, 73), rep(&quot;unsatisfied&quot;, 27)) %&gt;% as_data_frame() %&gt;% rename(&quot;satisfy&quot; = value) The bar graph below also shows the distribution of satisfy. ggplot(data = elec, aes(x = satisfy)) + geom_bar() B.3.3.1 Guess about statistical significance We are looking to see if the sample proportion of 0.73 is statistically different from \\(p_0 = 0.8\\) based on this sample. They seem to be quite close, and our sample size is not huge here (\\(n = 100\\)). Let’s guess that we do not have evidence to reject the null hypothesis. B.3.4 Non-traditional methods B.3.4.1 Simulation for Hypothesis Test In order to look to see if 0.73 is statistically different from 0.8, we need to account for the sample size. We also need to determine a process that replicates how the original sample of size 100 was selected. We can use the idea of an unfair coin to simulate this process. We will simulate flipping an unfair coin (with probability of success 0.8 matching the null hypothesis) 100 times. Then we will keep track of how many heads come up in those 100 flips. 
Our simulated statistic matches with how we calculated the original statistic \\(\\hat{p}\\): the number of heads (satisfied) out of our total sample of 100. We then repeat this process many times (say 10,000) to create the null distribution looking at the simulated proportions of successes: set.seed(2016) null_distn &lt;- do(10000) * rflip(100, prob = 0.8) null_distn %&gt;% ggplot(aes(x = prop)) + geom_histogram(bins = 30, color = &quot;white&quot;) We can next use this distribution to observe our \\(p\\)-value. Recall this is a two-tailed test so we will be looking for values that are 0.8 - 0.73 = 0.07 away from 0.8 in BOTH directions for our \\(p\\)-value: p_hat &lt;- 73/100 dist &lt;- 0.8 - p_hat null_distn %&gt;% ggplot(aes(x = prop)) + geom_histogram(bins = 30, color = &quot;white&quot;) + geom_vline(color = &quot;red&quot;, xintercept = 0.8 + dist) + geom_vline(color = &quot;red&quot;, xintercept = p_hat) B.3.4.1.1 Calculate \\(p\\)-value pvalue &lt;- null_distn %&gt;% filter( (prop &gt;= 0.8 + dist) | (prop &lt;= p_hat) ) %&gt;% nrow() / nrow(null_distn) pvalue ## [1] 0.081 So our \\(p\\)-value is 0.081 and we fail to reject the null hypothesis at the 5% level. B.3.4.2 Bootstrapping for Confidence Interval We can also create a confidence interval for the unknown population parameter \\(\\pi\\) using our sample data. To do so, we use bootstrapping, which involves sampling with replacement from our original sample of 100 survey respondents and repeating this process 10,000 times, calculating the proportion of successes for each of the 10,000 bootstrap samples created in Step 1., combining all of these bootstrap statistics calculated in Step 2 into a boot_distn object, identifying the 2.5th and 97.5th percentiles of this distribution (corresponding to the 5% significance level chosen) to find a 95% confidence interval for \\(\\pi\\), and interpret this confidence interval in the context of the problem. 
boot_distn &lt;- do(10000) * elec %&gt;% resample(size = 100, replace = TRUE) %&gt;% summarize(success_rate = mean(satisfy == &quot;satisfied&quot;)) Just as we use the mean function for calculating the mean over a numerical variable, we can also use it to compute the proportion of successes for a categorical variable where we specify what we are calling a “success” after the ==. (Think about the formula for calculating a mean and how R handles logical statements such as satisfy == &quot;satisfied&quot; for why this must be true.) boot_distn %&gt;% ggplot(aes(x = success_rate)) + geom_histogram(bins = 30, color = &quot;white&quot;) boot_distn %&gt;% summarize(lower = quantile(success_rate, probs = 0.025), upper = quantile(success_rate, probs = 0.975)) ## lower upper ## 1 0.64 0.81 We see that 0.80 is contained in this confidence interval as a plausible value of \\(\\pi\\) (the unknown population proportion). This matches with our hypothesis test results of failing to reject the null hypothesis. Interpretation: We are 95% confident the true proportion of customers who are satisfied with the service they receive is between 0.64 and 0.81. Note: You could also use the null distribution with a shift to have its center at \\(\\hat{p} = 0.73\\) instead of at \\(p_0 = 0.8\\) and calculate its percentiles. The confidence interval produced via this method should be comparable to the one done using bootstrapping above. B.3.5 Traditional methods B.3.5.1 Check conditions Remember that in order to use the shortcut (formula-based, theoretical) approach, we need to check that some conditions are met. Independent observations: The observations are collected independently. The cases are selected independently through random sampling so this condition is met. Approximately normal: The number of expected successes and expected failures is at least 10. This condition is met since 73 and 27 are both greater than 10. 
B.3.5.2 Test statistic The test statistic is a random variable based on the sample data. Here, we want to look at a way to estimate the population proportion \\(\\pi\\). A good guess is the sample proportion \\(\\hat{P}\\). Recall that this sample proportion is actually a random variable that will vary as different samples are (theoretically, would be) collected. We are looking to see how likely it is for us to have observed a sample proportion at least as far from 0.80 as \\(\\hat{p}_{obs} = 0.73\\), assuming that the population proportion is 0.80 (assuming the null hypothesis is true). If the conditions are met and assuming \\(H_0\\) is true, we can standardize this original test statistic of \\(\\hat{P}\\) into a \\(Z\\) statistic that follows a \\(N(0, 1)\\) distribution. \\[ Z =\\dfrac{ \\hat{P} - p_0}{\\sqrt{\\dfrac{p_0(1 - p_0)}{n} }} \\sim N(0, 1) \\] B.3.5.2.1 Observed test statistic While one could compute this observed test statistic by “hand” by plugging the observed values into the formula, the focus here is on the set-up of the problem and in understanding which formula for the test statistic applies. The calculation has been done in R below for completeness though: p_hat &lt;- 0.73 p0 &lt;- 0.8 n &lt;- 100 (z_obs &lt;- (p_hat - p0) / sqrt( (p0 * (1 - p0)) / n)) ## [1] -1.75 We see here that the \\(z_{obs}\\) value is around -1.75. Our observed sample proportion of 0.73 is 1.75 standard errors below the hypothesized parameter value of 0.8. B.3.5.3 Compute \\(p\\)-value 2 * pnorm(z_obs) ## [1] 0.08012 The \\(p\\)-value—the probability of observing a \\(z_{obs}\\) value of -1.75 or more extreme (in both directions) in our null distribution—is around 8%. Note that we could also do this test directly using the prop.test function. 
stats::prop.test(x = table(elec$satisfy), n = length(elec$satisfy), alternative = &quot;two.sided&quot;, p = 0.8, correct = FALSE) ## ## 1-sample proportions test without continuity correction ## ## data: table(elec$satisfy), null probability 0.8 ## X-squared = 3.1, df = 1, p-value = 0.08 ## alternative hypothesis: true p is not equal to 0.8 ## 95 percent confidence interval: ## 0.6357 0.8073 ## sample estimates: ## p ## 0.73 prop.test does a \\(\\chi^2\\) test here but this matches up exactly with what we would expect: \\(\\chi^2_{obs} = 3.06 = (-1.75)^2 = (z_{obs})^2\\) and the \\(p\\)-values are the same because we are focusing on a two-tailed test. Note that the 95 percent confidence interval given above matches well with the one calculated using bootstrapping. B.3.5.4 State conclusion We, therefore, do not have sufficient evidence to reject the null hypothesis. Our initial guess that our observed sample proportion was not statistically different from the hypothesized proportion has not been invalidated. Based on this sample, we do not have evidence that the proportion of all customers of the large electric utility satisfied with service they receive is different from 0.80, at the 5% level. B.3.6 Comparing results Observing the bootstrap distribution and the null distribution that were created, it makes quite a bit of sense that the results are so similar for traditional and non-traditional methods in terms of the \\(p\\)-value and the confidence interval since these distributions look very similar to normal distributions. The conditions also being met leads us to better guess that using any of the methods whether they are traditional (formula-based) or non-traditional (computational-based) will lead to similar results. B.4 Two Proportions B.4.1 Problem Statement A 2010 survey asked 827 randomly sampled registered voters in California “Do you support? Or do you oppose? Drilling for oil and natural gas off the Coast of California? 
Or do you not know enough to say?” Conduct a hypothesis test to determine if the data provide strong evidence that the proportion of college graduates who do not have an opinion on this issue is different than that of non-college graduates. (Tweaked a bit from Diez, Barr, and Çetinkaya-Rundel 2014 [Chapter 6]) B.4.2 Competing Hypotheses B.4.2.1 In words Null hypothesis: There is no association between having an opinion on drilling and having a college degree for all registered California voters in 2010. Alternative hypothesis: There is an association between having an opinion on drilling and having a college degree for all registered California voters in 2010. B.4.2.2 Another way in words Null hypothesis: The probability that a registered California voter in 2010 who is a college graduate has no opinion on drilling is the same as that for a non-college graduate. Alternative hypothesis: These parameter probabilities are different. B.4.2.3 In symbols (with annotations) \\(H_0: \\pi_{college} = \\pi_{no\\_college}\\) or \\(H_0: \\pi_{college} - \\pi_{no\\_college} = 0\\), where \\(\\pi\\) represents the probability of not having an opinion on drilling. \\(H_A: \\pi_{college} - \\pi_{no\\_college} \\ne 0\\) B.4.2.4 Set \\(\\alpha\\) It’s important to set the significance level before starting the testing using the data. Let’s set the significance level at 5% here. 
B.4.3 Exploring the sample data #download.file(&quot;http://ismayc.github.io/teaching/sample_problems/offshore.csv&quot;, # destfile = &quot;data/offshore.csv&quot;, # method = &quot;curl&quot;) offshore &lt;- read_csv(&quot;data/offshore.csv&quot;) table(offshore$college_grad, offshore$response) ## ## no opinion opinion ## no 131 258 ## yes 104 334 (off_summ &lt;- offshore %&gt;% group_by(college_grad) %&gt;% summarize(prop_no_opinion = mean(response == &quot;no opinion&quot;), sample_size = n()) ) ## # A tibble: 2 × 3 ## college_grad prop_no_opinion sample_size ## &lt;chr&gt; &lt;dbl&gt; &lt;int&gt; ## 1 no 0.3368 389 ## 2 yes 0.2374 438 offshore %&gt;% ggplot(aes(x = college_grad, fill = response)) + geom_bar(position = &quot;fill&quot;) + coord_flip() B.4.3.1 Guess about statistical significance We are looking to see if a difference exists in the heights of the bars corresponding to no opinion for the plot. Based solely on the plot, we have little reason to believe that a difference exists since the bars seem to be about the same height, BUT…it’s important to use statistics to see if that difference is actually statistically significant! B.4.4 Non-traditional methods B.4.4.1 Collecting summary info Next we will assign some key values to variable names in R: phat_nograd &lt;- off_summ$prop_no_opinion[1] phat_grad &lt;- off_summ$prop_no_opinion[2] obs_diff &lt;- phat_grad - phat_nograd n_nograd &lt;- off_summ$sample_size[1] n_grad &lt;- off_summ$sample_size[2] B.4.4.2 Randomization for Hypothesis Test In order to look to see if the observed sample proportion of no opinion for non-college graduates of 0.3368 is statistically different than that for college graduates of 0.2374, we need to account for the sample sizes. Note that this is the same as looking to see if \\(\\hat{p}_{grad} - \\hat{p}_{nograd}\\) is statistically different than 0. We also need to determine a process that replicates how the original group sizes of 389 and 438 were selected. 
We can use the idea of randomization testing (also known as permutation testing) to simulate the population from which the sample came (with two groups of different sizes) and then generate samples using shuffling from that simulated population to account for sampling variability. set.seed(2016) many_shuffles &lt;- do(10000) * (offshore %&gt;% mutate(response = shuffle(response)) %&gt;% group_by(college_grad) %&gt;% summarize(prop_no_opinion = mean(response == &quot;no opinion&quot;)) ) null_distn &lt;- many_shuffles %&gt;% group_by(.index) %&gt;% summarize(diffprop = diff(prop_no_opinion)) null_distn %&gt;% ggplot(aes(x = diffprop)) + geom_histogram(bins = 25, color = &quot;white&quot;) We can next use this distribution to observe our \\(p\\)-value. Recall this is a two-tailed test so we will be looking for values that are less than or equal to -0.0993 or greater than or equal to 0.0993 for our \\(p\\)-value. null_distn %&gt;% ggplot(aes(x = diffprop)) + geom_histogram(bins = 20, color = &quot;white&quot;) + geom_vline(color = &quot;red&quot;, xintercept = obs_diff) + geom_vline(color = &quot;red&quot;, xintercept = -obs_diff) B.4.4.2.1 Calculate \\(p\\)-value pvalue &lt;- null_distn %&gt;% filter( (diffprop &lt;= obs_diff) | (diffprop &gt;= -obs_diff) ) %&gt;% nrow() / nrow(null_distn) pvalue ## [1] 0.0025 So our \\(p\\)-value is 0.0025 and we reject the null hypothesis at the 5% level. You can also see this from the histogram above that we are far into the tails of the null distribution. B.4.4.3 Bootstrapping for Confidence Interval We can also create a confidence interval for the unknown population parameter \\(\\pi_{college} - \\pi_{no\\_college}\\) using our sample data with bootstrapping. Here we will bootstrap each of the groups with replacement instead of shuffling. 
This is done using the groups argument in the resample function to fix the size of each group to be the same as the original group sizes of 389 for non-college graduates and 438 for college graduates. boot_props &lt;- do(10000) * offshore %&gt;% resample(replace = TRUE, groups = college_grad) %&gt;% group_by(college_grad) %&gt;% summarize(prop_no_opinion = mean(response == &quot;no opinion&quot;)) Next, we calculate the difference in sample proportions for each of the 10,000 replications: boot_distn &lt;- boot_props %&gt;% group_by(.index) %&gt;% summarize(diffprop = diff(prop_no_opinion)) boot_distn %&gt;% ggplot(aes(x = diffprop)) + geom_histogram(bins = 30, color = &quot;white&quot;) (ci_boot &lt;- boot_distn %&gt;% summarize(lower = quantile(diffprop, probs = 0.025), upper = quantile(diffprop, probs = 0.975))) ## # A tibble: 1 × 2 ## lower upper ## &lt;dbl&gt; &lt;dbl&gt; ## 1 -0.1596 -0.03792 We see that 0 is not contained in this confidence interval as a plausible value of \\(\\pi_{college} - \\pi_{no\\_college}\\) (the unknown population parameter). This matches with our hypothesis test results of rejecting the null hypothesis. Since zero is not a plausible value of the population parameter, we have evidence that the proportion of college graduates in California with no opinion on drilling is different than that of non-college graduates. Interpretation: We are 95% confident the true proportion of college graduates with no opinion on offshore drilling in California is between 0.16 smaller and 0.04 smaller than that for non-college graduates. Note: You could also use the null distribution based on randomization with a shift to have its center at \\(\\hat{p}_{college} - \\hat{p}_{no\\_college} = -0.1\\) instead of at 0 and calculate its percentiles. The confidence interval produced via this method should be comparable to the one done using bootstrapping above. 
B.4.5 Traditional methods B.4.6 Check conditions Remember that in order to use the short-cut (formula-based, theoretical) approach, we need to check that some conditions are met. Independent observations: Each case that was selected must be independent of all the other cases selected. This condition is met since cases were selected at random to observe. Sample size: The number of pooled successes and pooled failures must be at least 10 for each group. We need to first figure out the pooled success rate: \\[\\hat{p}_{obs} = \\dfrac{131 + 104}{827} = 0.28.\\] We now determine expected (pooled) success and failure counts: \\(0.28 \\cdot (131 + 258) = 108.92\\), \\(0.72 \\cdot (131 + 258) = 280.08\\) \\(0.28 \\cdot (104 + 334) = 122.64\\), \\(0.72 \\cdot (104 + 334) = 315.36\\) Independent selection of samples: The cases are not paired in any meaningful way. We have no reason to suspect that a college graduate selected would have any relationship to a non-college graduate selected. B.4.7 Test statistic The test statistic is a random variable based on the sample data. Here, we are interested in seeing if our observed difference in sample proportions corresponding to no opinion on drilling (\\(\\hat{p}_{college, obs} - \\hat{p}_{no\\_college, obs}\\) = -0.0993) is statistically different than 0. 
Assuming that conditions are met and the null hypothesis is true, we can use the standard normal distribution to standardize the difference in sample proportions (\\(\\hat{P}_{college} - \\hat{P}_{no\\_college}\\)) using the standard error of \\(\\hat{P}_{college} - \\hat{P}_{no\\_college}\\) and the pooled estimate: \\[ Z =\\dfrac{ (\\hat{P}_1 - \\hat{P}_2) - 0}{\\sqrt{\\dfrac{\\hat{P}(1 - \\hat{P})}{n_1} + \\dfrac{\\hat{P}(1 - \\hat{P})}{n_2} }} \\sim N(0, 1) \\] where \\(\\hat{P} = \\dfrac{\\text{total number of successes} }{ \\text{total number of cases}}.\\) B.4.7.1 Observed test statistic While one could compute this observed test statistic by “hand”, the focus here is on the set-up of the problem and in understanding which formula for the test statistic applies. We can use the prop.test function to perform this analysis for us. stats::prop.test(x = table(offshore$college_grad, offshore$response), n = nrow(offshore), alternative = &quot;two.sided&quot;, correct = FALSE) ## ## 2-sample test for equality of proportions without continuity ## correction ## ## data: table(offshore$college_grad, offshore$response) ## X-squared = 10, df = 1, p-value = 0.002 ## alternative hypothesis: two.sided ## 95 percent confidence interval: ## 0.03773 0.16091 ## sample estimates: ## prop 1 prop 2 ## 0.3368 0.2374 prop.test does a \\(\\chi^2\\) test here but this matches up exactly with what we would expect from the test statistic above since \\(Z^2 = \\chi^2\\) so \\(\\sqrt{9.99} = 3.16 = z_{obs}\\): The \\(p\\)-values are the same because we are focusing on a two-tailed test. The observed difference in sample proportions is 3.16 standard deviations larger than 0. The \\(p\\)-value—the probability of observing a \\(Z\\) value of 3.16 or more extreme in our null distribution—is 0.0016. This can also be calculated in R directly: 2 * pnorm(3.16, lower.tail = FALSE) ## [1] 0.001578 The 95% confidence interval is also stated above in the prop.test results. 
B.4.8 State conclusion We, therefore, have sufficient evidence to reject the null hypothesis. Our initial guess that a statistically significant difference did not exist in the proportions of no opinion on offshore drilling between college educated and non-college educated Californians was not validated. We do have evidence to suggest that there is a dependency between college graduation and position on offshore drilling for Californians. B.4.9 Comparing results Observing the bootstrap distribution and the null distribution that were created, it makes quite a bit of sense that the results are so similar for traditional and non-traditional methods in terms of the \\(p\\)-value and the confidence interval since these distributions look very similar to normal distributions. The conditions also being met leads us to better guess that using any of the methods whether they are traditional (formula-based) or non-traditional (computational-based) will lead to similar results. B.5 Two Means (Independent Samples) B.5.1 Problem Statement Average income varies from one region of the country to another, and it often reflects both lifestyles and regional living expenses. Suppose a new graduate is considering a job in two locations, Cleveland, OH and Sacramento, CA, and he wants to see whether the average income in one of these cities is higher than the other. He would like to conduct a hypothesis test based on two randomly selected samples from the 2000 Census. (Tweaked a bit from Diez, Barr, and Çetinkaya-Rundel 2014 [Chapter 5]) B.5.2 Competing Hypotheses B.5.2.1 In words Null hypothesis: There is no association between income and location (Cleveland, OH and Sacramento, CA). Alternative hypothesis: There is an association between income and location (Cleveland, OH and Sacramento, CA). B.5.2.2 Another way in words Null hypothesis: The mean income is the same for both cities. 
Alternative hypothesis: The mean income is different for the two cities. B.5.2.3 In symbols (with annotations) \\(H_0: \\mu_{sac} = \\mu_{cle}\\) or \\(H_0: \\mu_{sac} - \\mu_{cle} = 0\\), where \\(\\mu\\) represents the average income. \\(H_A: \\mu_{sac} - \\mu_{cle} \\ne 0\\) B.5.2.4 Set \\(\\alpha\\) It’s important to set the significance level before starting the testing using the data. Let’s set the significance level at 5% here. B.5.3 Exploring the sample data inc_summ &lt;- cleSac %&gt;% group_by(metro_area) %&gt;% summarize(sample_size = n(), mean = mean(income), sd = sd(income), minimum = min(income), lower_quartile = quantile(income, 0.25), median = median(income), upper_quartile = quantile(income, 0.75), max = max(income)) kable(inc_summ) metro_area sample_size mean sd minimum lower_quartile median upper_quartile max Cleveland_ OH 212 27467 27681 0 8475 21000 35275 152400 Sacramento_ CA 175 32428 35774 0 8050 20000 49350 206900 The boxplot below also shows the mean for each group highlighted by the red dots. cleSac %&gt;% ggplot(aes(x = metro_area, y = income)) + geom_boxplot() + stat_summary(fun.y = &quot;mean&quot;, geom = &quot;point&quot;, color = &quot;red&quot;) B.5.3.1 Guess about statistical significance We are looking to see if a difference exists in the mean income of the two levels of the explanatory variable. Based solely on the boxplot, we have reason to believe that no difference exists. The distributions of income seem similar and the means fall in roughly the same place. 
B.5.4 Non-traditional methods B.5.4.1 Collecting summary info Next we will assign some key values to variable names in R: xbar_cle &lt;- inc_summ$mean[1] xbar_sac &lt;- inc_summ$mean[2] obs_diff &lt;- xbar_sac - xbar_cle n_cle &lt;- inc_summ$sample_size[1] n_sac &lt;- inc_summ$sample_size[2] B.5.4.2 Randomization for Hypothesis Test In order to look to see if the observed sample mean for Sacramento of 32427.5429 is statistically different than that for Cleveland of 27467.066, we need to account for the sample sizes. Note that this is the same as looking to see if \\(\\bar{x}_{sac} - \\bar{x}_{cle}\\) is statistically different than 0. We also need to determine a process that replicates how the original group sizes of 212 and 175 were selected. We can use the idea of randomization testing (also known as permutation testing) to simulate the population from which the sample came (with two groups of different sizes) and then generate samples using shuffling from that simulated population to account for sampling variability. set.seed(2016) many_shuffles &lt;- do(10000) * (cleSac %&gt;% mutate(income = shuffle(income)) %&gt;% group_by(metro_area) %&gt;% summarize(mean_inc = mean(income)) ) null_distn &lt;- many_shuffles %&gt;% group_by(.index) %&gt;% summarize(diffmean = diff(mean_inc)) null_distn %&gt;% ggplot(aes(x = diffmean)) + geom_histogram(bins = 30, color = &quot;white&quot;) We can next use this distribution to observe our \\(p\\)-value. Recall this is a two-tailed test so we will be looking for values that are greater than or equal to 4960.4768 or less than or equal to -4960.4768 for our \\(p\\)-value. 
null_distn %&gt;% ggplot(aes(x = diffmean)) + geom_histogram(bins = 30, color = &quot;white&quot;) + geom_vline(color = &quot;red&quot;, xintercept = obs_diff) + geom_vline(color = &quot;red&quot;, xintercept = -obs_diff) B.5.4.2.1 Calculate \\(p\\)-value pvalue &lt;- null_distn %&gt;% filter( (diffmean &gt;= obs_diff) | (diffmean &lt;= -obs_diff) ) %&gt;% nrow() / nrow(null_distn) pvalue ## [1] 0.1225 So our \\(p\\)-value is 0.1225 and we fail to reject the null hypothesis at the 5% level. You can also see this from the histogram above that we are not very far into the tail of the null distribution. B.5.4.3 Bootstrapping for Confidence Interval We can also create a confidence interval for the unknown population parameter \\(\\mu_{sac} - \\mu_{cle}\\) using our sample data with bootstrapping. Here we will bootstrap each of the groups with replacement instead of shuffling. This is done using the groups argument in the resample function to fix the size of each group to be the same as the original group sizes of 175 for Sacramento and 212 for Cleveland. boot_means &lt;- do(10000) * cleSac %&gt;% resample(replace = TRUE, groups = metro_area) %&gt;% group_by(metro_area) %&gt;% summarize(mean_inc = mean(income)) Next, we calculate the difference in sample means for each of the 10,000 replications: boot_distn &lt;- boot_means %&gt;% group_by(.index) %&gt;% summarize(diffmean = diff(mean_inc)) boot_distn %&gt;% ggplot(aes(x = diffmean)) + geom_histogram(bins = 30, color = &quot;white&quot;) (ci_boot &lt;- boot_distn %&gt;% summarize(lower = quantile(diffmean, probs = 0.025), upper = quantile(diffmean, probs = 0.975))) ## # A tibble: 1 × 2 ## lower upper ## &lt;dbl&gt; &lt;dbl&gt; ## 1 -1513 11459 We see that 0 is contained in this confidence interval as a plausible value of \\(\\mu_{sac} - \\mu_{cle}\\) (the unknown population parameter). This matches with our hypothesis test results of failing to reject the null hypothesis. 
Since zero is a plausible value of the population parameter, we do not have evidence that Sacramento incomes are different than Cleveland incomes. Interpretation: We are 95% confident the true mean yearly income for those living in Sacramento is between 1512.59 dollars smaller to 11458.85 dollars higher than for Cleveland. Note: You could also use the null distribution based on randomization with a shift to have its center at \\(\\bar{x}_{sac} - \\bar{x}_{cle} = \\$4960.48\\) instead of at 0 and calculate its percentiles. The confidence interval produced via this method should be comparable to the one done using bootstrapping above. B.5.5 Traditional methods B.5.5.0.1 Check conditions Remember that in order to use the short-cut (formula-based, theoretical) approach, we need to check that some conditions are met. Independent observations: The observations are independent in both groups. This condition is met since the cases are randomly selected from each city. Approximately normal: The distribution of the response for each group should be normal or the sample sizes should be at least 30. cleSac %&gt;% ggplot(aes(x = income)) + geom_histogram(color = &quot;white&quot;, binwidth = 20000) + facet_wrap(~ metro_area) We have some reason to doubt the normality assumption here since both the histograms show deviation from a normal model fitting the data well for each group. The sample sizes for each group are greater than 100 though so the assumptions should still apply. Independent samples: The samples should be collected without any natural pairing. There is no mention of there being a relationship between those selected in Cleveland and in Sacramento. B.5.6 Test statistic The test statistic is a random variable based on the sample data. Here, we are interested in seeing if our observed difference in sample means (\\(\\bar{x}_{sac, obs} - \\bar{x}_{cle, obs}\\) = 4960.4768) is statistically different than 0. 
Assuming that conditions are met and the null hypothesis is true, we can use the \\(t\\) distribution to standardize the difference in sample means (\\(\\bar{X}_{sac} - \\bar{X}_{cle}\\)) using the approximate standard error of \\(\\bar{X}_{sac} - \\bar{X}_{cle}\\) (invoking \\(S_{sac}\\) and \\(S_{cle}\\) as estimates of unknown \\(\\sigma_{sac}\\) and \\(\\sigma_{cle}\\)). \\[ T =\\dfrac{ (\\bar{X}_1 - \\bar{X}_2) - 0}{ \\sqrt{\\dfrac{S_1^2}{n_1} + \\dfrac{S_2^2}{n_2}} } \\sim t (df = min(n_1 - 1, n_2 - 1)) \\] where 1 = Sacramento and 2 = Cleveland with \\(S_1^2\\) and \\(S_2^2\\) the sample variance of the incomes of both cities, respectively, and \\(n_1 = 175\\) for Sacramento and \\(n_2 = 212\\) for Cleveland. B.5.6.1 Observed test statistic Note that we could also do (ALMOST) this test directly using the t.test function. The x and y arguments are expected to both be numeric vectors here so we’ll need to appropriately filter our data sets. cleveland &lt;- cleSac %&gt;% filter(metro_area == &quot;Cleveland_ OH&quot;) sacramento &lt;- cleSac %&gt;% filter(metro_area != &quot;Cleveland_ OH&quot;) t.test(y = cleveland$income, x = sacramento$income, alternative = &quot;two.sided&quot;) ## ## Welch Two Sample t-test ## ## data: sacramento$income and cleveland$income ## t = 1.5, df = 320, p-value = 0.1 ## alternative hypothesis: true difference in means is not equal to 0 ## 95 percent confidence interval: ## -1543 11464 ## sample estimates: ## mean of x mean of y ## 32428 27467 Note that the degrees of freedom reported above are different than what we used above in specifying the Test Statistic. The degrees of freedom used here is also known as the Satterthwaite approximation and involves a quite complicated formula. For most problems, the much simpler smaller of sample sizes minus one will suffice. 
While one could compute this observed test statistic by “hand”, the focus here is on the set-up of the problem and in understanding which formula for the test statistic applies. We see here that the observed test statistic value is around 1.5 with \\(df = min(212 - 1, 175 - 1) = 174\\). Recall that for large degrees of freedom, the \\(t\\) distribution is roughly equal to the standard normal curve so our difference in df for the Satterthwaite and “min” variations doesn’t really matter. B.5.7 Compute \\(p\\)-value The \\(p\\)-value—the probability of observing a \\(t_{174}\\) value of 1.501 or more extreme (in both directions) in our null distribution—is 0.13. This can also be calculated in R directly (using the symmetry of the \\(t\\) distribution, as twice the area below -1.501): 2 * pt(-1.501, df = min(212 - 1, 175 - 1), lower.tail = TRUE) ## [1] 0.1352 We can also approximate by using the standard normal curve: 2 * pnorm(-1.501) ## [1] 0.1334 Note that the 95 percent confidence interval given above matches well with the one calculated using bootstrapping. B.5.8 State conclusion We, therefore, do not have sufficient evidence to reject the null hypothesis. Our initial guess that a statistically significant difference did not exist in the means was backed by this statistical analysis. We do not have evidence to suggest that the true mean income differs between Cleveland, OH and Sacramento, CA based on this data. B.5.9 Comparing results Observing the bootstrap distribution and the null distribution that were created, it makes quite a bit of sense that the results are so similar for traditional and non-traditional methods in terms of the \\(p\\)-value and the confidence interval since these distributions look very similar to normal distributions. The conditions also being met leads us to better guess that using any of the methods whether they are traditional (formula-based) or non-traditional (computational-based) will lead to similar results. 
B.6 Two Means (Paired Samples) B.6.0.1 Problem Statement Trace metals in drinking water affect the flavor and an unusually high concentration can pose a health hazard. Ten pairs of data were taken measuring zinc concentration in bottom water and surface water at 10 randomly selected locations on a stretch of river. Do the data suggest that the true average concentration in the surface water is smaller than that of bottom water? (Note that units are not given.) [Tweaked a bit from https://onlinecourses.science.psu.edu/stat500/node/51] B.6.1 Competing Hypotheses B.6.1.1 In words Null hypothesis: The mean concentration in the bottom water is the same as that of the surface water at different paired locations. Alternative hypothesis: The mean concentration in the surface water is smaller than that of the bottom water at different paired locations. B.6.1.2 In symbols (with annotations) \\(H_0: \\mu_{diff} = 0\\), where \\(\\mu_{diff}\\) represents the mean difference in concentration for surface water minus bottom water. \\(H_A: \\mu_{diff} &lt; 0\\) B.6.1.3 Set \\(\\alpha\\) It’s important to set the significance level before starting the testing using the data. Let’s set the significance level at 5% here. 
B.6.2 Exploring the sample data #download.file(&quot;http://ismayc.github.io/teaching/sample_problems/zinc_tidy.csv&quot;, # destfile = &quot;data/zinc_tidy.csv&quot;, # method = &quot;curl&quot;) zinc_tidy &lt;- read_csv(&quot;data/zinc_tidy.csv&quot;) We want to look at the differences in surface - bottom for each location: zinc_diff &lt;- zinc_tidy %&gt;% group_by(loc_id) %&gt;% summarize(pair_diff = diff(concentration)) zinc_summ &lt;- zinc_diff %&gt;% summarize(sample_size = n(), mean = mean(pair_diff), sd = sd(pair_diff), minimum = min(pair_diff), lower_quartile = quantile(pair_diff, 0.25), median = median(pair_diff), upper_quartile = quantile(pair_diff, 0.75), max = max(pair_diff)) kable(zinc_summ) sample_size mean sd minimum lower_quartile median upper_quartile max 10 -0.0804 0.0523 -0.177 -0.11 -0.084 -0.0355 -0.015 The histogram below also shows the distribution of pair_diff. zinc_diff %&gt;% ggplot(aes(x = pair_diff)) + geom_histogram(binwidth = 0.04, color = &quot;white&quot;) B.6.2.1 Guess about statistical significance We are looking to see if the sample paired mean difference of -0.0804 is statistically less than 0. They seem to be quite close, but we have a small number of pairs here. Let’s guess that we will fail to reject the null hypothesis. B.6.3 Non-traditional methods B.6.3.1 Collecting summary info Next we will assign some key values to variable names in R: obs_diff &lt;- zinc_summ$mean n_pairs &lt;- zinc_summ$sample_size B.6.3.2 Randomization for Hypothesis Test In order to look to see if the observed sample mean difference \\(\\bar{x}_{diff} = -0.0804\\) is statistically less than 0, we need to account for the number of pairs. We also need to determine a process that replicates how the paired data was selected in a way similar to how we calculated our original difference in sample means. 
We can use the idea of randomization testing (also known as permutation testing) to simulate the population from which the sample came and then generate samples using shuffling from that simulated population to account for sampling variability. In this case, we will shuffle along each paired location. So values that were on the bottom of location 1 may now be switched to be on the surface or vice versa. library(mosaic) set.seed(2016) many_shuffles &lt;- do(10000) * (zinc_tidy %&gt;% mutate(concentration = shuffle(concentration, groups = loc_id)) %&gt;% group_by(loc_id) %&gt;% summarize(pair_diff = diff(concentration)) ) null_distn &lt;- many_shuffles %&gt;% group_by(.index) %&gt;% summarize(mean_diff = mean(pair_diff)) null_distn %&gt;% ggplot(aes(x = mean_diff)) + geom_histogram(bins = 30, color = &quot;white&quot;) We can next use this distribution to observe our \\(p\\)-value. Recall this is a left-tailed test so we will be looking for values that are less than or equal to -0.0804 for our \\(p\\)-value. null_distn %&gt;% ggplot(aes(x = mean_diff)) + geom_histogram(bins = 30, color = &quot;white&quot;) + geom_vline(color = &quot;red&quot;, xintercept = obs_diff) B.6.3.2.1 Calculate \\(p\\)-value pvalue &lt;- null_distn %&gt;% filter(mean_diff &lt;= obs_diff) %&gt;% nrow() / nrow(null_distn) pvalue ## [1] 0.0009 So our \\(p\\)-value is essentially 0.0009 and we reject the null hypothesis at the 5% level. You can also see this from the histogram above that we are far into the left tail of the null distribution. B.6.3.3 Bootstrapping for Confidence Interval We can also create a confidence interval for the unknown population parameter \\(\\mu_{diff}\\) using our sample data (the calculated differences) with bootstrapping. This is similar to the bootstrapping done in a one sample mean case, except now our data is differences instead of raw numerical data. 
boot_distn &lt;- do(10000) * resample(zinc_diff, replace = TRUE) %&gt;% summarize(mean_diff = mean(pair_diff)) boot_distn %&gt;% ggplot(aes(x = mean_diff)) + geom_histogram(bins = 30, color = &quot;white&quot;) (ci_boot &lt;- boot_distn %&gt;% summarize(lower = quantile(mean_diff, probs = 0.025), upper = quantile(mean_diff, probs = 0.975))) ## lower upper ## 1 -0.1114 -0.0505 We see that 0 is not contained in this confidence interval as a plausible value of \\(\\mu_{diff}\\) (the unknown population parameter). This matches with our hypothesis test results of rejecting the null hypothesis. Since zero is not a plausible value of the population parameter and since the entire confidence interval falls below zero, we have evidence that surface zinc concentration levels are lower, on average, than bottom level zinc concentrations. Interpretation: We are 95% confident the true mean zinc concentration on the surface is between 0.11 units smaller to 0.05 units smaller than on the bottom. Note: You could also use the null distribution based on randomization with a shift to have its center at \\(\\bar{x}_{diff} = -0.08\\) instead of at 0 and calculate its percentiles. The confidence interval produced via this method should be comparable to the one done using bootstrapping above. B.6.4 Traditional methods B.6.4.1 Check conditions Remember that in order to use the shortcut (formula-based, theoretical) approach, we need to check that some conditions are met. Independent observations: The observations among pairs are independent. The locations are selected independently through random sampling so this condition is met. Approximately normal: The distribution of population of differences is normal or the number of pairs is at least 30. The histogram above does show some skew so we have reason to doubt the population being normal based on this sample. We also only have 10 pairs which is fewer than the 30 needed. A theory-based test may not be valid here. 
B.6.4.2 Test statistic The test statistic is a random variable based on the sample data. Here, we want to look at a way to estimate the population mean difference \\(\\mu_{diff}\\). A good guess is the sample mean difference \\(\\bar{X}_{diff}\\). Recall that this sample mean is actually a random variable that will vary as different samples are (theoretically, would be) collected. We are looking to see how likely is it for us to have observed a sample mean of \\(\\bar{x}_{diff, obs} = -0.0804\\) or smaller assuming that the population mean difference is 0 (assuming the null hypothesis is true). If the conditions are met and assuming \\(H_0\\) is true, we can “standardize” this original test statistic of \\(\\bar{X}_{diff}\\) into a \\(T\\) statistic that follows a \\(t\\) distribution with degrees of freedom equal to \\(df = n - 1\\): \\[ T =\\dfrac{ \\bar{X}_{diff} - 0}{ S_{diff} / \\sqrt{n} } \\sim t (df = n - 1) \\] where \\(S_{diff}\\) represents the standard deviation of the sample differences and \\(n\\) is the number of pairs. B.6.4.2.1 Observed test statistic While one could compute this observed test statistic by “hand”, the focus here is on the set-up of the problem and in understanding which formula for the test statistic applies. We can use the t.test function on the differences to perform this analysis for us. stats::t.test(x = zinc_diff$pair_diff, alternative = &quot;less&quot;, mu = 0) ## ## One Sample t-test ## ## data: zinc_diff$pair_diff ## t = -4.9, df = 9, p-value = 0.0004 ## alternative hypothesis: true mean is less than 0 ## 95 percent confidence interval: ## -Inf -0.0501 ## sample estimates: ## mean of x ## -0.0804 We see here that the \\(t_{obs}\\) value is around -5. B.6.4.3 Compute \\(p\\)-value The \\(p\\)-value—the probability of observing a \\(t_{obs}\\) value of -5 or less in our null distribution of a \\(t\\) with 9 degrees of freedom—is 0.0004. 
This can also be calculated in R directly: pt(-5, df = nrow(zinc_diff) - 1, lower.tail = TRUE) ## [1] 0.0003695 B.6.4.4 State conclusion We, therefore, have sufficient evidence to reject the null hypothesis. Our initial guess that our observed sample mean difference was not statistically less than the hypothesized mean of 0 has been invalidated here. Based on this sample, we have evidence that the mean concentration in the bottom water is greater than that of the surface water at different paired locations. B.6.5 Comparing results Observing the bootstrap distribution and the null distribution that were created, it makes quite a bit of sense that the results are so similar for traditional and non-traditional methods in terms of the \\(p\\)-value and the confidence interval since these distributions look very similar to normal distributions. The conditions were not met since the number of pairs was small, but the sample data was not highly skewed. Using any of the methods whether they are traditional (formula-based) or non-traditional (computational-based) lead to similar results. References "],
+["C-appendixC.html", "C Reach for the Stars Needed packages C.1 Sorted barplots C.2 Interactive graphics", " C Reach for the Stars Needed packages library(dplyr) library(ggplot2) library(knitr) library(dygraphs) library(nycflights13) C.1 Sorted barplots Building upon the example in Section ??: flights_table &lt;- table(flights$carrier) flights_table ## ## 9E AA AS B6 DL EV F9 FL HA MQ OO UA ## 18460 32729 714 54635 48110 54173 685 3260 342 26397 32 58665 ## US VX WN YV ## 20536 5162 12275 601 We can sort this table from highest to lowest counts by using the sort function: sorted_flights &lt;- sort(flights_table, decreasing = TRUE) names(sorted_flights) ## [1] &quot;UA&quot; &quot;B6&quot; &quot;EV&quot; &quot;DL&quot; &quot;AA&quot; &quot;MQ&quot; &quot;US&quot; &quot;9E&quot; &quot;WN&quot; &quot;VX&quot; &quot;FL&quot; &quot;AS&quot; &quot;F9&quot; ## [14] &quot;YV&quot; &quot;HA&quot; &quot;OO&quot; It is often preferred for barplots to be ordered corresponding to the heights of the bars. This allows the reader to more easily compare the ordering of different airlines in terms of departed flights (Robbins 2013). We can also much more easily answer questions like “How many airlines have more departing flights than Southwest Airlines?”. We can use the sorted table giving the number of flights defined as sorted_flights to reorder the carrier. ggplot(data = flights, mapping = aes(x = carrier)) + geom_bar() + scale_x_discrete(limits = names(sorted_flights)) Figure C.1: Number of flights departing NYC in 2013 by airline - Descending numbers The last addition here specifies the values of the horizontal x axis on a discrete scale to correspond to those given by the entries of sorted_flights. C.2 Interactive graphics C.2.1 Interactive line-graphs Another useful tool for viewing line-graphs such as this is the dygraph function in the dygraphs package in combination with the dyRangeSelector function. 
This allows us to zoom in on a selected range and get an interactive plot for us to work with: library(dygraphs) flights_day &lt;- mutate(flights, date = as.Date(time_hour)) flights_summarized &lt;- flights_day %&gt;% group_by(date) %&gt;% summarize(median_arr_delay = median(arr_delay, na.rm = TRUE)) rownames(flights_summarized) &lt;- flights_summarized$date flights_summarized &lt;- select(flights_summarized, -date) dyRangeSelector(dygraph(flights_summarized)) The syntax here is a little different than what we have covered so far. The dygraph function is expecting for the dates to be given as the rownames of the object. We then remove the date variable from the flights_summarized dataframe since it is accounted for in the rownames. Lastly, we run the dygraph function on the new dataframe that only contains the median arrival delay as a column and then provide the ability to have a selector to zoom in on the interactive plot via dyRangeSelector. (Note that this plot will only be interactive in the HTML version of this book.) References "],
 ["references.html", "References", " References "]
 ]

time	x	y	z
2009-01-01	-1.346	-2.241	4.412
2009-01-02	-0.777	-2.111	1.202
2009-01-03	0.304	-7.305	-4.859
2009-01-04	2.510	0.213	0.720
2009-01-05	-0.484	-0.008	7.705
carrier	name
9E	Endeavor Air Inc.
AA	American Airlines Inc.
AS	Alaska Airlines Inc.
B6	JetBlue Airways
DL	Delta Air Lines Inc.
EV	ExpressJet Airlines Inc.
F9	Frontier Airlines Inc.
FL	AirTran Airways Corporation
HA	Hawaiian Airlines Inc.
MQ	Envoy Air
OO	SkyWest Airlines Inc.
UA	United Air Lines Inc.
US	US Airways Inc.
VX	Virgin America
WN	Southwest Airlines Co.
YV	Mesa Airlines Inc.
carrier	n
9E	18460
AA	32729
AS	714
B6	54635
DL	48110
EV	54173
F9	685
FL	3260
HA	342
MQ	26397
OO	32
UA	58665
US	20536
VX	5162
WN	12275
YV	601
n	heads	tails	prop
10	9	1	0.9
10	10	0	1.0
10	9	1	0.9
10	9	1	0.9
10	10	0	1.0
10	9	1	0.9
10	9	1	0.9
10	9	1	0.9
10	9	1	0.9
10	9	1	0.9
10	9	1	0.9
10	9	1	0.9
10	9	1	0.9
10	9	1	0.9
10	9	1	0.9
10	9	1	0.9
10	9	1	0.9
10	9	1	0.9
10	9	1	0.9
10	9	1	0.9
10	9	1	0.9
10	9	1	0.9
10	10	0	1.0
10	10	0	1.0
10	10	0	1.0
10	9	1	0.9
10	9	1	0.9
10	9	1	0.9
10	9	1	0.9
10	9	1	0.9
10	9	1	0.9
10	9	1	0.9
10	10	0	1.0
10	9	1	0.9
10	9	1	0.9
10	10	0	1.0
10	9	1	0.9
10	9	1	0.9
10	9	1	0.9
10	9	1	0.9
10	9	1	0.9
10	9	1	0.9
10	9	1	0.9
10	9	1	0.9
10	9	1	0.9
10	9	1	0.9
10	9	1	0.9
10	9	1	0.9
10	9	1	0.9
10	9	1	0.9
10	9	1	0.9
10	9	1	0.9
10	9	1	0.9
10	9	1	0.9
10	9	1	0.9
10	10	0	1.0
10	10	0	1.0
10	9	1	0.9
10	9	1	0.9
10	9	1	0.9
10	9	1	0.9
10	9	1	0.9
10	9	1	0.9
10	9	1	0.9
10	9	1	0.9
10	10	0	1.0
10	9	1	0.9
10	9	1	0.9
10	9	1	0.9
10	9	1	0.9
10	9	1	0.9
10	9	1	0.9
10	9	1	0.9
10	9	1	0.9
10	9	1	0.9
10	9	1	0.9
10	9	1	0.9
10	9	1	0.9
10	10	0	1.0
10	9	1	0.9
10	9	1	0.9
10	10	0	1.0
10	9	1	0.9
10	9	1	0.9
10	9	1	0.9
10	10	0	1.0
10	10	0	1.0
10	9	1	0.9
10	9	1	0.9
10	9	1	0.9
10	9	1	0.9
10	9	1	0.9
10	9	1	0.9
10	9	1	0.9
10	9	1	0.9
10	9	1	0.9
10	9	1	0.9
10	9	1	0.9
10	9	1	0.9
10	9	1	0.9
10	9	1	0.9
10	9	1	0.9
10	9	1	0.9
10	9	1	0.9
10	9	1	0.9
Action	5.197059	1.464837	5.197	1.465	34
Romance	6.026471	1.202096	6.027	1.202	34
(Intercept)	-14.155017	2.8094813	-5.038302	0.0000071	-14.155	2.809	-5.038	0
dep_delay	1.217666	0.1360336	8.951212	0.0000000	1.218	0.136	8.951	0