pred_test.Rmd

---
title: "COVID-19 Analysis For INDIA and World"
author: "Samyak Gupta"
date: "22/05/2020"
output: 
 prettydoc::html_pretty:
    theme: cayman
    highlight: github
    math: katex
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
library(gtools)
library(rjson)
library(plyr)
require(boot)
library(Metrics)


# Download the per-country daily time series. The parsed object is indexed by
# country name (my.JSON$India is used below) and holds one record per day.
my.JSON <- fromJSON(file = "https://pomber.github.io/covid19/timeseries.json")

# Convert a column of count labels to numbers. Going through as.character()
# keeps as.numeric() from returning factor level codes when a data frame was
# built with stringsAsFactors = TRUE (the default before R 4.0).
parse_count <- function(x) as.numeric(as.character(x))

# World table: one row per country. unlist() flattens a country's daily
# records into a single vector; the code further down relies on the fields
# arriving as date, confirmed, deaths, recovered for each day in turn.
world_json <- lapply(my.JSON, unlist)
world_df <- as.data.frame(do.call("rbind", world_json))

# India table: one row per day with the columns date, confirmed, deaths and
# recovered.
India_json <- lapply(my.JSON$India, unlist)
India_df <- as.data.frame(do.call("rbind", India_json))

I_date <- as.Date(India_df$date, format = "%Y-%m-%d")
I_confirmed <- parse_count(India_df$confirmed)
I_deaths <- parse_count(India_df$deaths)
I_recovered <- parse_count(India_df$recovered)

# NOTE(review): world_df carries one date/confirmed/deaths/recovered column per
# day, so these `$` lookups presumably return only the first day's column --
# confirm that this is what is wanted.
date <- as.Date(world_df$date, format = "%Y-%m-%d")
confirmed <- parse_count(world_df$confirmed)
deaths <- parse_count(world_df$deaths)
recovered <- parse_count(world_df$recovered)

# Number of daily records to use. It is counted from the downloaded data
# rather than from the system clock, so the column indices below can never run
# past the last column of world_df. Each day contributes four columns to
# world_df (date, confirmed, deaths, recovered) and one row to India_df.
today <- min(ncol(world_df) %/% 4L, nrow(India_df))

# Column positions of the daily "confirmed" values. The matching date sits one
# column to the left and the deaths count one column to the right.
cols <- seq(from = 2, length.out = today, by = 4)

# Sum one column of world_df over all countries. as.character() first, so that
# factor columns are read by label and not by level code.
column_total <- function(j) sum(as.numeric(as.character(world_df[, j])))

t_case <- vapply(cols, column_total, numeric(1))
t_dead <- vapply(cols + 1, column_total, numeric(1))
count <- length(cols)

# Observation dates are read from the feed itself (first country row), so
# t_date[i] is the date on which the i-th totals were reported.
t_date <- as.Date(
  vapply(cols - 1, function(j) as.character(world_df[1, j]), character(1)),
  format = "%Y-%m-%d"
)

# cbind() stores the dates as plain day numbers next to the totals.
comp_w_df <- as.data.frame(cbind(t_date, t_case, t_dead))
confirmed_1 <- as.numeric(comp_w_df$t_case)
deaths_1 <- as.numeric(comp_w_df$t_dead)

# Turn the cumulative counts into day-over-day increases. The first entry has
# no previous day and is kept as it is. diff() walks each series from end to
# end, so no trailing entry is left on the cumulative scale.
I_confirmed <- c(head(I_confirmed, 1), diff(I_confirmed))
confirmed_1 <- c(head(confirmed_1, 1), diff(confirmed_1))

# Degree-6 polynomial trend of the daily world increases against the date.
# as.numeric() hands poly() the dates as plain day numbers (days since
# 1970-01-01), which leaves the fitted curve unchanged.
fit <- lm(confirmed_1 ~ poly(as.numeric(t_date), 6))

# Day indices of the seven-day forecast window, counted on from `today`.
a <- today + 1
b <- today + 7

# Forecast dates: the seven days that follow the last date the world model was
# fitted on, so that no forecast row falls inside the observed period.
new <- data.frame(t_date = max(t_date) + seq_len(7))
predict_date <- as.character(new$t_date)

# Same model form for the daily increases in India.
fit4 <- lm(I_confirmed ~ poly(as.numeric(I_date), 6))
```

## Predictions and Plots

This small project is an exercise in learning simple statistical-learning techniques and applying them to a real-world, real-time problem.

Here we predict two things:

1) No. of confirmed cases for the next few days for the world.

2) No. of confirmed cases for the next few days for India.

```{r pressure, echo=FALSE,fig.align='center'}
# World series (blue points) with the fitted values of `fit` (red points)
# drawn on top.
# NOTE(review): confirmed_1 holds day-over-day increases by the time it is
# plotted, while the title says "Total" -- confirm the intended wording.
par(las = 2, mgp = c(5, 1, 0), mar = c(6, 6, 6, 6))
options(scipen = 999) # strongly prefer fixed over scientific notation
plot(t_date, confirmed_1, pch = 16, xlab = "date", col = "blue", xaxt = "n",
     ylab = "no. of cases ", main = "Total confirmed cases in the world")
# The legend keys use the same point symbols as the two plotted series.
legend("topleft", c("Actual", "Predicted"),
       col = c("blue", "red"), pch = c(16, 20))
# One tick every 10 days across the observed range.
axis.Date(1, at = seq(min(t_date), max(t_date), by = 10), format = "%d %b %y")
points(t_date, fitted(fit), col = "red", pch = 20)

# India is forecast for exactly the dates listed in `new`, so every row of the
# India table shows the prediction for the date printed next to it.
abc <- data.frame(I_date = new$t_date)

# predict.lm(..., interval = "confidence") returns a matrix with the columns
# fit, lwr and upr.
world_pred <- predict.lm(fit, new, interval = "confidence")
india_pred <- predict.lm(fit4, abc, interval = "confidence")

# Whole numbers with thousands separators.
as_count <- function(x) {
  format(round(as.numeric(x)), big.mark = ",", scientific = FALSE)
}

# The tables are assembled as data frames, so the numeric columns stay numeric
# until they are formatted for display.
Final <- data.frame(predict_date, world_pred, stringsAsFactors = FALSE)
Final_2 <- data.frame(new, india_pred)

bound_cols <- c("fit", "lwr", "upr")
Final[bound_cols] <- lapply(Final[bound_cols], as_count)
Final_2[bound_cols] <- lapply(Final_2[bound_cols], as_count)

names(Final)[1:4] <- c("Date", "Prediction Fit", "Minimum", "Maximum")

knitr::kable(Final,caption = "TABLE 1: This is the predictions for the world ",align = "crrr")

# India series (blue points) with the fitted values of `fit4` (red points)
# drawn on top.
# NOTE(review): I_confirmed holds day-over-day increases by the time it is
# plotted, while the title says "Total" -- confirm the intended wording.
par(las = 2, mgp = c(5, 1, 0), mar = c(6, 6, 6, 6))
plot(I_date, I_confirmed, pch = 16, xlab = "date", col = "blue", xaxt = "n",
     ylab = "no. of cases ", main = "Total confirmed cases in India")
# The legend keys use the same point symbols as the two plotted series.
legend("topleft", c("Actual", "Predicted"),
       col = c("blue", "red"), pch = c(16, 20))
# Ticks every 10 days over the range of the dates that are actually plotted.
axis.Date(1, at = seq(min(I_date), max(I_date), by = 10), format = "%d %b %y")
points(I_date, fitted(fit4), col = "red", pch = 20)
# Give the first four columns of the India forecast table their display
# headers, then render the table.
india_headers <- c("Date", "Prediction Fit", "Minimum", "Maximum")
names(Final_2)[seq_along(india_headers)] <- india_headers
knitr::kable(
  Final_2,
  caption = "TABLE 2: This is the prediction for India",
  align = "crrr"
)
```

## Explanation, Inference and Error Analysis

Here, we fit a simple linear model on polynomial terms of the date; its fitted values are the red points in the plots above, drawn over the observed values in blue. The model is then used to predict the next few days. As the results below show, predictions for the nearest dates are the most precise, and accuracy decreases as the prediction reaches further into the future.

We used a polynomial of degree six: cross-validation was used to determine which degree gives the lowest error with the least computation, and we concluded that degree 6 fitted the data best. Other models were also considered for this job, but none was as accurate as this simple polynomial.

```{r echo=FALSE,warning=FALSE}

# Rolling one-step-ahead check over the last eleven observed days: for each
# day u both models are refitted on days 1..(u - 1) only and then asked to
# predict day u.
last_day <- as.numeric(today)
# A degree-6 polynomial needs more than six distinct training dates; the first
# fold trains on last_day - 11 days.
stopifnot(last_day >= 18)
eval_days <- (last_day - 10):last_day

actual <- confirmed_1[eval_days]
actual_1 <- I_confirmed[eval_days]
# Row labels are the dates parsed from the feed for the evaluated days.
date <- as.character(I_date[eval_days])

prediction <- numeric(length(eval_days))
prediction_1 <- numeric(length(eval_days))

for (count in seq_along(eval_days)) {
  u <- eval_days[count]
  train <- seq_len(u - 1)

  # The training rows are selected before the fit, so the model formula carries
  # no subscripts and predict() evaluates it on the one-row newdata as is.
  world_train <- data.frame(
    confirmed_1 = confirmed_1[train],
    t_date = t_date[train]
  )
  fit <- lm(confirmed_1 ~ poly(as.numeric(t_date), 6), data = world_train)
  # The held-out date is taken from the series itself.
  new <- data.frame(t_date = t_date[u])
  # Point prediction only: each slot of `prediction` holds a single number.
  prediction[count] <- predict.lm(fit, new)

  india_train <- data.frame(
    I_confirmed = I_confirmed[train],
    I_date = I_date[train]
  )
  fit4 <- lm(I_confirmed ~ poly(as.numeric(I_date), 6), data = india_train)
  new_1 <- data.frame(I_date = I_date[u])
  prediction_1[count] <- predict.lm(fit4, new_1)
}
# Absolute percentage error of every prediction (Metrics::ape returns one
# value per row), in percent, rounded to three decimals.
percentage_error <- round(ape(actual, prediction) * 100, digits = 3)
percentage_error_1 <- round(ape(actual_1, prediction_1) * 100, digits = 3)

# Whole numbers with thousands separators.
as_count <- function(x) {
  format(round(as.numeric(x)), big.mark = ",", scientific = FALSE)
}

# The tables are built column by column, so the numbers are formatted straight
# from their numeric values. The error column is shown as text.
# NOTE(review): the "Average Error %" header and the captions say "average",
# while ape() yields one error per row -- confirm the intended wording.
error_table <- data.frame(
  date = as.character(date),
  prediction = as_count(prediction),
  actual = as_count(actual),
  percentage_error = as.character(percentage_error),
  stringsAsFactors = FALSE
)
error_table_1 <- data.frame(
  date = as.character(date),
  prediction_1 = as_count(prediction_1),
  actual_1 = as_count(actual_1),
  percentage_error_1 = as.character(percentage_error_1),
  stringsAsFactors = FALSE
)

error_headers <- c("Date", "Prediction", "Actual", "Average Error %")
names(error_table)[1:4] <- error_headers
names(error_table_1)[1:4] <- error_headers

knitr::kable(error_table,caption = "TABLE 3: This table represents the Average Percentage error of the prediction model of World",align = "crrr")
knitr::kable(error_table_1,caption="TABLE 4: This table represents the Average Percentage error of the prediction model of India",align = "crrr")
```