library(ggplot2)
library(hrbrthemes)
library(GGally)
library(corrgram)
library(visdat)
library(ISLR2)
library(e1071)
library(caTools)
library(hopkins)
#library(randomForest)
NOTE: Either Arial Narrow or Roboto Condensed fonts are required to use these themes. Please use hrbrthemes::import_roboto_condensed() to install Roboto Condensed and if Arial Narrow is not on your system, please see https://bit.ly/arialnarrow Registered S3 method overwritten by 'GGally': method from +.gg ggplot2
Context
# Load the country-level COVID-19 dataset from CSV.
# Source: https://www.kaggle.com/datasets/imdevskp/corona-virus-report?resource=download&select=country_wise_latest.csv
# Default: fetch the file from the GitHub repository.
df <- read.csv(file = "https://raw.githubusercontent.com/Giovo17/covid-country-analysis/main/country_wise_latest.csv")
# Alternative: read a local copy from disk.
#df = read.csv("country_wise_latest.csv")

# First look at the data: leading rows, column types, per-column summary.
head(df, n = 7)
str(df)
summary(df)
Country.Region | Confirmed | Deaths | Recovered | Active | New.cases | New.deaths | New.recovered | Deaths...100.Cases | Recovered...100.Cases | Deaths...100.Recovered | Confirmed.last.week | X1.week.change | X1.week...increase | WHO.Region | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
<chr> | <int> | <int> | <int> | <int> | <int> | <int> | <int> | <dbl> | <dbl> | <dbl> | <int> | <int> | <dbl> | <chr> | |
1 | Afghanistan | 36263 | 1269 | 25198 | 9796 | 106 | 10 | 18 | 3.50 | 69.49 | 5.04 | 35526 | 737 | 2.07 | Eastern Mediterranean |
2 | Albania | 4880 | 144 | 2745 | 1991 | 117 | 6 | 63 | 2.95 | 56.25 | 5.25 | 4171 | 709 | 17.00 | Europe |
3 | Algeria | 27973 | 1163 | 18837 | 7973 | 616 | 8 | 749 | 4.16 | 67.34 | 6.17 | 23691 | 4282 | 18.07 | Africa |
4 | Andorra | 907 | 52 | 803 | 52 | 10 | 0 | 0 | 5.73 | 88.53 | 6.48 | 884 | 23 | 2.60 | Europe |
5 | Angola | 950 | 41 | 242 | 667 | 18 | 1 | 0 | 4.32 | 25.47 | 16.94 | 749 | 201 | 26.84 | Africa |
6 | Antigua and Barbuda | 86 | 3 | 65 | 18 | 4 | 0 | 5 | 3.49 | 75.58 | 4.62 | 76 | 10 | 13.16 | Americas |
7 | Argentina | 167416 | 3059 | 72575 | 91782 | 4890 | 120 | 2057 | 1.83 | 43.35 | 4.21 | 130774 | 36642 | 28.02 | Americas |
'data.frame': 187 obs. of 15 variables: $ Country.Region : chr "Afghanistan" "Albania" "Algeria" "Andorra" ... $ Confirmed : int 36263 4880 27973 907 950 86 167416 37390 15303 20558 ... $ Deaths : int 1269 144 1163 52 41 3 3059 711 167 713 ... $ Recovered : int 25198 2745 18837 803 242 65 72575 26665 9311 18246 ... $ Active : int 9796 1991 7973 52 667 18 91782 10014 5825 1599 ... $ New.cases : int 106 117 616 10 18 4 4890 73 368 86 ... $ New.deaths : int 10 6 8 0 1 0 120 6 6 1 ... $ New.recovered : int 18 63 749 0 0 5 2057 187 137 37 ... $ Deaths...100.Cases : num 3.5 2.95 4.16 5.73 4.32 3.49 1.83 1.9 1.09 3.47 ... $ Recovered...100.Cases : num 69.5 56.2 67.3 88.5 25.5 ... $ Deaths...100.Recovered: num 5.04 5.25 6.17 6.48 16.94 ... $ Confirmed.last.week : int 35526 4171 23691 884 749 76 130774 34981 12428 19743 ... $ X1.week.change : int 737 709 4282 23 201 10 36642 2409 2875 815 ... $ X1.week...increase : num 2.07 17 18.07 2.6 26.84 ... $ WHO.Region : chr "Eastern Mediterranean" "Europe" "Africa" "Europe" ...
Country.Region Confirmed Deaths Recovered Length:187 Min. : 10 Min. : 0.0 Min. : 0.0 Class :character 1st Qu.: 1114 1st Qu.: 18.5 1st Qu.: 626.5 Mode :character Median : 5059 Median : 108.0 Median : 2815.0 Mean : 88131 Mean : 3497.5 Mean : 50631.5 3rd Qu.: 40460 3rd Qu.: 734.0 3rd Qu.: 22606.0 Max. :4290259 Max. :148011.0 Max. :1846641.0 Active New.cases New.deaths New.recovered Min. : 0.0 Min. : 0.0 Min. : 0.00 Min. : 0.0 1st Qu.: 141.5 1st Qu.: 4.0 1st Qu.: 0.00 1st Qu.: 0.0 Median : 1600.0 Median : 49.0 Median : 1.00 Median : 22.0 Mean : 34001.9 Mean : 1223.0 Mean : 28.96 Mean : 933.8 3rd Qu.: 9149.0 3rd Qu.: 419.5 3rd Qu.: 6.00 3rd Qu.: 221.0 Max. :2816444.0 Max. :56336.0 Max. :1076.00 Max. :33728.0 Deaths...100.Cases Recovered...100.Cases Deaths...100.Recovered Min. : 0.000 Min. : 0.00 Min. :0.00 1st Qu.: 0.945 1st Qu.: 48.77 1st Qu.:1.45 Median : 2.150 Median : 71.32 Median :3.62 Mean : 3.020 Mean : 64.82 Mean : Inf 3rd Qu.: 3.875 3rd Qu.: 86.89 3rd Qu.:6.44 Max. :28.560 Max. :100.00 Max. : Inf Confirmed.last.week X1.week.change X1.week...increase WHO.Region Min. : 10 Min. : -47 Min. : -3.840 Length:187 1st Qu.: 1052 1st Qu.: 49 1st Qu.: 2.775 Class :character Median : 5020 Median : 432 Median : 6.890 Mode :character Mean : 78682 Mean : 9448 Mean : 13.606 3rd Qu.: 37080 3rd Qu.: 3172 3rd Qu.: 16.855 Max. :3834677 Max. :455582 Max. :226.320
# Visual overview of missing values per column (visdat).
visdat::vis_miss(x = df)
Warning message: "`gather_()` was deprecated in tidyr 1.2.0. ℹ Please use `gather()` instead. ℹ The deprecated feature was likely used in the visdat package. Please report the issue at <https://github.com/ropensci/visdat/issues>."
# Drop the rows whose Deaths...100.Recovered ratio is infinite (summary(df)
# reports Mean/Max = Inf for this column). is.infinite() tests the numeric
# value directly instead of comparing against the string "Inf", and never
# yields NA, so it cannot introduce all-NA rows into the subset.
df <- df[!is.infinite(df$Deaths...100.Recovered), ]
Let's explore the dataset
# List the column names of the data frame.
print(names(df))
[1] "Country.Region" "Confirmed" "Deaths" [4] "Recovered" "Active" "New.cases" [7] "New.deaths" "New.recovered" "Deaths...100.Cases" [10] "Recovered...100.Cases" "Deaths...100.Recovered" "Confirmed.last.week" [13] "X1.week.change" "X1.week...increase" "WHO.Region"
# Box-plot statistics for Confirmed (stats, n, notch bounds, outliers).
boxplot.stats(df[["Confirmed"]])

# Confirmed per WHO region; coord_cartesian() zooms the y axis to
# [0, 120000] without discarding observations.
ggplot(data = df, mapping = aes(x = factor(WHO.Region), y = Confirmed)) +
  geom_boxplot(alpha = 0.2, fill = "slateblue") +
  coord_cartesian(ylim = c(0, 120000)) +
  labs(
    title = "Boxplots of confirmed cases divided by WHO region",
    x = "WHO.Region"
  ) +
  theme(
    axis.title = element_text(face = "bold", size = 14),
    axis.text = element_text(size = 6.5)
  )

# Overall box plot of Confirmed on the same zoomed y range.
ggplot(data = df, mapping = aes(y = Confirmed)) +
  geom_boxplot(alpha = 0.2, fill = "slateblue") +
  coord_cartesian(ylim = c(0, 120000)) +
  labs(title = "Boxplot of confirmed cases") +
  theme(
    axis.title = element_text(face = "bold", size = 14),
    axis.text = element_text(size = 6.5)
  )

# Histogram of Confirmed; xlim() removes values outside [0, 120000]
# before binning.
ggplot(data = df, mapping = aes(x = Confirmed)) +
  geom_histogram(
    position = "identity",
    binwidth = 4000,
    alpha = 0.9,
    color = "#e9ecef",
    fill = "#69b3a2"
  ) +
  xlim(0, 120000) +
  labs(title = "Histogram of confirmed cases") +
  theme(plot.title = element_text(size = 15))
Warning message: "Removed 20 rows containing non-finite values (`stat_bin()`)." Warning message: "Removed 2 rows containing missing values (`geom_bar()`)."
# Save the Confirmed plots under img/ as 5x5 inch PNGs (res = 300).

# Per-region box plot.
png("img/Confirmed_conditional_boxplot.png", width = 5, height = 5, units = "in", res = 300)
ggplot(data = df, mapping = aes(x = factor(WHO.Region), y = Confirmed)) +
  geom_boxplot(alpha = 0.2, fill = "slateblue") +
  coord_cartesian(ylim = c(0, 120000)) +
  labs(
    title = "Boxplots of confirmed cases divided by WHO region",
    x = "WHO.Region"
  ) +
  theme(
    axis.title = element_text(face = "bold", size = 14),
    axis.text = element_text(size = 6.5)
  )
dev.off()

# Overall box plot.
png("img/Confirmed_boxplot.png", width = 5, height = 5, units = "in", res = 300)
ggplot(data = df, mapping = aes(y = Confirmed)) +
  geom_boxplot(alpha = 0.2, fill = "slateblue") +
  coord_cartesian(ylim = c(0, 120000)) +
  labs(title = "Boxplot of confirmed cases") +
  theme(
    axis.title = element_text(face = "bold", size = 14),
    axis.text = element_text(size = 6.5)
  )
dev.off()

# Histogram; xlim() removes values outside [0, 120000] before binning.
png("img/Confirmed_histogram.png", width = 5, height = 5, units = "in", res = 300)
ggplot(data = df, mapping = aes(x = Confirmed)) +
  geom_histogram(
    binwidth = 4000,
    alpha = 0.9,
    color = "#e9ecef",
    fill = "#69b3a2"
  ) +
  xlim(0, 120000) +
  labs(title = "Histogram of confirmed cases") +
  theme(plot.title = element_text(size = 15))
dev.off()
Warning message: "Removed 20 rows containing non-finite values (`stat_bin()`)." Warning message: "Removed 2 rows containing missing values (`geom_bar()`)."
# Box-plot statistics for Deaths (stats, n, notch bounds, outliers).
boxplot.stats(df[["Deaths"]])

# Deaths per WHO region; coord_cartesian() zooms the y axis to [0, 5000]
# without discarding observations.
ggplot(data = df, mapping = aes(x = factor(WHO.Region), y = Deaths)) +
  geom_boxplot(alpha = 0.2, fill = "slateblue") +
  coord_cartesian(ylim = c(0, 5000)) +
  labs(
    title = "Boxplots of deaths cases divided by WHO region",
    x = "WHO.Region"
  ) +
  theme(
    axis.title = element_text(face = "bold", size = 14),
    axis.text = element_text(size = 6.5)
  )

# Overall box plot of Deaths on the same zoomed y range.
ggplot(data = df, mapping = aes(y = Deaths)) +
  geom_boxplot(alpha = 0.2, fill = "slateblue") +
  coord_cartesian(ylim = c(0, 5000)) +
  labs(title = "Boxplot of deaths cases") +
  theme(
    axis.title = element_text(face = "bold", size = 14),
    axis.text = element_text(size = 6.5)
  )

# Histogram of Deaths; xlim() removes values outside [0, 5000] before binning.
ggplot(data = df, mapping = aes(x = Deaths)) +
  geom_histogram(
    binwidth = 150,
    alpha = 0.9,
    color = "#e9ecef",
    fill = "#69b3a2"
  ) +
  xlim(0, 5000) +
  labs(title = "Histogram of deaths cases") +
  theme(plot.title = element_text(size = 15))
Warning message: "Removed 20 rows containing non-finite values (`stat_bin()`)." Warning message: "Removed 2 rows containing missing values (`geom_bar()`)."
# Save the Deaths plots under img/ as 5x5 inch PNGs (res = 300).

# Per-region box plot.
png("img/Deaths_conditional_boxplot.png", width = 5, height = 5, units = "in", res = 300)
ggplot(data = df, mapping = aes(x = factor(WHO.Region), y = Deaths)) +
  geom_boxplot(alpha = 0.2, fill = "slateblue") +
  coord_cartesian(ylim = c(0, 5000)) +
  labs(
    title = "Boxplots of deaths cases divided by WHO region",
    x = "WHO.Region"
  ) +
  theme(
    axis.title = element_text(face = "bold", size = 14),
    axis.text = element_text(size = 6.5)
  )
dev.off()

# Overall box plot.
png("img/Deaths_boxplot.png", width = 5, height = 5, units = "in", res = 300)
ggplot(data = df, mapping = aes(y = Deaths)) +
  geom_boxplot(alpha = 0.2, fill = "slateblue") +
  coord_cartesian(ylim = c(0, 5000)) +
  labs(title = "Boxplot of deaths cases") +
  theme(
    axis.title = element_text(face = "bold", size = 14),
    axis.text = element_text(size = 6.5)
  )
dev.off()

# Histogram; xlim() removes values outside [0, 5000] before binning.
png("img/Deaths_histogram.png", width = 5, height = 5, units = "in", res = 300)
ggplot(data = df, mapping = aes(x = Deaths)) +
  geom_histogram(
    binwidth = 150,
    alpha = 0.9,
    color = "#e9ecef",
    fill = "#69b3a2"
  ) +
  xlim(0, 5000) +
  labs(title = "Histogram of deaths cases") +
  theme(plot.title = element_text(size = 15))
dev.off()
Warning message: "Removed 20 rows containing non-finite values (`stat_bin()`)." Warning message: "Removed 2 rows containing missing values (`geom_bar()`)."
# Box-plot statistics for Recovered (stats, n, notch bounds, outliers).
boxplot.stats(df[["Recovered"]])

# Recovered per WHO region; coord_cartesian() zooms the y axis to
# [0, 70000] without discarding observations.
ggplot(data = df, mapping = aes(x = factor(WHO.Region), y = Recovered)) +
  geom_boxplot(alpha = 0.2, fill = "slateblue") +
  coord_cartesian(ylim = c(0, 70000)) +
  labs(
    title = "Boxplots of recovered cases divided by WHO region",
    x = "WHO.Region"
  ) +
  theme(
    axis.title = element_text(face = "bold", size = 14),
    axis.text = element_text(size = 6.5)
  )

# Overall box plot of Recovered on the same zoomed y range.
ggplot(data = df, mapping = aes(y = Recovered)) +
  geom_boxplot(alpha = 0.2, fill = "slateblue") +
  coord_cartesian(ylim = c(0, 70000)) +
  labs(title = "Boxplot of recovered cases") +
  theme(
    axis.title = element_text(face = "bold", size = 14),
    axis.text = element_text(size = 6.5)
  )

# Histogram of Recovered; xlim() removes values outside [0, 70000]
# before binning.
ggplot(data = df, mapping = aes(x = Recovered)) +
  geom_histogram(
    position = "identity",
    binwidth = 3000,
    alpha = 0.9,
    color = "#e9ecef",
    fill = "#69b3a2"
  ) +
  xlim(0, 70000) +
  labs(title = "Histogram of recovered cases") +
  theme(plot.title = element_text(size = 15))
Warning message: "Removed 22 rows containing non-finite values (`stat_bin()`)." Warning message: "Removed 2 rows containing missing values (`geom_bar()`)."
# Save the Recovered plots under img/ as 5x5 inch PNGs (res = 300).
# The titles of the overall box plot and the histogram previously read
# "confirmed cases" (copy-paste slip); they now name the plotted variable.

# Per-region box plot; coord_cartesian() zooms without discarding data.
png(filename = "img/Recovered_conditional_boxplot.png", units = "in", width = 5, height = 5, res = 300)
ggplot(df, aes(x = as.factor(WHO.Region), y = Recovered)) +
  geom_boxplot(fill = "slateblue", alpha = 0.2) +
  ggtitle("Boxplots of recovered cases divided by WHO region") +
  theme(
    axis.text = element_text(size = 6.5),
    axis.title = element_text(size = 14, face = "bold")
  ) +
  coord_cartesian(ylim = c(0, 70000)) +
  xlab("WHO.Region")
dev.off()

# Overall box plot.
png(filename = "img/Recovered_boxplot.png", units = "in", width = 5, height = 5, res = 300)
ggplot(df, aes(y = Recovered)) +
  geom_boxplot(fill = "slateblue", alpha = 0.2) +
  ggtitle("Boxplot of recovered cases") +
  theme(
    axis.text = element_text(size = 6.5),
    axis.title = element_text(size = 14, face = "bold")
  ) +
  coord_cartesian(ylim = c(0, 70000))
dev.off()

# Histogram; xlim() removes values outside [0, 70000] before binning.
png(filename = "img/Recovered_histogram.png", units = "in", width = 5, height = 5, res = 300)
ggplot(df, aes(Recovered)) +
  geom_histogram(binwidth = 3000, fill = "#69b3a2", color = "#e9ecef", alpha = 0.9) +
  ggtitle("Histogram of recovered cases") +
  theme(plot.title = element_text(size = 15)) +
  xlim(0, 70000)
dev.off()
Warning message: "Removed 22 rows containing non-finite values (`stat_bin()`)." Warning message: "Removed 2 rows containing missing values (`geom_bar()`)."
# Box-plot statistics for Active (stats, n, notch bounds, outliers).
boxplot.stats(df[["Active"]])

# Active per WHO region; coord_cartesian() zooms the y axis to [0, 40000]
# without discarding observations.
ggplot(data = df, mapping = aes(x = factor(WHO.Region), y = Active)) +
  geom_boxplot(alpha = 0.2, fill = "slateblue") +
  coord_cartesian(ylim = c(0, 40000)) +
  labs(
    title = "Boxplots of active cases divided by WHO region",
    x = "WHO.Region"
  ) +
  theme(
    axis.title = element_text(face = "bold", size = 14),
    axis.text = element_text(size = 6.5)
  )

# Overall box plot of Active on the same zoomed y range.
ggplot(data = df, mapping = aes(y = Active)) +
  geom_boxplot(alpha = 0.2, fill = "slateblue") +
  coord_cartesian(ylim = c(0, 40000)) +
  labs(title = "Boxplot of active cases") +
  theme(
    axis.title = element_text(face = "bold", size = 14),
    axis.text = element_text(size = 6.5)
  )

# Histogram of Active; xlim() removes values outside [0, 40000] before binning.
ggplot(data = df, mapping = aes(x = Active)) +
  geom_histogram(
    position = "identity",
    binwidth = 2500,
    alpha = 0.9,
    color = "#e9ecef",
    fill = "#69b3a2"
  ) +
  xlim(0, 40000) +
  labs(title = "Histogram of active cases") +
  theme(plot.title = element_text(size = 15))
Warning message: "Removed 19 rows containing non-finite values (`stat_bin()`)." Warning message: "Removed 2 rows containing missing values (`geom_bar()`)."
# Save the Active plots under img/ as 5x5 inch PNGs (res = 300).

# Per-region box plot.
png("img/Active_conditional_boxplot.png", width = 5, height = 5, units = "in", res = 300)
ggplot(data = df, mapping = aes(x = factor(WHO.Region), y = Active)) +
  geom_boxplot(alpha = 0.2, fill = "slateblue") +
  coord_cartesian(ylim = c(0, 40000)) +
  labs(
    title = "Boxplots of active cases divided by WHO region",
    x = "WHO.Region"
  ) +
  theme(
    axis.title = element_text(face = "bold", size = 14),
    axis.text = element_text(size = 6.5)
  )
dev.off()

# Overall box plot.
png("img/Active_boxplot.png", width = 5, height = 5, units = "in", res = 300)
ggplot(data = df, mapping = aes(y = Active)) +
  geom_boxplot(alpha = 0.2, fill = "slateblue") +
  coord_cartesian(ylim = c(0, 40000)) +
  labs(title = "Boxplot of active cases") +
  theme(
    axis.title = element_text(face = "bold", size = 14),
    axis.text = element_text(size = 6.5)
  )
dev.off()

# Histogram; xlim() removes values outside [0, 40000] before binning.
png("img/Active_histogram.png", width = 5, height = 5, units = "in", res = 300)
ggplot(data = df, mapping = aes(x = Active)) +
  geom_histogram(
    binwidth = 2500,
    alpha = 0.9,
    color = "#e9ecef",
    fill = "#69b3a2"
  ) +
  xlim(0, 40000) +
  labs(title = "Histogram of active cases") +
  theme(plot.title = element_text(size = 15))
dev.off()
Warning message: "Removed 19 rows containing non-finite values (`stat_bin()`)." Warning message: "Removed 2 rows containing missing values (`geom_bar()`)."
# Box-plot statistics for New.cases (stats, n, notch bounds, outliers).
boxplot.stats(df[["New.cases"]])

# New.cases per WHO region; coord_cartesian() zooms the y axis to
# [0, 1600] without discarding observations.
ggplot(data = df, mapping = aes(x = factor(WHO.Region), y = New.cases)) +
  geom_boxplot(alpha = 0.2, fill = "slateblue") +
  coord_cartesian(ylim = c(0, 1600)) +
  labs(
    title = "Boxplots of new cases divided by WHO region",
    x = "WHO.Region"
  ) +
  theme(
    axis.title = element_text(face = "bold", size = 14),
    axis.text = element_text(size = 6.5)
  )

# Overall box plot of New.cases on the same zoomed y range.
ggplot(data = df, mapping = aes(y = New.cases)) +
  geom_boxplot(alpha = 0.2, fill = "slateblue") +
  coord_cartesian(ylim = c(0, 1600)) +
  labs(title = "Boxplot of new cases") +
  theme(
    axis.title = element_text(face = "bold", size = 14),
    axis.text = element_text(size = 6.5)
  )

# Histogram of New.cases; xlim() removes values outside [0, 1600]
# before binning.
ggplot(data = df, mapping = aes(x = New.cases)) +
  geom_histogram(
    position = "identity",
    binwidth = 70,
    alpha = 0.9,
    color = "#e9ecef",
    fill = "#69b3a2"
  ) +
  xlim(0, 1600) +
  labs(title = "Histogram of new cases") +
  theme(plot.title = element_text(size = 15))
Warning message: "Removed 17 rows containing non-finite values (`stat_bin()`)." Warning message: "Removed 2 rows containing missing values (`geom_bar()`)."
# Save the New.cases plots under img/ as 5x5 inch PNGs (res = 300).

# Per-region box plot.
png("img/New_cases_conditional_boxplot.png", width = 5, height = 5, units = "in", res = 300)
ggplot(data = df, mapping = aes(x = factor(WHO.Region), y = New.cases)) +
  geom_boxplot(alpha = 0.2, fill = "slateblue") +
  coord_cartesian(ylim = c(0, 1600)) +
  labs(
    title = "Boxplots of new cases divided by WHO region",
    x = "WHO.Region"
  ) +
  theme(
    axis.title = element_text(face = "bold", size = 14),
    axis.text = element_text(size = 6.5)
  )
dev.off()

# Overall box plot.
png("img/New_cases_boxplot.png", width = 5, height = 5, units = "in", res = 300)
ggplot(data = df, mapping = aes(y = New.cases)) +
  geom_boxplot(alpha = 0.2, fill = "slateblue") +
  coord_cartesian(ylim = c(0, 1600)) +
  labs(title = "Boxplot of new cases") +
  theme(
    axis.title = element_text(face = "bold", size = 14),
    axis.text = element_text(size = 6.5)
  )
dev.off()

# Histogram; xlim() removes values outside [0, 1600] before binning.
png("img/New_cases_histogram.png", width = 5, height = 5, units = "in", res = 300)
ggplot(data = df, mapping = aes(x = New.cases)) +
  geom_histogram(
    binwidth = 70,
    alpha = 0.9,
    color = "#e9ecef",
    fill = "#69b3a2"
  ) +
  xlim(0, 1600) +
  labs(title = "Histogram of new cases") +
  theme(plot.title = element_text(size = 15))
dev.off()
Warning message: "Removed 17 rows containing non-finite values (`stat_bin()`)." Warning message: "Removed 2 rows containing missing values (`geom_bar()`)."
# Box-plot statistics for New.deaths (stats, n, notch bounds, outliers).
boxplot.stats(df[["New.deaths"]])

# New.deaths per WHO region; coord_cartesian() zooms the y axis to
# [0, 60] without discarding observations.
ggplot(data = df, mapping = aes(x = factor(WHO.Region), y = New.deaths)) +
  geom_boxplot(alpha = 0.2, fill = "slateblue") +
  coord_cartesian(ylim = c(0, 60)) +
  labs(
    title = "Boxplots of new deaths cases divided by WHO region",
    x = "WHO.Region"
  ) +
  theme(
    axis.title = element_text(face = "bold", size = 14),
    axis.text = element_text(size = 6.5)
  )

# Overall box plot of New.deaths on the same zoomed y range.
ggplot(data = df, mapping = aes(y = New.deaths)) +
  geom_boxplot(alpha = 0.2, fill = "slateblue") +
  coord_cartesian(ylim = c(0, 60)) +
  labs(title = "Boxplot of new deaths cases") +
  theme(
    axis.title = element_text(face = "bold", size = 14),
    axis.text = element_text(size = 6.5)
  )

# Histogram of New.deaths; xlim() removes values outside [0, 60]
# before binning.
ggplot(data = df, mapping = aes(x = New.deaths)) +
  geom_histogram(
    position = "identity",
    binwidth = 5,
    alpha = 0.9,
    color = "#e9ecef",
    fill = "#69b3a2"
  ) +
  xlim(0, 60) +
  labs(title = "Histogram of new deaths cases") +
  theme(plot.title = element_text(size = 15))
Warning message: "Removed 13 rows containing non-finite values (`stat_bin()`)." Warning message: "Removed 2 rows containing missing values (`geom_bar()`)."
# Save the New.deaths plots under img/ as 5x5 inch PNGs (res = 300).

# Per-region box plot.
png("img/New_deaths_conditional_boxplot.png", width = 5, height = 5, units = "in", res = 300)
ggplot(data = df, mapping = aes(x = factor(WHO.Region), y = New.deaths)) +
  geom_boxplot(alpha = 0.2, fill = "slateblue") +
  coord_cartesian(ylim = c(0, 60)) +
  labs(
    title = "Boxplots of new deaths cases divided by WHO region",
    x = "WHO.Region"
  ) +
  theme(
    axis.title = element_text(face = "bold", size = 14),
    axis.text = element_text(size = 6.5)
  )
dev.off()

# Overall box plot.
png("img/New_deaths_boxplot.png", width = 5, height = 5, units = "in", res = 300)
ggplot(data = df, mapping = aes(y = New.deaths)) +
  geom_boxplot(alpha = 0.2, fill = "slateblue") +
  coord_cartesian(ylim = c(0, 60)) +
  labs(title = "Boxplot of new deaths cases") +
  theme(
    axis.title = element_text(face = "bold", size = 14),
    axis.text = element_text(size = 6.5)
  )
dev.off()

# Histogram; xlim() removes values outside [0, 60] before binning.
png("img/New_deaths_histogram.png", width = 5, height = 5, units = "in", res = 300)
ggplot(data = df, mapping = aes(x = New.deaths)) +
  geom_histogram(
    binwidth = 5,
    alpha = 0.9,
    color = "#e9ecef",
    fill = "#69b3a2"
  ) +
  xlim(0, 60) +
  labs(title = "Histogram of new deaths cases") +
  theme(plot.title = element_text(size = 15))
dev.off()
Warning message: "Removed 13 rows containing non-finite values (`stat_bin()`)." Warning message: "Removed 2 rows containing missing values (`geom_bar()`)."
# Box-plot statistics for New.recovered (stats, n, notch bounds, outliers).
boxplot.stats(df[["New.recovered"]])

# New.recovered per WHO region; coord_cartesian() zooms the y axis to
# [0, 1700] without discarding observations.
ggplot(data = df, mapping = aes(x = factor(WHO.Region), y = New.recovered)) +
  geom_boxplot(alpha = 0.2, fill = "slateblue") +
  coord_cartesian(ylim = c(0, 1700)) +
  labs(
    title = "Boxplots of new recovered cases divided by WHO region",
    x = "WHO.Region"
  ) +
  theme(
    axis.title = element_text(face = "bold", size = 14),
    axis.text = element_text(size = 6.5)
  )

# Overall box plot of New.recovered on the same zoomed y range.
ggplot(data = df, mapping = aes(y = New.recovered)) +
  geom_boxplot(alpha = 0.2, fill = "slateblue") +
  coord_cartesian(ylim = c(0, 1700)) +
  labs(title = "Boxplot of new recovered cases") +
  theme(
    axis.title = element_text(face = "bold", size = 14),
    axis.text = element_text(size = 6.5)
  )

# Histogram of New.recovered; xlim() removes values outside [0, 1700]
# before binning.
ggplot(data = df, mapping = aes(x = New.recovered)) +
  geom_histogram(
    position = "identity",
    binwidth = 80,
    alpha = 0.9,
    color = "#e9ecef",
    fill = "#69b3a2"
  ) +
  xlim(0, 1700) +
  labs(title = "Histogram of new recovered cases") +
  theme(plot.title = element_text(size = 15))
Warning message: "Removed 17 rows containing non-finite values (`stat_bin()`)." Warning message: "Removed 2 rows containing missing values (`geom_bar()`)."
# Save the New.recovered plots under img/ as 5x5 inch PNGs (res = 300).

# Per-region box plot.
png("img/New_recovered_conditional_boxplot.png", width = 5, height = 5, units = "in", res = 300)
ggplot(data = df, mapping = aes(x = factor(WHO.Region), y = New.recovered)) +
  geom_boxplot(alpha = 0.2, fill = "slateblue") +
  coord_cartesian(ylim = c(0, 1700)) +
  labs(
    title = "Boxplots of new recovered cases divided by WHO region",
    x = "WHO.Region"
  ) +
  theme(
    axis.title = element_text(face = "bold", size = 14),
    axis.text = element_text(size = 6.5)
  )
dev.off()

# Overall box plot.
png("img/New_recovered_boxplot.png", width = 5, height = 5, units = "in", res = 300)
ggplot(data = df, mapping = aes(y = New.recovered)) +
  geom_boxplot(alpha = 0.2, fill = "slateblue") +
  coord_cartesian(ylim = c(0, 1700)) +
  labs(title = "Boxplot of new recovered cases") +
  theme(
    axis.title = element_text(face = "bold", size = 14),
    axis.text = element_text(size = 6.5)
  )
dev.off()

# Histogram; xlim() removes values outside [0, 1700] before binning.
png("img/New_recovered_histogram.png", width = 5, height = 5, units = "in", res = 300)
ggplot(data = df, mapping = aes(x = New.recovered)) +
  geom_histogram(
    binwidth = 80,
    alpha = 0.9,
    color = "#e9ecef",
    fill = "#69b3a2"
  ) +
  xlim(0, 1700) +
  labs(title = "Histogram of new recovered cases") +
  theme(plot.title = element_text(size = 15))
dev.off()
Warning message: "Removed 17 rows containing non-finite values (`stat_bin()`)." Warning message: "Removed 2 rows containing missing values (`geom_bar()`)."
# Box-plot statistics for Deaths...100.Cases (stats, n, notch bounds, outliers).
boxplot.stats(df[["Deaths...100.Cases"]])

# Deaths...100.Cases per WHO region; coord_cartesian() zooms the y axis to
# [0, 10] without discarding observations.
ggplot(data = df, mapping = aes(x = factor(WHO.Region), y = Deaths...100.Cases)) +
  geom_boxplot(alpha = 0.2, fill = "slateblue") +
  coord_cartesian(ylim = c(0, 10)) +
  labs(
    title = "Boxplots of deaths over 100 cases divided by WHO region",
    x = "WHO.Region"
  ) +
  theme(
    axis.title = element_text(face = "bold", size = 14),
    axis.text = element_text(size = 6.5)
  )

# Overall box plot of Deaths...100.Cases on the same zoomed y range.
ggplot(data = df, mapping = aes(y = Deaths...100.Cases)) +
  geom_boxplot(alpha = 0.2, fill = "slateblue") +
  coord_cartesian(ylim = c(0, 10)) +
  labs(title = "Boxplot of deaths over 100 cases") +
  theme(
    axis.title = element_text(face = "bold", size = 14),
    axis.text = element_text(size = 6.5)
  )

# Histogram of Deaths...100.Cases; xlim() removes values outside [0, 10]
# before binning.
ggplot(data = df, mapping = aes(x = Deaths...100.Cases)) +
  geom_histogram(
    position = "identity",
    binwidth = 0.6,
    alpha = 0.9,
    color = "#e9ecef",
    fill = "#69b3a2"
  ) +
  xlim(0, 10) +
  labs(title = "Histogram of deaths over 100 cases") +
  theme(plot.title = element_text(size = 15))
Warning message: "Removed 9 rows containing non-finite values (`stat_bin()`)." Warning message: "Removed 2 rows containing missing values (`geom_bar()`)."
# Save the Deaths...100.Cases plots under img/ as 5x5 inch PNGs (res = 300).

# Per-region box plot.
png("img/Deaths_over_100_cases_conditional_boxplot.png", width = 5, height = 5, units = "in", res = 300)
ggplot(data = df, mapping = aes(x = factor(WHO.Region), y = Deaths...100.Cases)) +
  geom_boxplot(alpha = 0.2, fill = "slateblue") +
  coord_cartesian(ylim = c(0, 10)) +
  labs(
    title = "Boxplots of deaths over 100 cases divided by WHO region",
    x = "WHO.Region"
  ) +
  theme(
    axis.title = element_text(face = "bold", size = 14),
    axis.text = element_text(size = 6.5)
  )
dev.off()

# Overall box plot.
png("img/Deaths_over_100_cases_boxplot.png", width = 5, height = 5, units = "in", res = 300)
ggplot(data = df, mapping = aes(y = Deaths...100.Cases)) +
  geom_boxplot(alpha = 0.2, fill = "slateblue") +
  coord_cartesian(ylim = c(0, 10)) +
  labs(title = "Boxplot of deaths over 100 cases") +
  theme(
    axis.title = element_text(face = "bold", size = 14),
    axis.text = element_text(size = 6.5)
  )
dev.off()

# Histogram; xlim() removes values outside [0, 10] before binning.
png("img/Deaths_over_100_cases_histogram.png", width = 5, height = 5, units = "in", res = 300)
ggplot(data = df, mapping = aes(x = Deaths...100.Cases)) +
  geom_histogram(
    binwidth = 0.6,
    alpha = 0.9,
    color = "#e9ecef",
    fill = "#69b3a2"
  ) +
  xlim(0, 10) +
  labs(title = "Histogram of deaths over 100 cases") +
  theme(plot.title = element_text(size = 15))
dev.off()
Warning message: "Removed 9 rows containing non-finite values (`stat_bin()`)." Warning message: "Removed 2 rows containing missing values (`geom_bar()`)."
# Box-plot statistics for Recovered...100.Cases (stats, n, notch bounds,
# outliers).
boxplot.stats(df[["Recovered...100.Cases"]])

# Recovered...100.Cases per WHO region; coord_cartesian() zooms the y axis
# to [0, 110] without discarding observations.
ggplot(data = df, mapping = aes(x = factor(WHO.Region), y = Recovered...100.Cases)) +
  geom_boxplot(alpha = 0.2, fill = "slateblue") +
  coord_cartesian(ylim = c(0, 110)) +
  labs(
    title = "Boxplots of recovered over 100 cases divided by WHO region",
    x = "WHO.Region"
  ) +
  theme(
    axis.title = element_text(face = "bold", size = 14),
    axis.text = element_text(size = 6.5)
  )

# Overall box plot of Recovered...100.Cases on the same zoomed y range.
ggplot(data = df, mapping = aes(y = Recovered...100.Cases)) +
  geom_boxplot(alpha = 0.2, fill = "slateblue") +
  coord_cartesian(ylim = c(0, 110)) +
  labs(title = "Boxplot of recovered over 100 cases") +
  theme(
    axis.title = element_text(face = "bold", size = 14),
    axis.text = element_text(size = 6.5)
  )

# Histogram of Recovered...100.Cases with the x scale limited to [0, 110].
ggplot(data = df, mapping = aes(x = Recovered...100.Cases)) +
  geom_histogram(
    position = "identity",
    binwidth = 8,
    alpha = 0.9,
    color = "#e9ecef",
    fill = "#69b3a2"
  ) +
  xlim(0, 110) +
  labs(title = "Histogram of recovered over 100 cases") +
  theme(plot.title = element_text(size = 15))
Warning message: "Removed 2 rows containing missing values (`geom_bar()`)."
# Save the Recovered...100.Cases plots under img/ as 5x5 inch PNGs (res = 300).
# The overall box plot previously mapped y to Confirmed (copy-paste slip),
# so the saved file did not show the variable named in its title and
# file name; it now plots Recovered...100.Cases.

# Per-region box plot; coord_cartesian() zooms without discarding data.
png(filename = "img/Recovered_over_100_cases_conditional_boxplot.png", units = "in", width = 5, height = 5, res = 300)
ggplot(df, aes(x = as.factor(WHO.Region), y = Recovered...100.Cases)) +
  geom_boxplot(fill = "slateblue", alpha = 0.2) +
  ggtitle("Boxplots of recovered over 100 cases divided by WHO region") +
  theme(
    axis.text = element_text(size = 6.5),
    axis.title = element_text(size = 14, face = "bold")
  ) +
  coord_cartesian(ylim = c(0, 110)) +
  xlab("WHO.Region")
dev.off()

# Overall box plot.
png(filename = "img/Recovered_over_100_cases_boxplot.png", units = "in", width = 5, height = 5, res = 300)
ggplot(df, aes(y = Recovered...100.Cases)) +
  geom_boxplot(fill = "slateblue", alpha = 0.2) +
  ggtitle("Boxplot of recovered over 100 cases") +
  theme(
    axis.text = element_text(size = 6.5),
    axis.title = element_text(size = 14, face = "bold")
  ) +
  coord_cartesian(ylim = c(0, 110))
dev.off()

# Histogram with the x scale limited to [0, 110].
png(filename = "img/Recovered_over_100_cases_histogram.png", units = "in", width = 5, height = 5, res = 300)
ggplot(df, aes(Recovered...100.Cases)) +
  geom_histogram(binwidth = 8, fill = "#69b3a2", color = "#e9ecef", alpha = 0.9) +
  ggtitle("Histogram of recovered over 100 cases") +
  theme(plot.title = element_text(size = 15)) +
  xlim(0, 110)
dev.off()
Warning message: "Removed 2 rows containing missing values (`geom_bar()`)."
# Exploration of Deaths...100.Recovered.
# This section previously repeated Deaths...100.Cases (already explored
# earlier in the file) and one title said "recovered over 100 cases".
# Following the column order of the data frame, it now covers
# Deaths...100.Recovered, which was otherwise never plotted.
# NOTE(review): confirm this is the intended variable for the 0-15 range.

# Box-plot statistics (stats, n, notch bounds, outliers).
boxplot.stats(df$Deaths...100.Recovered)

# Per-region box plot; coord_cartesian() zooms the y axis to [0, 15]
# without discarding observations.
ggplot(df, aes(x = as.factor(WHO.Region), y = Deaths...100.Recovered)) +
  geom_boxplot(fill = "slateblue", alpha = 0.2) +
  ggtitle("Boxplots of deaths over 100 recovered divided by WHO region") +
  theme(
    axis.text = element_text(size = 6.5),
    axis.title = element_text(size = 14, face = "bold")
  ) +
  coord_cartesian(ylim = c(0, 15)) +
  xlab("WHO.Region")

# Overall box plot on the same zoomed y range.
ggplot(df, aes(y = Deaths...100.Recovered)) +
  geom_boxplot(fill = "slateblue", alpha = 0.2) +
  ggtitle("Boxplot of deaths over 100 recovered") +
  theme(
    axis.text = element_text(size = 6.5),
    axis.title = element_text(size = 14, face = "bold")
  ) +
  coord_cartesian(ylim = c(0, 15))

# Histogram; xlim() removes values outside [0, 15] before binning.
ggplot(df, aes(x = Deaths...100.Recovered)) +
  geom_histogram(binwidth = 0.8, fill = "#69b3a2", color = "#e9ecef", alpha = 0.9, position = 'identity') +
  ggtitle("Histogram of deaths over 100 recovered") +
  theme(plot.title = element_text(size = 15)) +
  xlim(0, 15)
Warning message: "Removed 2 rows containing non-finite values (`stat_bin()`)." Warning message: "Removed 2 rows containing missing values (`geom_bar()`)."
# Save the Deaths...100.Recovered plots under img/ as 5x5 inch PNGs (res = 300).
# This cell previously re-plotted Deaths...100.Cases and wrote to the same
# img/Deaths_over_100_cases_*.png paths used earlier in the file, silently
# overwriting those images. It now plots Deaths...100.Recovered and writes
# to its own file names.
# NOTE(review): update any report that referenced the overwritten files.

# Per-region box plot; coord_cartesian() zooms without discarding data.
png(filename = "img/Deaths_over_100_recovered_conditional_boxplot.png", units = "in", width = 5, height = 5, res = 300)
ggplot(df, aes(x = as.factor(WHO.Region), y = Deaths...100.Recovered)) +
  geom_boxplot(fill = "slateblue", alpha = 0.2) +
  ggtitle("Boxplots of deaths over 100 recovered divided by WHO region") +
  theme(
    axis.text = element_text(size = 6.5),
    axis.title = element_text(size = 14, face = "bold")
  ) +
  coord_cartesian(ylim = c(0, 15)) +
  xlab("WHO.Region")
dev.off()

# Overall box plot.
png(filename = "img/Deaths_over_100_recovered_boxplot.png", units = "in", width = 5, height = 5, res = 300)
ggplot(df, aes(y = Deaths...100.Recovered)) +
  geom_boxplot(fill = "slateblue", alpha = 0.2) +
  ggtitle("Boxplot of deaths over 100 recovered") +
  theme(
    axis.text = element_text(size = 6.5),
    axis.title = element_text(size = 14, face = "bold")
  ) +
  coord_cartesian(ylim = c(0, 15))
dev.off()

# Histogram; xlim() removes values outside [0, 15] before binning.
png(filename = "img/Deaths_over_100_recovered_histogram.png", units = "in", width = 5, height = 5, res = 300)
ggplot(df, aes(Deaths...100.Recovered)) +
  geom_histogram(binwidth = 0.8, fill = "#69b3a2", color = "#e9ecef", alpha = 0.9) +
  ggtitle("Histogram of deaths over 100 recovered") +
  theme(plot.title = element_text(size = 15)) +
  xlim(0, 15)
dev.off()
Warning message: "Removed 2 rows containing non-finite values (`stat_bin()`)." Warning message: "Removed 2 rows containing missing values (`geom_bar()`)."
# Box-plot statistics for Confirmed.last.week (stats, n, notch bounds,
# outliers).
boxplot.stats(df[["Confirmed.last.week"]])

# Confirmed.last.week per WHO region; coord_cartesian() zooms the y axis
# to [0, 120000] without discarding observations.
ggplot(data = df, mapping = aes(x = factor(WHO.Region), y = Confirmed.last.week)) +
  geom_boxplot(alpha = 0.2, fill = "slateblue") +
  coord_cartesian(ylim = c(0, 120000)) +
  labs(
    title = "Boxplots of confirmed last week cases divided by WHO region",
    x = "WHO.Region"
  ) +
  theme(
    axis.title = element_text(face = "bold", size = 14),
    axis.text = element_text(size = 6.5)
  )

# Overall box plot of Confirmed.last.week on the same zoomed y range.
ggplot(data = df, mapping = aes(y = Confirmed.last.week)) +
  geom_boxplot(alpha = 0.2, fill = "slateblue") +
  coord_cartesian(ylim = c(0, 120000)) +
  labs(title = "Boxplot of confirmed last week cases") +
  theme(
    axis.title = element_text(face = "bold", size = 14),
    axis.text = element_text(size = 6.5)
  )

# Histogram of Confirmed.last.week; xlim() removes values outside
# [0, 120000] before binning.
ggplot(data = df, mapping = aes(x = Confirmed.last.week)) +
  geom_histogram(
    position = "identity",
    binwidth = 5000,
    alpha = 0.9,
    color = "#e9ecef",
    fill = "#69b3a2"
  ) +
  xlim(0, 120000) +
  labs(title = "Histogram of confirmed last week cases") +
  theme(plot.title = element_text(size = 15))
Warning message: "Removed 20 rows containing non-finite values (`stat_bin()`)." Warning message: "Removed 2 rows containing missing values (`geom_bar()`)."
# Save the Confirmed.last.week plots under img/ as 5x5 inch PNGs (res = 300).

# Per-region box plot.
png("img/Confirmed_last_week_conditional_boxplot.png", width = 5, height = 5, units = "in", res = 300)
ggplot(data = df, mapping = aes(x = factor(WHO.Region), y = Confirmed.last.week)) +
  geom_boxplot(alpha = 0.2, fill = "slateblue") +
  coord_cartesian(ylim = c(0, 120000)) +
  labs(
    title = "Boxplots of confirmed last week cases divided by WHO region",
    x = "WHO.Region"
  ) +
  theme(
    axis.title = element_text(face = "bold", size = 14),
    axis.text = element_text(size = 6.5)
  )
dev.off()

# Overall box plot.
png("img/Confirmed_last_week_boxplot.png", width = 5, height = 5, units = "in", res = 300)
ggplot(data = df, mapping = aes(y = Confirmed.last.week)) +
  geom_boxplot(alpha = 0.2, fill = "slateblue") +
  coord_cartesian(ylim = c(0, 120000)) +
  labs(title = "Boxplot of confirmed last week cases") +
  theme(
    axis.title = element_text(face = "bold", size = 14),
    axis.text = element_text(size = 6.5)
  )
dev.off()

# Histogram; xlim() removes values outside [0, 120000] before binning.
png("img/Confirmed_last_week_histogram.png", width = 5, height = 5, units = "in", res = 300)
ggplot(data = df, mapping = aes(x = Confirmed.last.week)) +
  geom_histogram(
    binwidth = 5000,
    alpha = 0.9,
    color = "#e9ecef",
    fill = "#69b3a2"
  ) +
  xlim(0, 120000) +
  labs(title = "Histogram of confirmed last week cases") +
  theme(plot.title = element_text(size = 15))
dev.off()
Warning message: "Removed 20 rows containing non-finite values (`stat_bin()`)." Warning message: "Removed 2 rows containing missing values (`geom_bar()`)."
# Box-plot statistics for X1.week.change (stats, n, notch bounds, outliers).
boxplot.stats(df[["X1.week.change"]])

# X1.week.change per WHO region; coord_cartesian() zooms the y axis to
# [0, 11000] without discarding observations.
ggplot(data = df, mapping = aes(x = factor(WHO.Region), y = X1.week.change)) +
  geom_boxplot(alpha = 0.2, fill = "slateblue") +
  coord_cartesian(ylim = c(0, 11000)) +
  labs(
    title = "Boxplots of 1 week change cases divided by WHO region",
    x = "WHO.Region"
  ) +
  theme(
    axis.title = element_text(face = "bold", size = 14),
    axis.text = element_text(size = 6.5)
  )

# Overall box plot of X1.week.change on the same zoomed y range.
ggplot(data = df, mapping = aes(y = X1.week.change)) +
  geom_boxplot(alpha = 0.2, fill = "slateblue") +
  coord_cartesian(ylim = c(0, 11000)) +
  labs(title = "Boxplot of 1 week change cases") +
  theme(
    axis.title = element_text(face = "bold", size = 14),
    axis.text = element_text(size = 6.5)
  )

# Histogram of X1.week.change; xlim() removes values outside [0, 11000]
# before binning, which also excludes any negative weekly changes.
ggplot(data = df, mapping = aes(x = X1.week.change)) +
  geom_histogram(
    position = "identity",
    binwidth = 500,
    alpha = 0.9,
    color = "#e9ecef",
    fill = "#69b3a2"
  ) +
  xlim(0, 11000) +
  labs(title = "Histogram of 1 week change cases") +
  theme(plot.title = element_text(size = 15))
Warning message: "Removed 19 rows containing non-finite values (`stat_bin()`)." Warning message: "Removed 2 rows containing missing values (`geom_bar()`)."
# Save the X1.week.change plots under img/ as 5x5 inch PNGs (res = 300).

# Per-region box plot.
png("img/1_week_change_conditional_boxplot.png", width = 5, height = 5, units = "in", res = 300)
ggplot(data = df, mapping = aes(x = factor(WHO.Region), y = X1.week.change)) +
  geom_boxplot(alpha = 0.2, fill = "slateblue") +
  coord_cartesian(ylim = c(0, 11000)) +
  labs(
    title = "Boxplots of 1 week change cases divided by WHO region",
    x = "WHO.Region"
  ) +
  theme(
    axis.title = element_text(face = "bold", size = 14),
    axis.text = element_text(size = 6.5)
  )
dev.off()

# Overall box plot.
png("img/1_week_change_boxplot.png", width = 5, height = 5, units = "in", res = 300)
ggplot(data = df, mapping = aes(y = X1.week.change)) +
  geom_boxplot(alpha = 0.2, fill = "slateblue") +
  coord_cartesian(ylim = c(0, 11000)) +
  labs(title = "Boxplot of 1 week change cases") +
  theme(
    axis.title = element_text(face = "bold", size = 14),
    axis.text = element_text(size = 6.5)
  )
dev.off()

# Histogram; xlim() removes values outside [0, 11000] before binning.
png("img/1_week_change_histogram.png", width = 5, height = 5, units = "in", res = 300)
ggplot(data = df, mapping = aes(x = X1.week.change)) +
  geom_histogram(
    binwidth = 500,
    alpha = 0.9,
    color = "#e9ecef",
    fill = "#69b3a2"
  ) +
  xlim(0, 11000) +
  labs(title = "Histogram of 1 week change cases") +
  theme(plot.title = element_text(size = 15))
dev.off()
Warning message: "Removed 19 rows containing non-finite values (`stat_bin()`)." Warning message: "Removed 2 rows containing missing values (`geom_bar()`)."
boxplot.stats(df$X1.week...increase)
ggplot(df, aes(x=as.factor(WHO.Region), y=X1.week...increase)) +
geom_boxplot(fill="slateblue", alpha=0.2) +
ggtitle("Boxplots of 1 week increase cases divided by WHO region") +
theme(
axis.text=element_text(size=6.5),
axis.title=element_text(size=14, face="bold")
) +
coord_cartesian(ylim = c(0, 50)) +
xlab("WHO.Region")
# Single boxplot of the one-week increase over all countries, with the y axis
# zoomed to [0, 50].
ggplot(data = df, mapping = aes(y = X1.week...increase)) +
  geom_boxplot(alpha = 0.2, fill = "slateblue") +
  coord_cartesian(ylim = c(0, 50)) +
  labs(title = "Boxplot of 1 week increase cases") +
  theme(
    axis.title = element_text(face = "bold", size = 14),
    axis.text = element_text(size = 6.5)
  )
# Histogram of the one-week increase with 3-unit bins; the x scale is limited
# to [0, 50], so values outside that range are dropped with a warning.
ggplot(data = df, mapping = aes(x = X1.week...increase)) +
  geom_histogram(
    binwidth = 3,
    position = "identity",
    alpha = 0.9,
    color = "#e9ecef",
    fill = "#69b3a2"
  ) +
  scale_x_continuous(limits = c(0, 50)) +
  labs(title = "Histogram of 1 week increase cases") +
  theme(plot.title = element_text(size = 15))
Warning message: "Removed 5 rows containing non-finite values (`stat_bin()`)." Warning message: "Removed 2 rows containing missing values (`geom_bar()`)."
# Write the per-WHO-region boxplots of the one-week increase to a PNG, with the
# y axis zoomed to [0, 50].
png("img/1_week_increase_conditional_boxplot.png",
    width = 5, height = 5, units = "in", res = 300)
ggplot(
  data = df,
  mapping = aes(x = as.factor(WHO.Region), y = X1.week...increase)
) +
  geom_boxplot(alpha = 0.2, fill = "slateblue") +
  coord_cartesian(ylim = c(0, 50)) +
  labs(
    title = "Boxplots of 1 week increase cases divided by WHO region",
    x = "WHO.Region"
  ) +
  theme(
    axis.title = element_text(face = "bold", size = 14),
    axis.text = element_text(size = 6.5)
  )
dev.off()
# Write a single boxplot of the one-week increase (all countries) to a PNG,
# with the y axis zoomed to [0, 50].
png("img/1_week_increase_boxplot.png",
    width = 5, height = 5, units = "in", res = 300)
ggplot(data = df, mapping = aes(y = X1.week...increase)) +
  geom_boxplot(alpha = 0.2, fill = "slateblue") +
  coord_cartesian(ylim = c(0, 50)) +
  labs(title = "Boxplot of 1 week increase cases") +
  theme(
    axis.title = element_text(face = "bold", size = 14),
    axis.text = element_text(size = 6.5)
  )
dev.off()
# Write the histogram of the one-week increase to a PNG.
# The plotted variable must be X1.week...increase (not Confirmed) so the saved
# image matches its file name, its title and the histogram shown on screen;
# with Confirmed almost every row fell outside the [0, 50] x limits.
png(filename = "img/1_week_increase_histogram.png",
    units = "in", width = 5, height = 5, res = 300)
ggplot(df, aes(x = X1.week...increase)) +
  geom_histogram(binwidth = 3, fill = "#69b3a2", color = "#e9ecef",
                 alpha = 0.9) +
  ggtitle("Histogram of 1 week increase cases") +
  theme(plot.title = element_text(size = 15)) +
  xlim(0, 50)
dev.off()
Warning message: "Removed 171 rows containing non-finite values (`stat_bin()`)." Warning message: "Removed 2 rows containing missing values (`geom_bar()`)."
# Number of countries in each WHO region.
table(df[["WHO.Region"]])
Africa Americas Eastern Mediterranean 47 34 21 Europe South-East Asia Western Pacific 54 10 16
# Write the Confirmed-vs-Deaths scatter plot (coloured by WHO region, with
# linear fits and confidence bands) to a PNG. Both axes are limited, so points
# outside the limits are dropped before the fit is computed.
png("img/Scatter_confirmed_vs_deaths.png",
    width = 5, height = 5, units = "in", res = 300)
ggplot(
  data = df,
  mapping = aes(x = Confirmed, y = Deaths, color = WHO.Region)
) +
  geom_point(size = 3) +
  geom_smooth(method = "lm", se = TRUE, color = "red", fill = "#69b3a2") +
  scale_x_continuous(limits = c(0, 100000)) +
  scale_y_continuous(limits = c(0, 4000)) +
  theme(legend.position = c(0.8, 0.8))
dev.off()
# Confirmed-vs-Deaths scatter plot coloured by WHO region, with linear fits
# and confidence bands; axes limited to [0, 100000] x [0, 4000].
ggplot(
  data = df,
  mapping = aes(x = Confirmed, y = Deaths, color = WHO.Region)
) +
  geom_point(size = 3) +
  geom_smooth(method = "lm", se = TRUE, color = "red", fill = "#69b3a2") +
  scale_x_continuous(limits = c(0, 100000)) +
  scale_y_continuous(limits = c(0, 4000)) +
  theme(legend.position = c(0.85, 0.85))
`geom_smooth()` using formula = 'y ~ x' Warning message: "Removed 28 rows containing non-finite values (`stat_smooth()`)." Warning message: "Removed 28 rows containing missing values (`geom_point()`)."
`geom_smooth()` using formula = 'y ~ x' Warning message: "Removed 28 rows containing non-finite values (`stat_smooth()`)." Warning message: "Removed 28 rows containing missing values (`geom_point()`)."
# Simple linear regression of Deaths on Confirmed, fitted on all rows of df.
confirmed_deaths_lm <- lm(Deaths ~ Confirmed, data = df)
summary(confirmed_deaths_lm)
Call: lm(formula = Deaths ~ Confirmed, data = df) Residuals: Min 1Q Median 3Q Max -17907 -574 -456 -435 35034 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 4.393e+02 3.864e+02 1.137 0.257 Confirmed 3.437e-02 9.720e-04 35.363 <2e-16 *** --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 Residual standard error: 5080 on 180 degrees of freedom Multiple R-squared: 0.8742, Adjusted R-squared: 0.8735 F-statistic: 1251 on 1 and 180 DF, p-value: < 2.2e-16
# Write the Confirmed-vs-Recovered scatter plot (coloured by WHO region, with
# linear fits and confidence bands) to a PNG. Both axes are limited to
# [0, 100000], so points outside are dropped before the fit is computed.
png("img/Scatter_confirmed_vs_recovered.png",
    width = 5, height = 5, units = "in", res = 300)
ggplot(
  data = df,
  mapping = aes(x = Confirmed, y = Recovered, color = WHO.Region)
) +
  geom_point(size = 3) +
  geom_smooth(method = "lm", se = TRUE, color = "red", fill = "#69b3a2") +
  scale_x_continuous(limits = c(0, 100000)) +
  scale_y_continuous(limits = c(0, 100000)) +
  theme(legend.position = c(0.8, 0.8))
dev.off()
# Confirmed-vs-Recovered scatter plot coloured by WHO region, with linear fits
# and confidence bands; both axes limited to [0, 100000].
ggplot(
  data = df,
  mapping = aes(x = Confirmed, y = Recovered, color = WHO.Region)
) +
  geom_point(size = 3) +
  geom_smooth(method = "lm", se = TRUE, color = "red", fill = "#69b3a2") +
  scale_x_continuous(limits = c(0, 100000)) +
  scale_y_continuous(limits = c(0, 100000)) +
  theme(legend.position = c(0.85, 0.85))
`geom_smooth()` using formula = 'y ~ x' Warning message: "Removed 23 rows containing non-finite values (`stat_smooth()`)." Warning message: "Removed 23 rows containing missing values (`geom_point()`)."
`geom_smooth()` using formula = 'y ~ x' Warning message: "Removed 23 rows containing non-finite values (`stat_smooth()`)." Warning message: "Removed 23 rows containing missing values (`geom_point()`)."
# Simple linear regression of Recovered on Confirmed, fitted on all rows of df.
confirmed_recovered_lm <- lm(Recovered ~ Confirmed, data = df)
summary(confirmed_recovered_lm)
Call: lm(formula = Recovered ~ Confirmed, data = df) Residuals: Min 1Q Median 3Q Max -615078 -11847 -11340 -5746 736621 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 1.186e+04 6.195e+03 1.914 0.0572 . Confirmed 4.496e-01 1.558e-02 28.853 <2e-16 *** --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 Residual standard error: 81440 on 180 degrees of freedom Multiple R-squared: 0.8222, Adjusted R-squared: 0.8212 F-statistic: 832.5 on 1 and 180 DF, p-value: < 2.2e-16
# Keep columns 2 to 14 only (drops the first and last columns of df).
df_num <- df[, 2:14]

# Correlation matrix: shown on screen first, then written to a PNG.
corrgram(
  df_num,
  order = TRUE,
  main = "Correlation matrix",
  upper.panel = panel.pie,
  lower.panel = panel.shade,
  text.panel = panel.txt
)
png("img/Correlation_matrix.png",
    width = 5, height = 5, units = "in", res = 300)
corrgram(
  df_num,
  order = TRUE,
  main = "Correlation matrix",
  upper.panel = panel.pie,
  lower.panel = panel.shade,
  text.panel = panel.txt
)
dev.off()
Let's try PCA!
# Recap of the column names and types before selecting the PCA inputs.
str(object = df)
'data.frame': 182 obs. of 15 variables: $ Country.Region : chr "Afghanistan" "Albania" "Algeria" "Andorra" ... $ Confirmed : int 36263 4880 27973 907 950 86 167416 37390 15303 20558 ... $ Deaths : int 1269 144 1163 52 41 3 3059 711 167 713 ... $ Recovered : int 25198 2745 18837 803 242 65 72575 26665 9311 18246 ... $ Active : int 9796 1991 7973 52 667 18 91782 10014 5825 1599 ... $ New.cases : int 106 117 616 10 18 4 4890 73 368 86 ... $ New.deaths : int 10 6 8 0 1 0 120 6 6 1 ... $ New.recovered : int 18 63 749 0 0 5 2057 187 137 37 ... $ Deaths...100.Cases : num 3.5 2.95 4.16 5.73 4.32 3.49 1.83 1.9 1.09 3.47 ... $ Recovered...100.Cases : num 69.5 56.2 67.3 88.5 25.5 ... $ Deaths...100.Recovered: num 5.04 5.25 6.17 6.48 16.94 ... $ Confirmed.last.week : int 35526 4171 23691 884 749 76 130774 34981 12428 19743 ... $ X1.week.change : int 737 709 4282 23 201 10 36642 2409 2875 815 ... $ X1.week...increase : num 2.07 17 18.07 2.6 26.84 ... $ WHO.Region : chr "Eastern Mediterranean" "Europe" "Africa" "Europe" ...
# Columns 2 to 14 of df, centred and scaled column-wise.
df_num <- df[, 2:14]
df_scaled <- scale(df_num)

# Eigen-decomposition of the correlation matrix of the scaled data.
corr_mat <- cor(df_scaled)
eig <- eigen(corr_mat)

# Proportion of variance explained (PVE) by each eigenvalue, then the running
# total.
PVE <- prop.table(eig$values)
PVE
cumsum(PVE)
# Principal component analysis on the already scaled data.
df_pca <- prcomp(df_scaled)

# Proportion of the total variance explained by each principal component.
variance <- df_pca$sdev^2 / sum(df_pca$sdev^2)

# Scree plot. Built with ggplot() because qplot() is deprecated since
# ggplot2 3.4.0; the component index comes from seq_along() so the plot follows
# the number of components instead of a hard-coded 1:13.
ggplot(
  data.frame(component = seq_along(variance), variance = variance),
  aes(x = component, y = variance)
) +
  geom_line() +
  geom_point(size = 4) +
  xlab("Principal Component") +
  ylab("Variance Explained") +
  ggtitle("Scree Plot") +
  ylim(0, 1)
Warning message: "`qplot()` was deprecated in ggplot2 3.4.0."
# Cumulative proportion of variance explained by the first 1, 2, ... components.
cum_variance <- cumsum(variance)

# Cumulative scree plot. Built with ggplot() because qplot() is deprecated
# since ggplot2 3.4.0; seq_along() replaces the hard-coded 1:13. The title and
# y label say "cumulative" so this plot is not mistaken for the per-component
# scree plot.
ggplot(
  data.frame(component = seq_along(cum_variance), cum_variance = cum_variance),
  aes(x = component, y = cum_variance)
) +
  geom_line() +
  geom_point(size = 4) +
  xlab("Principal Component") +
  ylab("Cumulative Variance Explained") +
  ggtitle("Cumulative Scree Plot") +
  ylim(0, 1)
# Kaiser rule: look at the variance of each principal component (squared
# standard deviation) and list the components whose variance is above 1.
kaiser <- (df_pca$sdev)^2
print(kaiser)
which(kaiser > 1)
[1] 8.020707e+00 1.602460e+00 1.360913e+00 6.636280e-01 4.938230e-01 [6] 4.153839e-01 2.592074e-01 1.154204e-01 4.700889e-02 1.691263e-02 [11] 4.536435e-03 2.061963e-31 1.867706e-33
# str(df_pca)

# Scores of the observations on the first two principal components.
pca_scores <- data.frame(df_pca$x[, 1:2])
plot(pca_scores)

# Biplot of the PCA: shown on screen first, then written to a PNG.
biplot(df_pca)
png("img/Biplot.png", width = 5, height = 5, units = "in", res = 300)
biplot(df_pca)
dev.off()
Let's try to cluster the data!
# K-means clustering of the scaled numeric columns into k groups.
k <- 6

# k-means starts from randomly chosen centres, so fix the seed to make the
# cluster labels (and every plot coloured by them) reproducible, and use
# several random starts so the result does not depend on one unlucky
# initialisation.
set.seed(42)
fit_kmeans <- kmeans(df_scaled, centers = k, nstart = 25)

# Cross-tabulate WHO region (rows) against assigned cluster (columns).
table(df$WHO.Region, fit_kmeans$cluster)
1 2 3 4 5 6 Africa 1 23 0 22 1 0 Americas 1 14 2 14 3 0 Eastern Mediterranean 0 13 0 6 2 0 Europe 0 33 0 13 6 2 South-East Asia 0 6 1 3 0 0 Western Pacific 1 12 0 3 0 0
# Confirmed-vs-Deaths scatter plot coloured by k-means cluster; axes limited
# to [0, 100000] x [0, 4000].
ggplot(
  data = df,
  mapping = aes(x = Confirmed, y = Deaths, color = factor(fit_kmeans$cluster))
) +
  geom_point(size = 3) +
  scale_x_continuous(limits = c(0, 100000)) +
  scale_y_continuous(limits = c(0, 4000)) +
  theme(legend.position = c(0.85, 0.85))
# Write the Confirmed-vs-Deaths scatter plot coloured by k-means cluster to a
# PNG; axes limited to [0, 100000] x [0, 4000].
png("img/Scatter_confirmed_vs_deaths_cluster.png",
    width = 5, height = 5, units = "in", res = 300)
ggplot(
  data = df,
  mapping = aes(x = Confirmed, y = Deaths, color = factor(fit_kmeans$cluster))
) +
  geom_point(size = 3) +
  scale_x_continuous(limits = c(0, 100000)) +
  scale_y_continuous(limits = c(0, 4000)) +
  theme(legend.position = c(0.85, 0.85))
dev.off()
Warning message: "Removed 28 rows containing missing values (`geom_point()`)." Warning message: "Removed 28 rows containing missing values (`geom_point()`)."
# Same Confirmed-vs-Deaths scatter plot, coloured by WHO region instead of
# cluster, for visual comparison; axes limited to [0, 100000] x [0, 4000].
ggplot(
  data = df,
  mapping = aes(x = Confirmed, y = Deaths, color = WHO.Region)
) +
  geom_point(size = 3) +
  scale_x_continuous(limits = c(0, 100000)) +
  scale_y_continuous(limits = c(0, 4000)) +
  theme(legend.position = c(0.85, 0.85))
Warning message: "Removed 28 rows containing missing values (`geom_point()`)."
# All principal-component scores as a data frame.
pca_dataframe <- data.frame(df_pca$x)

# Observations in the PC1/PC2 plane, coloured by k-means cluster.
ggplot(
  data = pca_dataframe,
  mapping = aes(x = PC1, y = PC2, color = factor(fit_kmeans$cluster))
) +
  geom_point(size = 3) +
  theme(legend.position = c(0.85, 0.85))
# Write the PC1/PC2 plot coloured by k-means cluster to a PNG.
png("img/biplot_cluster.png", width = 5, height = 5, units = "in", res = 300)
ggplot(
  data = pca_dataframe,
  mapping = aes(x = PC1, y = PC2, color = factor(fit_kmeans$cluster))
) +
  geom_point(size = 3) +
  theme(legend.position = c(0.85, 0.85))
dev.off()
# Observations in the PC1/PC2 plane, coloured by WHO region (taken from df,
# whose rows are in the same order as the PCA scores).
ggplot(
  data = pca_dataframe,
  mapping = aes(x = PC1, y = PC2, color = df$WHO.Region)
) +
  geom_point(size = 3) +
  theme(legend.position = c(0.85, 0.85))
# Write the PC1/PC2 plot coloured by WHO region to a PNG.
png("img/biplot_whoregion.png", width = 5, height = 5, units = "in", res = 300)
ggplot(
  data = pca_dataframe,
  mapping = aes(x = PC1, y = PC2, color = df$WHO.Region)
) +
  geom_point(size = 3) +
  theme(legend.position = c(0.85, 0.85))
dev.off()
Let's try a linear regressor
# Multiple linear regression of Deaths on the other numeric columns of df.
#
# X1.week.change is left out of the formula: it equals
# Confirmed - Confirmed.last.week, so it is perfectly collinear with predictors
# already in the model and lm() could only report NA for its coefficient
# ("not defined because of singularities"). Dropping it leaves the fitted
# values unchanged.
#
# NOTE(review): in this dataset Deaths = Confirmed - Recovered - Active, so
# with all three on the right-hand side the fit is an exact identity
# (coefficients 1, -1, -1 and R-squared of 1) rather than a predictive model.
# Consider removing Active (or Recovered) if a meaningful regression is wanted.
linear_model <- lm(
  formula = Deaths ~ Confirmed + Recovered + Active + New.cases + New.deaths +
    New.recovered + Deaths...100.Cases + Recovered...100.Cases +
    Deaths...100.Recovered + Confirmed.last.week + X1.week...increase,
  data = df
)
summary(linear_model)
Call: lm(formula = Deaths ~ Confirmed + Recovered + Active + New.cases + New.deaths + New.recovered + Deaths...100.Cases + Recovered...100.Cases + Deaths...100.Recovered + Confirmed.last.week + X1.week.change + X1.week...increase, data = df) Residuals: Min 1Q Median 3Q Max -3.938e-10 -1.841e-11 -1.111e-11 -5.690e-12 1.754e-09 Coefficients: (1 not defined because of singularities) Estimate Std. Error t value Pr(>|t|) (Intercept) 3.452e-11 4.622e-11 7.470e-01 0.456 Confirmed 1.000e+00 4.228e-15 2.365e+14 <2e-16 *** Recovered -1.000e+00 3.073e-15 -3.254e+14 <2e-16 *** Active -1.000e+00 3.133e-15 -3.191e+14 <2e-16 *** New.cases -5.663e-15 1.179e-14 -4.800e-01 0.632 New.deaths -4.424e-14 3.589e-13 -1.230e-01 0.902 New.recovered 1.051e-15 2.351e-14 4.500e-02 0.964 Deaths...100.Cases 7.179e-13 3.990e-12 1.800e-01 0.857 Recovered...100.Cases -1.770e-13 5.665e-13 -3.120e-01 0.755 Deaths...100.Recovered -5.731e-15 3.957e-14 -1.450e-01 0.885 Confirmed.last.week -9.233e-16 2.783e-15 -3.320e-01 0.740 X1.week.change NA NA NA NA X1.week...increase 5.668e-14 5.230e-13 1.080e-01 0.914 --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 Residual standard error: 1.502e-10 on 170 degrees of freedom Multiple R-squared: 1, Adjusted R-squared: 1 F-statistic: 1.486e+29 on 11 and 170 DF, p-value: < 2.2e-16