knitr::opts_chunk$set(message = FALSE, warning = FALSE,echo = TRUE,results = TRUE)
Download and import data
- all plots are absolute values!
- TODO: relative data, normalized by population
rm(list = ls())
workdir = "G:/Dropbox/idx/topic/20210213T213259"
setwd(workdir)
XLSXFile <- "data.xlsx"
dataURL <- 'https://www.destatis.de/DE/Themen/Gesellschaft-Umwelt/Bevoelkerung/Sterbefaelle-Lebenserwartung/Tabellen/sonderauswertung-sterbefaelle.xlsx?__blob=publicationFile'
download.file(dataURL, destfile=XLSXFile,mode='wb')
df <- readxl::read_excel(XLSXFile,
sheet=4,
skip=8,
trim_ws=TRUE,
col_types = "text")
# head(df)
Plot data over full time
library(ggplot2)
p<-ggplot(df2,aes(x=POSIXct, y=as.numeric(value))) +
geom_point()
# interactive
library(plotly)
ggplotly(p)
Plot data over year
library(scales)
p<-ggplot(df2,
aes(x=CommonDate,
y=as.numeric(value)))+
geom_line()+
facet_grid(year~.)+
scale_x_date(labels = date_format("%d-%b"), breaks = date_breaks("1 month"))
# interactive
ggplotly(p)
Plot 2020 compared to previous years
library(lubridate)
ggplot(data=filter(df2,POSIXct >= as.Date('2016-01-01') & POSIXct <= as.Date('2019-12-31') ),
aes(x=CommonDate,
y=as.numeric(value)))+
geom_point()+
geom_smooth()+
geom_line(data=filter(df2,POSIXct >= as.Date('2020-01-01') & POSIXct <= as.Date('2020-12-31') ),
aes(x=CommonDate,
y=as.numeric(value)),color="red")+
scale_x_date(labels = date_format("%d-%b"), breaks = date_breaks("1 month"))
Compare 2018 with 2020
ggplot()+
geom_line(data=filter(df2,POSIXct >= as.Date('2017-11-01') & POSIXct <= as.Date('2019-03-01') ),
aes(x=CommonDate,
y=as.numeric(value)))+
geom_line(data=filter(df2,POSIXct >= as.Date('2019-11-01') & POSIXct <= as.Date('2021-03-01') ),
aes(x=CommonDate,
y=as.numeric(value)),color="red")+
scale_x_date(labels = date_format("%d-%b"), breaks = date_breaks("1 month"))
Boxplot of daily deaths per month per year
dfmonth <- df2 %>%
group_by(month,year,day) %>%
summarise(value = sum(as.numeric(value)))
b <- aes(x=as.factor(month),
y=as.numeric(value),color=year)
ggplot(data=filter(dfmonth,year!="2021"),
aes(x=as.factor(month),
y=as.numeric(value)))+
geom_point(b, position = position_jitterdodge())+
geom_boxplot(b,alpha=0.5)
Weeklysums over years compared to average
df2$week2 <- as.numeric(df2$week)
df2$value2 <- as.numeric(df2$value)
# calculated weekly values
df3<-df2 %>%
group_by(week2,year) %>%
summarise(weekvalue = sum(value2))
# average per week all data
df4<-df3 %>%
group_by(week2) %>%
summarise(weekvalue = mean(weekvalue))
# ignore first and last weeks of calculated weekly values
df5 <- subset(df3,week2!=0& week2!=52 & week2!=53)
df6 <- rename(df5,year2=year)
# data without 2020 and 2021
df7 <- subset(df6,year2!=2020 & year2!=2021)
aes1 <- aes(x=week2, y=weekvalue,group=year)
aes2 <- aes(x=week2, y=weekvalue)
# plot actual weekly values with average weekly values
ggplot(na.omit(df5)) +
geom_point(aes1)+
facet_grid(year~.)+
geom_smooth(aes1,method="loess",span = 0.3,formula = y~x)+
stat_smooth(data=df6,aes2,stat='summary',fun.data=mean_cl_boot,color="black")+ # average with 2020, 2021
stat_smooth(data=df7,aes2,stat='summary',fun.data=mean_cl_boot,color="red") # average without 2020, 2021
yearly cumsum
dfyearly<-df5 %>%
group_by(year) %>%
summarise(yearvalue = sum(weekvalue))
Estimation stats with dabestr
Weekly
library(dabestr)
dab <- dabest(df5,
year, weekvalue,
idx = c("2016","2017","2018", "2019", "2020","2021"),
paired = FALSE)
dab_median <- median_diff(dab,reps=5000)
dab_median
## dabestr (Data Analysis with Bootstrap Estimation in R) v0.3.0
## =============================================================
##
## Good evening!
## The current time is 22:10 on Dienstag Mai 18, 2021.
##
## Dataset : df5
## X Variable : year
## Y Variable : weekvalue
##
## Unpaired median difference of 2017 (n = 51) minus 2016 (n = 51)
## -501 [95CI -1370; 196]
##
## Unpaired median difference of 2018 (n = 51) minus 2016 (n = 51)
## -354 [95CI -1410; 756]
##
## Unpaired median difference of 2019 (n = 51) minus 2016 (n = 51)
## 272 [95CI -596; 1040]
##
## Unpaired median difference of 2020 (n = 51) minus 2016 (n = 51)
## 994 [95CI -186; 2140]
##
## Unpaired median difference of 2021 (n = 18) minus 2016 (n = 51)
## 1430 [95CI 473; 2900]
##
##
## 5000 bootstrap resamples.
## All confidence intervals are bias-corrected and accelerated.
plot(dab_median,float.contrast = FALSE)
Daily
dfdaily <- df2
dfdaily$value <- as.numeric(dfdaily$value)
dab <- dabest(dfdaily,
year, value,
idx = c("2016","2017","2018", "2019", "2020","2021"),
paired = FALSE)
dab_median <- median_diff(dab,reps=5000)
dab_median
## dabestr (Data Analysis with Bootstrap Estimation in R) v0.3.0
## =============================================================
##
## Good evening!
## The current time is 22:11 on Dienstag Mai 18, 2021.
##
## Dataset : dfdaily
## X Variable : year
## Y Variable : value
##
## Unpaired median difference of 2017 (n = 365) minus 2016 (n = 366)
## -35 [95CI -78; 9]
##
## Unpaired median difference of 2018 (n = 365) minus 2016 (n = 366)
## 0 [95CI -45; 54]
##
## Unpaired median difference of 2019 (n = 365) minus 2016 (n = 366)
## 58 [95CI 15.5; 101]
##
## Unpaired median difference of 2020 (n = 366) minus 2016 (n = 366)
## 150 [95CI 82.5; 207]
##
## Unpaired median difference of 2021 (n = 129) minus 2016 (n = 366)
## 253 [95CI 202; 386]
##
##
## 5000 bootstrap resamples.
## All confidence intervals are bias-corrected and accelerated.
plot(dab_median,float.contrast = FALSE)
Normalise to total population!