knitr::opts_chunk$set(message = FALSE, warning = FALSE,echo = TRUE,results = TRUE)

Download and import data

rm(list = ls())

workdir      = "G:/Dropbox/idx/topic/20210213T213259"
setwd(workdir)

XLSXFile <- "data.xlsx"

dataURL <- 'https://www.destatis.de/DE/Themen/Gesellschaft-Umwelt/Bevoelkerung/Sterbefaelle-Lebenserwartung/Tabellen/sonderauswertung-sterbefaelle.xlsx?__blob=publicationFile'

download.file(dataURL, destfile=XLSXFile,mode='wb')

df <- readxl::read_excel(XLSXFile,
                      sheet=4,
                      skip=8,
                      trim_ws=TRUE,
                      col_types = "text")
# head(df)

Prepare data set, extract date information

library(tidyverse)

# long format
df2 <- df %>%
  pivot_longer(cols = matches("\\d{2}.\\d{2}."),
               names_to = c("day"),
               values_to = "value",
               names_pattern =  "(\\d{2}.\\d{2}.)"
               )
#head(df2)

# create datestamps
df2$datestamp <- as.character(paste(df2$day,df2$...1,sep=""))

df2$POSIXct <- as.POSIXct(df2$datestamp,
                             format = "%d.%m.%Y", 
                             tz="CET")

# split datestamp
df2$year  <- format(df2$POSIXct,"%Y")
df2$month <- format(df2$POSIXct,"%m")
df2$day <- format(df2$POSIXct,"%d")
df2$monthDay <- format(df2$POSIXct,"%b-%d")
df2$month2 <- format(df2$POSIXct,"%b")
df2$week <- format(df2$POSIXct,"%W")

df2$DayOfYear <- as.numeric(format(df2$POSIXct, "%j"))
df2$CommonDate <- as.Date(paste0("2000-",format(df2$POSIXct, "%j")), "%Y-%j")

# delete unneeded cols
df2$Insgesamt <- NULL
df2$...1 <- NULL

# delete empty rows
df2 <- na.omit(df2)

Plot data over full time

library(ggplot2)
p<-ggplot(df2,aes(x=POSIXct, y=as.numeric(value))) +
  geom_point()

# interactive 
library(plotly)
ggplotly(p)

Plot data over year

library(scales)
p<-ggplot(df2,
       aes(x=CommonDate, 
           y=as.numeric(value)))+
  geom_line()+
  facet_grid(year~.)+
  scale_x_date(labels = date_format("%d-%b"), breaks = date_breaks("1 month")) 

# interactive
ggplotly(p)

Plot 2020 compared to previous years

library(lubridate)

ggplot(data=filter(df2,POSIXct >= as.Date('2016-01-01') & POSIXct <= as.Date('2019-12-31')  ),
            aes(x=CommonDate, 
           y=as.numeric(value)))+
  geom_point()+
  geom_smooth()+
  geom_line(data=filter(df2,POSIXct >= as.Date('2020-01-01') & POSIXct <= as.Date('2020-12-31')  ),
            aes(x=CommonDate, 
           y=as.numeric(value)),color="red")+
   scale_x_date(labels = date_format("%d-%b"), breaks = date_breaks("1 month")) 

Compare 2018 with 2020

ggplot()+
  geom_line(data=filter(df2,POSIXct >= as.Date('2017-11-01') & POSIXct <= as.Date('2019-03-01')  ),
            aes(x=CommonDate, 
           y=as.numeric(value)))+
  geom_line(data=filter(df2,POSIXct >= as.Date('2019-11-01') & POSIXct <= as.Date('2021-03-01')  ),
            aes(x=CommonDate, 
           y=as.numeric(value)),color="red")+
   scale_x_date(labels = date_format("%d-%b"), breaks = date_breaks("1 month")) 

Boxplot of daily deaths per month per year

dfmonth <- df2 %>% 
  group_by(month,year,day) %>%
  summarise(value = sum(as.numeric(value)))


b <- aes(x=as.factor(month), 
           y=as.numeric(value),color=year)

ggplot(data=filter(dfmonth,year!="2021"),
            aes(x=as.factor(month), 
           y=as.numeric(value)))+
     geom_point(b, position = position_jitterdodge())+
    geom_boxplot(b,alpha=0.5)

Weeklysums over years compared to average

df2$week2 <- as.numeric(df2$week)
df2$value2 <- as.numeric(df2$value)

# calculated weekly values
df3<-df2  %>%
  group_by(week2,year) %>%
  summarise(weekvalue = sum(value2))

# average per week all data
df4<-df3  %>%
  group_by(week2) %>%
  summarise(weekvalue = mean(weekvalue))

# ignore first and last weeks of calculated weekly values
df5 <- subset(df3,week2!=0& week2!=52 & week2!=53)
df6 <- rename(df5,year2=year)

# data without 2020 and 2021
df7 <- subset(df6,year2!=2020 & year2!=2021)


aes1 <- aes(x=week2, y=weekvalue,group=year)
aes2 <- aes(x=week2, y=weekvalue)

# plot actual weekly values with average weekly values
ggplot(na.omit(df5)) +
  geom_point(aes1)+
  facet_grid(year~.)+
  geom_smooth(aes1,method="loess",span = 0.3,formula = y~x)+
  stat_smooth(data=df6,aes2,stat='summary',fun.data=mean_cl_boot,color="black")+ # average with 2020, 2021
  stat_smooth(data=df7,aes2,stat='summary',fun.data=mean_cl_boot,color="red") # average without 2020, 2021

yearly cumsum

dfyearly<-df5  %>%
  group_by(year) %>%
  summarise(yearvalue = sum(weekvalue))

Estimation stats with dabestr

Weekly

library(dabestr)

dab <- dabest(df5, 
       year, weekvalue,
       idx = c("2016","2017","2018", "2019", "2020","2021"),
       paired = FALSE)



dab_median <- median_diff(dab,reps=5000) 


dab_median
## dabestr (Data Analysis with Bootstrap Estimation in R) v0.3.0
## =============================================================
## 
## Good evening!
## The current time is 22:10  on Dienstag Mai 18, 2021.
## 
## Dataset    :  df5
## X Variable :  year
## Y Variable :  weekvalue
## 
## Unpaired median difference of 2017 (n = 51) minus 2016 (n = 51)
##  -501 [95CI  -1370; 196]
## 
## Unpaired median difference of 2018 (n = 51) minus 2016 (n = 51)
##  -354 [95CI  -1410; 756]
## 
## Unpaired median difference of 2019 (n = 51) minus 2016 (n = 51)
##  272 [95CI  -596; 1040]
## 
## Unpaired median difference of 2020 (n = 51) minus 2016 (n = 51)
##  994 [95CI  -186; 2140]
## 
## Unpaired median difference of 2021 (n = 18) minus 2016 (n = 51)
##  1430 [95CI  473; 2900]
## 
## 
## 5000 bootstrap resamples.
## All confidence intervals are bias-corrected and accelerated.
plot(dab_median,float.contrast = FALSE)

Daily

dfdaily <- df2
dfdaily$value <- as.numeric(dfdaily$value)
dab <- dabest(dfdaily, 
       year, value,
       idx = c("2016","2017","2018", "2019", "2020","2021"),
       paired = FALSE)



dab_median <- median_diff(dab,reps=5000) 


dab_median
## dabestr (Data Analysis with Bootstrap Estimation in R) v0.3.0
## =============================================================
## 
## Good evening!
## The current time is 22:11  on Dienstag Mai 18, 2021.
## 
## Dataset    :  dfdaily
## X Variable :  year
## Y Variable :  value
## 
## Unpaired median difference of 2017 (n = 365) minus 2016 (n = 366)
##  -35 [95CI  -78; 9]
## 
## Unpaired median difference of 2018 (n = 365) minus 2016 (n = 366)
##  0 [95CI  -45; 54]
## 
## Unpaired median difference of 2019 (n = 365) minus 2016 (n = 366)
##  58 [95CI  15.5; 101]
## 
## Unpaired median difference of 2020 (n = 366) minus 2016 (n = 366)
##  150 [95CI  82.5; 207]
## 
## Unpaired median difference of 2021 (n = 129) minus 2016 (n = 366)
##  253 [95CI  202; 386]
## 
## 
## 5000 bootstrap resamples.
## All confidence intervals are bias-corrected and accelerated.
plot(dab_median,float.contrast = FALSE)

Normalise to total population!