knitr::opts_chunk$set(message = FALSE, warning = FALSE,echo = TRUE,results = TRUE)

Download and import data

all plots are absolute values!
TODO: relative data, normalized by population

rm(list = ls())

workdir      = "G:/Dropbox/idx/topic/20210213T213259"
setwd(workdir)

XLSXFile <- "data.xlsx"

dataURL <- 'https://www.destatis.de/DE/Themen/Gesellschaft-Umwelt/Bevoelkerung/Sterbefaelle-Lebenserwartung/Tabellen/sonderauswertung-sterbefaelle.xlsx?__blob=publicationFile'

download.file(dataURL, destfile=XLSXFile,mode='wb')

df <- readxl::read_excel(XLSXFile,
                      sheet=4,
                      skip=8,
                      trim_ws=TRUE,
                      col_types = "text")
# head(df)

Prepare data set, extract date information

library(tidyverse)

# long format
df2 <- df %>%
  pivot_longer(cols = matches("\\d{2}.\\d{2}."),
               names_to = c("day"),
               values_to = "value",
               names_pattern =  "(\\d{2}.\\d{2}.)"
               )
#head(df2)

# create datestamps
df2$datestamp <- as.character(paste(df2$day,df2$...1,sep=""))

df2$POSIXct <- as.POSIXct(df2$datestamp,
                             format = "%d.%m.%Y", 
                             tz="CET")

# split datestamp
df2$year  <- format(df2$POSIXct,"%Y")
df2$month <- format(df2$POSIXct,"%m")
df2$day <- format(df2$POSIXct,"%d")
df2$monthDay <- format(df2$POSIXct,"%b-%d")
df2$month2 <- format(df2$POSIXct,"%b")
df2$week <- format(df2$POSIXct,"%W")

df2$DayOfYear <- as.numeric(format(df2$POSIXct, "%j"))
df2$CommonDate <- as.Date(paste0("2000-",format(df2$POSIXct, "%j")), "%Y-%j")

# delete unneeded cols
df2$Insgesamt <- NULL
df2$...1 <- NULL

# delete empty rows
df2 <- na.omit(df2)

Plot data over full time

library(ggplot2)
p<-ggplot(df2,aes(x=POSIXct, y=as.numeric(value))) +
  geom_point()

# interactive 
library(plotly)
ggplotly(p)

Plot data over year

library(scales)
p<-ggplot(df2,
       aes(x=CommonDate, 
           y=as.numeric(value)))+
  geom_line()+
  facet_grid(year~.)+
  scale_x_date(labels = date_format("%d-%b"), breaks = date_breaks("1 month")) 

# interactive
ggplotly(p)

Plot 2020 compared to previous years

library(lubridate)

ggplot(data=filter(df2,POSIXct >= as.Date('2016-01-01') & POSIXct <= as.Date('2019-12-31')  ),
            aes(x=CommonDate, 
           y=as.numeric(value)))+
  geom_point()+
  geom_smooth()+
  geom_line(data=filter(df2,POSIXct >= as.Date('2020-01-01') & POSIXct <= as.Date('2020-12-31')  ),
            aes(x=CommonDate, 
           y=as.numeric(value)),color="red")+
   scale_x_date(labels = date_format("%d-%b"), breaks = date_breaks("1 month"))

Compare 2018 with 2020

ggplot()+
  geom_line(data=filter(df2,POSIXct >= as.Date('2017-11-01') & POSIXct <= as.Date('2019-03-01')  ),
            aes(x=CommonDate, 
           y=as.numeric(value)))+
  geom_line(data=filter(df2,POSIXct >= as.Date('2019-11-01') & POSIXct <= as.Date('2021-03-01')  ),
            aes(x=CommonDate, 
           y=as.numeric(value)),color="red")+
   scale_x_date(labels = date_format("%d-%b"), breaks = date_breaks("1 month"))

Boxplot of daily deaths per month per year

dfmonth <- df2 %>% 
  group_by(month,year,day) %>%
  summarise(value = sum(as.numeric(value)))


b <- aes(x=as.factor(month), 
           y=as.numeric(value),color=year)

ggplot(data=filter(dfmonth,year!="2021"),
            aes(x=as.factor(month), 
           y=as.numeric(value)))+
     geom_point(b, position = position_jitterdodge())+
    geom_boxplot(b,alpha=0.5)

Weeklysums over years compared to average

df2$week2 <- as.numeric(df2$week)
df2$value2 <- as.numeric(df2$value)

# calculated weekly values
df3<-df2  %>%
  group_by(week2,year) %>%
  summarise(weekvalue = sum(value2))

# average per week all data
df4<-df3  %>%
  group_by(week2) %>%
  summarise(weekvalue = mean(weekvalue))

# ignore first and last weeks of calculated weekly values
df5 <- subset(df3,week2!=0& week2!=52 & week2!=53)
df6 <- rename(df5,year2=year)

# data without 2020 and 2021
df7 <- subset(df6,year2!=2020 & year2!=2021)


aes1 <- aes(x=week2, y=weekvalue,group=year)
aes2 <- aes(x=week2, y=weekvalue)

# plot actual weekly values with average weekly values
ggplot(na.omit(df5)) +
  geom_point(aes1)+
  facet_grid(year~.)+
  geom_smooth(aes1,method="loess",span = 0.3,formula = y~x)+
  stat_smooth(data=df6,aes2,stat='summary',fun.data=mean_cl_boot,color="black")+ # average with 2020, 2021
  stat_smooth(data=df7,aes2,stat='summary',fun.data=mean_cl_boot,color="red") # average without 2020, 2021

yearly cumsum

dfyearly<-df5  %>%
  group_by(year) %>%
  summarise(yearvalue = sum(weekvalue))

Estimation stats with dabestr

Weekly

library(dabestr)

dab <- dabest(df5, 
       year, weekvalue,
       idx = c("2016","2017","2018", "2019", "2020","2021"),
       paired = FALSE)



dab_median <- median_diff(dab,reps=5000) 


dab_median

## dabestr (Data Analysis with Bootstrap Estimation in R) v0.3.0
## =============================================================
## 
## Good evening!
## The current time is 22:10  on Dienstag Mai 18, 2021.
## 
## Dataset    :  df5
## X Variable :  year
## Y Variable :  weekvalue
## 
## Unpaired median difference of 2017 (n = 51) minus 2016 (n = 51)
##  -501 [95CI  -1370; 196]
## 
## Unpaired median difference of 2018 (n = 51) minus 2016 (n = 51)
##  -354 [95CI  -1410; 756]
## 
## Unpaired median difference of 2019 (n = 51) minus 2016 (n = 51)
##  272 [95CI  -596; 1040]
## 
## Unpaired median difference of 2020 (n = 51) minus 2016 (n = 51)
##  994 [95CI  -186; 2140]
## 
## Unpaired median difference of 2021 (n = 18) minus 2016 (n = 51)
##  1430 [95CI  473; 2900]
## 
## 
## 5000 bootstrap resamples.
## All confidence intervals are bias-corrected and accelerated.

plot(dab_median,float.contrast = FALSE)

Daily

dfdaily <- df2
dfdaily$value <- as.numeric(dfdaily$value)
dab <- dabest(dfdaily, 
       year, value,
       idx = c("2016","2017","2018", "2019", "2020","2021"),
       paired = FALSE)



dab_median <- median_diff(dab,reps=5000) 


dab_median

## dabestr (Data Analysis with Bootstrap Estimation in R) v0.3.0
## =============================================================
## 
## Good evening!
## The current time is 22:11  on Dienstag Mai 18, 2021.
## 
## Dataset    :  dfdaily
## X Variable :  year
## Y Variable :  value
## 
## Unpaired median difference of 2017 (n = 365) minus 2016 (n = 366)
##  -35 [95CI  -78; 9]
## 
## Unpaired median difference of 2018 (n = 365) minus 2016 (n = 366)
##  0 [95CI  -45; 54]
## 
## Unpaired median difference of 2019 (n = 365) minus 2016 (n = 366)
##  58 [95CI  15.5; 101]
## 
## Unpaired median difference of 2020 (n = 366) minus 2016 (n = 366)
##  150 [95CI  82.5; 207]
## 
## Unpaired median difference of 2021 (n = 129) minus 2016 (n = 366)
##  253 [95CI  202; 386]
## 
## 
## 5000 bootstrap resamples.
## All confidence intervals are bias-corrected and accelerated.

plot(dab_median,float.contrast = FALSE)

Normalise to total population!

R: Date series exercise zur Sterblichkeit

2021

Download and import data

Prepare data set, extract date information

Plot data over full time

Plot data over year

Plot 2020 compared to previous years

Compare 2018 with 2020

Boxplot of daily deaths per month per year

Weeklysums over years compared to average

yearly cumsum

Estimation stats with dabestr

Weekly

Daily

Normalise to total population!