TN Excess deaths vs Official COVID deaths

################################################################################
### Pull mortality data from the CDC and graph excess death - /u/MetricT
################################################################################

library(tidyverse)
library(janitor)
library(cowplot)
library(lubridate)
library(tsibble)
library(geofacet)
library(cowplot)
library(zoo)
library(forecast)
library(broom)

# Load data on finalized (2014-2018) and provisional (2019-2020) mortality
finalized_ss   <- read_csv("https://data.cdc.gov/api/views/3yf8-kanr/rows.csv?accessType=DOWNLOAD")
provisional_ss <- read_csv("https://data.cdc.gov/api/views/muzy-jte6/rows.csv?accessType=DOWNLOAD")


# Take the data and clean it up a bit
finalized   <- finalized_ss   %>% clean_names() %>% select("jurisdiction_of_occurrence", "mmwr_year", "mmwr_week", "all_cause")
provisional <- provisional_ss %>% clean_names() %>% select("jurisdiction_of_occurrence", "mmwr_year", "mmwr_week", "all_cause")


# Combine both datasets and filter the subset of states we want to look at
deaths <-
  finalized %>%
  bind_rows(provisional) %>%
  rename(year = mmwr_year,
         week = mmwr_week,
         deaths = all_cause,
         state = jurisdiction_of_occurrence) %>%

  filter(state %in% c("Tennessee")) %>%

  # Quick hack to remove "week 53" in 2014 because yearweek() can't handle it
  # This wouldn't be necessary if the spreadsheet included dates, it's an aliasing problem
  filter(week != 53) %>%

  # Turn the year and week into a yearweek() datatype
  mutate(yearweek = yearweek(date_decimal(year + (week - 1) / 52))) %>%
  select(yearweek, deaths) %>%
  arrange(yearweek) %>%

  # Remove the last week's worth of data since TN lags in sending mortality data to CDC
  filter(row_number() <= n() - 1)

# Use a feasts STL() model to seasonally-adjust the deaths data
deaths_mod <-
  deaths %>%
  as_tsibble(index = "yearweek") %>%
  fill_gaps() %>%
  mutate_if(is.numeric, ~ na.locf(.)) %>%
  model(STL(deaths ~ trend() + season(window = "periodic"))) %>%
  components()


# Add the STL data values to our original death tibble
deaths <-
  deaths %>%
  left_join(deaths_mod %>% as_tibble() %>% select(-.model, -deaths), by = "yearweek") %>%

  # Ditch the "yearweek" at this point by converting it into a regular date.
  # "yearweek" was necessary for the tsibble model, but it's a weird way of
  # tracking date that causes weird glitches.  For example, week("2014 W01") = 52.
  mutate(date = as.Date(yearweek)) %>%

  # Add a decimal date to make linear regression in the next step easier
  mutate(decimal_date = decimal_date(date))


# Compute a linear regression using data from 2014-2019 (before COVID hit) so
# we can subtract off deaths due to population/population growth.
lin_reg <-
  deaths %>%
  filter(date < as.Date("2020-01-01")) %>%
  lm(season_adjust ~ decimal_date, data = .) %>%
  tidy() %>%
  pull("estimate")


# Add a "growth" term based on that regression, so we can subtract growth off and get excess deaths
deaths <-
  deaths %>%
  mutate(growth = lin_reg[1] + lin_reg[2] * decimal_date) %>%
  mutate(excess_deaths = season_adjust - growth) %>%
  select(yearweek, date, decimal_date, deaths, trend, season_year, remainder, season_adjust, growth, excess_deaths) %>%
  pivot_longer(-c(yearweek, date, decimal_date), names_to = "series", values_to = "value")


### Fetch total/new official deaths from TN DoH so we can graph against excess deaths

read_excel_url <- function(url, ...) {
  tf <- tempfile(fileext = ".xlsx")
  curl::curl_download(url, tf)
  return(readxl::read_excel(tf, ...))
}

official_deaths <-
  "https://www.tn.gov/content/dam/tn/health/documents/cedep/novel-coronavirus/datasets/Public-Dataset-Daily-Case-Info.XLSX" %>%
  read_excel_url(col_types = NULL) %>%
  select(DATE, NEW_DEATHS, TOTAL_DEATHS) %>%
  rename(date         = DATE,
         new_deaths   = NEW_DEATHS,
         total_deaths = TOTAL_DEATHS) %>%
  mutate(date = as.Date(date),
         yearweek = yearweek(date)) %>%
  select(yearweek, new_deaths, total_deaths) %>%
  group_by(yearweek) %>%
  summarize(new_deaths = sum(new_deaths),
            total_deaths = max(total_deaths)) %>%
  ungroup() %>%
  mutate(date = as.Date(yearweek)) %>%
  select(yearweek, date, new_deaths, total_deaths)


# Excess deaths since COVID arrived
g_excess_deaths_covid_era <-
  deaths %>%
  filter(series %in% c("excess_deaths")) %>%
  filter(date >= as.Date("2020-01-01"),
         date <  as.Date("2021-07-01")) %>%
  ggplot() +
  theme_bw() +
  geom_line(aes(x = date, y = value)) +
  geom_line(data = official_deaths, aes(x = date, y = new_deaths), color = "darkblue", linetype = "dashed") +
  geom_hline(yintercept = 0) + #, linetype = "dashed") +
  geom_vline(xintercept = as.Date("2020-03-20"), color = "darkred", linetype = "dotted") +
  annotate("text", size = 4, label = "1st Official TN COVID Death - 2020-03-20", x = as.Date("2020-03-12"), y = 500, angle = "90", color = "darkred") +
  scale_x_date(breaks = pretty_breaks(10)) +
  scale_y_continuous(breaks = pretty_breaks(10), labels = scales::comma) +
  labs(x = "", y = "New Excess Deaths per Week",
       title = "Official Weekly COVID deaths (blue) vs Weekly Excess Deaths in Tennessee, Jan 2020 - Jun 2021 (black)",
       #caption = "Mortality data to compute Excess Deaths from CDC.gov\nOfficial COVID Deaths from Tennessee Dept. of Health"
       )
print(g_excess_deaths_covid_era)


# Cumulative Excess deaths since COVID arrived
g_tot_excess_deaths_covid_era <-
  deaths %>%
  filter(series %in% c("excess_deaths")) %>%
  filter(date >= as.Date("2020-01-01"),
         date <  as.Date("2021-07-01")) %>%
  ggplot() +
  theme_bw() +
  geom_line(aes(x = date, y = cumsum(value))) +
  geom_line(data = official_deaths, aes(x = date, y = cumsum(new_deaths)), color = "darkblue", linetype = "dashed") +
  geom_hline(yintercept = 0) +
  geom_vline(xintercept = as.Date("2020-03-20"), color = "darkred", linetype = "dotted") +
  annotate("text", size = 4, label = "1st Official TN COVID Death - 2020-03-20", x = as.Date("2020-03-12"), y = 10000, angle = "90", color = "darkred") +
  scale_x_date(breaks = pretty_breaks(10)) +
  scale_y_continuous(breaks = pretty_breaks(10), labels = scales::comma) +
  labs(x = "", y = "Cumulative Excess Deaths",
       title = "Cumulative Official Weekly COVID deaths (blue) vs Weekly Excess Deaths in Tennessee, Jan 2020 - Jun 2021 (black)",
       caption = "Mortality data to compute Excess Deaths from CDC.gov\nOfficial COVID Deaths from Tennessee Dept. of Health")
print(g_tot_excess_deaths_covid_era)

plot_grid(g_excess_deaths_covid_era, g_tot_excess_deaths_covid_era, nrow = 2, ncol = 1, align = "hv")