Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- ################################################################################
- ### Pull mortality data from the CDC and graph excess death - /u/MetricT
- ################################################################################
- library(tidyverse)
- library(janitor)
- library(cowplot)
- library(lubridate)
- library(tsibble)
- library(geofacet)
- library(cowplot)
- library(zoo)
- library(forecast)
- library(broom)
- # Load data on finalized (2014-2018) and provisional (2019-2020) mortality
- finalized_ss <- read_csv("https://data.cdc.gov/api/views/3yf8-kanr/rows.csv?accessType=DOWNLOAD")
- provisional_ss <- read_csv("https://data.cdc.gov/api/views/muzy-jte6/rows.csv?accessType=DOWNLOAD")
- # Take the data and clean it up a bit
- finalized <- finalized_ss %>% clean_names() %>% select("jurisdiction_of_occurrence", "mmwr_year", "mmwr_week", "all_cause")
- provisional <- provisional_ss %>% clean_names() %>% select("jurisdiction_of_occurrence", "mmwr_year", "mmwr_week", "all_cause")
- # Combine both datasets and filter the subset of states we want to look at
- deaths <-
- finalized %>%
- bind_rows(provisional) %>%
- rename(year = mmwr_year,
- week = mmwr_week,
- deaths = all_cause,
- state = jurisdiction_of_occurrence) %>%
- filter(state %in% c("Tennessee")) %>%
- # Quick hack to remove "week 53" in 2014 because yearweek() can't handle it
- # This wouldn't be necessary if the spreadsheet included dates, it's an aliasing problem
- filter(week != 53) %>%
- # Turn the year and week into a yearweek() datatype
- mutate(yearweek = yearweek(date_decimal(year + (week - 1) / 52))) %>%
- select(yearweek, deaths) %>%
- arrange(yearweek) %>%
- # Remove the last week's worth of data since TN lags in sending mortality data to CDC
- filter(row_number() <= n() - 1)
- # Use a feasts STL() model to seasonally-adjust the deaths data
- deaths_mod <-
- deaths %>%
- as_tsibble(index = "yearweek") %>%
- fill_gaps() %>%
- mutate_if(is.numeric, ~ na.locf(.)) %>%
- model(STL(deaths ~ trend() + season(window = "periodic"))) %>%
- components()
- # Add the STL data values to our original death tibble
- deaths <-
- deaths %>%
- left_join(deaths_mod %>% as_tibble() %>% select(-.model, -deaths), by = "yearweek") %>%
- # Ditch the "yearweek" at this point by converting it into a regular date.
- # "yearweek" was necessary for the tsibble model, but it's a weird way of
- # tracking date that causes weird glitches. For example, week("2014 W01") = 52.
- mutate(date = as.Date(yearweek)) %>%
- # Add a decimal date to make linear regression in the next step easier
- mutate(decimal_date = decimal_date(date))
- # Compute a linear regression using data from 2014-2019 (before COVID hit) so
- # we can subtract off deaths due to population/population growth.
- lin_reg <-
- deaths %>%
- filter(date < as.Date("2020-01-01")) %>%
- lm(season_adjust ~ decimal_date, data = .) %>%
- tidy() %>%
- pull("estimate")
- # Add a "growth" term based on that regression, so we can subtract growth off and get excess deaths
- deaths <-
- deaths %>%
- mutate(growth = lin_reg[1] + lin_reg[2] * decimal_date) %>%
- mutate(excess_deaths = season_adjust - growth) %>%
- select(yearweek, date, decimal_date, deaths, trend, season_year, remainder, season_adjust, growth, excess_deaths) %>%
- pivot_longer(-c(yearweek, date, decimal_date), names_to = "series", values_to = "value")
- ### Fetch total/new official deaths from TN DoH so we can graph against excess deaths
- read_excel_url <- function(url, ...) {
- tf <- tempfile(fileext = ".xlsx")
- curl::curl_download(url, tf)
- return(readxl::read_excel(tf, ...))
- }
- official_deaths <-
- "https://www.tn.gov/content/dam/tn/health/documents/cedep/novel-coronavirus/datasets/Public-Dataset-Daily-Case-Info.XLSX" %>%
- read_excel_url(col_types = NULL) %>%
- select(DATE, NEW_DEATHS, TOTAL_DEATHS) %>%
- rename(date = DATE,
- new_deaths = NEW_DEATHS,
- total_deaths = TOTAL_DEATHS) %>%
- mutate(date = as.Date(date),
- yearweek = yearweek(date)) %>%
- select(yearweek, new_deaths, total_deaths) %>%
- group_by(yearweek) %>%
- summarize(new_deaths = sum(new_deaths),
- total_deaths = max(total_deaths)) %>%
- ungroup() %>%
- mutate(date = as.Date(yearweek)) %>%
- select(yearweek, date, new_deaths, total_deaths)
- # Excess deaths since COVID arrived
- g_excess_deaths_covid_era <-
- deaths %>%
- filter(series %in% c("excess_deaths")) %>%
- filter(date >= as.Date("2020-01-01"),
- date < as.Date("2021-07-01")) %>%
- ggplot() +
- theme_bw() +
- geom_line(aes(x = date, y = value)) +
- geom_line(data = official_deaths, aes(x = date, y = new_deaths), color = "darkblue", linetype = "dashed") +
- geom_hline(yintercept = 0) + #, linetype = "dashed") +
- geom_vline(xintercept = as.Date("2020-03-20"), color = "darkred", linetype = "dotted") +
- annotate("text", size = 4, label = "1st Official TN COVID Death - 2020-03-20", x = as.Date("2020-03-12"), y = 500, angle = "90", color = "darkred") +
- scale_x_date(breaks = pretty_breaks(10)) +
- scale_y_continuous(breaks = pretty_breaks(10), labels = scales::comma) +
- labs(x = "", y = "New Excess Deaths per Week",
- title = "Official Weekly COVID deaths (blue) vs Weekly Excess Deaths in Tennessee, Jan 2020 - Jun 2021 (black)",
- #caption = "Mortality data to compute Excess Deaths from CDC.gov\nOfficial COVID Deaths from Tennessee Dept. of Health"
- )
- print(g_excess_deaths_covid_era)
- # Cumulative Excess deaths since COVID arrived
- g_tot_excess_deaths_covid_era <-
- deaths %>%
- filter(series %in% c("excess_deaths")) %>%
- filter(date >= as.Date("2020-01-01"),
- date < as.Date("2021-07-01")) %>%
- ggplot() +
- theme_bw() +
- geom_line(aes(x = date, y = cumsum(value))) +
- geom_line(data = official_deaths, aes(x = date, y = cumsum(new_deaths)), color = "darkblue", linetype = "dashed") +
- geom_hline(yintercept = 0) +
- geom_vline(xintercept = as.Date("2020-03-20"), color = "darkred", linetype = "dotted") +
- annotate("text", size = 4, label = "1st Official TN COVID Death - 2020-03-20", x = as.Date("2020-03-12"), y = 10000, angle = "90", color = "darkred") +
- scale_x_date(breaks = pretty_breaks(10)) +
- scale_y_continuous(breaks = pretty_breaks(10), labels = scales::comma) +
- labs(x = "", y = "Cumulative Excess Deaths",
- title = "Cumulative Official Weekly COVID deaths (blue) vs Weekly Excess Deaths in Tennessee, Jan 2020 - Jun 2021 (black)",
- caption = "Mortality data to compute Excess Deaths from CDC.gov\nOfficial COVID Deaths from Tennessee Dept. of Health")
- print(g_tot_excess_deaths_covid_era)
- plot_grid(g_excess_deaths_covid_era, g_tot_excess_deaths_covid_era, nrow = 2, ncol = 1, align = "hv")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement