Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- ### Idea from: https://www.reddit.com/r/dataisbeautiful/comments/i1sedr/oc_united_states_covid19_deaths_curve_with/
- ### Implementation by /u/MetricT
- library(tidyverse)
- library(tidycensus)
- library(tigris)
- library(forecast)
- library(naniar)
- library(useful)
- library(TTR)
- library(politicaldata)
- ### Download this Github repo and add the path to the spreadsheet
- ### https://github.com/nytimes/covid-19-data
- # Load the spreadsheet (which contains total cases/deaths) and compute
- # new cases/deaths
- spreadsheet <-
- "../Datasets/nytimes/covid-19-data/us-counties.csv" %>%
- read_csv(col_names = TRUE, col_types = "Dcccdd") %>%
- mutate(state_fips = substr(fips, 1, 2)) %>%
- filter(!state_fips %in% c(69, 72, 78)) %>%
- filter(!is.na(state_fips)) %>%
- select(date, fips, cases, deaths) %>%
- arrange(date) %>%
- pivot_wider(id_cols = "date",
- names_from = "fips",
- values_from = c("cases", "deaths")) %>%
- mutate_if(is.numeric, ~ (replace_na(., 0))) %>%
- mutate(across(-date, ~ . - lag(.))) %>%
- mutate_if(is.numeric, ~ (replace_na(., 0))) %>%
- mutate("cases_53061" = if_else(date == as.Date("2020-01-21"), 1, `cases_53061`)) %>%
- pivot_longer(-date, names_to = c("type", "fips"), names_sep = "_", values_to = "values") %>%
- pivot_wider(id_cols = c("date", "fips"), names_from = "type", values_from = "values")
- current_date <- spreadsheet %>% arrange(date) %>% tail(n = 1) %>% pull("date")
- ### Download this Github repo and add the path to the spreadsheet
- ### https://github.com/tonmcg/US_County_Level_Election_Results_08-16
- pres_2016 <-
- "../Datasets/US_County_Level_Election_Results_08-16/2016_US_County_Level_Presidential_Results.csv" %>%
- read_csv(col_names = TRUE, col_types = "ddddddncccc") %>%
- rename(index = X1,
- fips = combined_fips,
- dem = per_dem,
- rep = per_gop) %>%
- mutate(fips = str_pad(fips, 5, pad = "0"),
- year = 2016,
- other = 1 - dem - rep) %>%
- mutate(fips = if_else(fips == "46113", "46102", fips)) %>%
- mutate(fips = if_else(fips == "02270", "02158", fips)) %>%
- rowwise() %>%
- mutate(winner = max(dem, rep, other)) %>%
- ungroup() %>%
- mutate(winner = case_when(
- winner == dem ~ "Democrat",
- winner == rep ~ "Republican",
- winner == other ~ "Other")) %>%
- mutate(margin = rep - dem) %>%
- select(year, total_votes, dem, rep, other, winner, margin, fips)
- population <-
- get_acs(geography = "county",
- variables = c("B01003_001"),
- year = 2016,
- geometry = FALSE,
- cache_table = TRUE) %>%
- rename(population = estimate) %>%
- select(GEOID, population)
- pres_2016 <-
- pres_2016 %>%
- left_join(population, by = c("fips"= "GEOID")) %>%
- mutate(voter_turnout = total_votes / population)
- pres_2016 <-
- pres_2016 %>%
- arrange(desc(margin)) %>%
- mutate(cum_population = cumsum(population),
- percent = population / 322087547,
- cum_percent = cum_population / 322087547) %>%
- mutate(subtype = case_when(
- cum_percent < 0.2 ~ "Very\nGOP",
- cum_percent < 0.4 ~ "GOP",
- cum_percent < 0.6 ~ "Swing\nCounties",
- cum_percent < 0.8 ~ "Dem",
- cum_percent <= 1 ~ "Very\nDem"))
- ################################################################################
- ### Take our spreadsheet data, reformat it a bit, and compute a 7-day SMA
- ################################################################################
- data <-
- spreadsheet %>%
- left_join(pres_2016, by = "fips") %>%
- group_by(date, subtype) %>%
- summarize(cases = sum(cases, na.rm = TRUE),
- deaths = sum(deaths, na.rm = TRUE)) %>%
- ungroup() %>%
- pivot_wider(id_cols = c("date"), names_from = "subtype", values_from = c("cases", "deaths")) %>%
- ### Take a 7-day SMA of values to smooth them out a bit
- mutate(across(-date,
- .fns = list(sma = ~ SMA(., n = 7)),
- .names = "{fn}_{col}"
- )) %>%
- select("date", starts_with("sma_")) %>%
- rename_at(vars(starts_with("sma_")), ~ str_replace(., "sma_", "")) %>%
- filter(date >= as.Date("2020-03-01")) %>%
- pivot_longer(-date, names_to = c("cases_deaths", "subtype"), names_sep = "_", values_to = "values") %>%
- pivot_wider(id_cols = c("date", "subtype"), names_from = "cases_deaths", values_from = "values") %>%
- mutate_if(is.numeric, ~ (replace_na(., 0)))
- ################################################################################
- ### Draw the inset "Regional Curves" graph
- ################################################################################
- data$subtype <- factor(data$subtype, levels = c("Very\nGOP", "GOP", "Swing\nCounties", "Dem", "Very\nDem"))
- g_regional_curves_cases <-
- ggplot(data = data, aes(x = as.Date(date), y = cases)) +
- theme_bw() +
- theme(legend.position = "none") +
- theme(strip.background = element_blank()) +
- geom_line(color = "black", size = 1) +
- geom_area(aes(fill = as.factor(subtype))) +
- labs(title = "Cases by Political Lean", x = "", y = "") +
- scale_y_continuous(labels = scales::comma) +
- scale_fill_manual(values = c("Very\nDem" = "#0015BC",
- "Dem" = "#8bb1ff",
- "Swing\nCounties" = "#FFFFFF",
- "GOP" = "#ff8b98",
- "Very\nGOP" = "#E9141D")) +
- facet_wrap(~ subtype, nrow = 5, ncol = 1, strip.position = "right")
- print(g_regional_curves_cases)
- g_regional_curves_deaths <-
- ggplot(data = data, aes(x = as.Date(date), y = deaths)) +
- theme_bw() +
- theme(legend.position = "none") +
- theme(strip.background = element_blank()) +
- geom_line(color = "black", size = 1) +
- geom_area(aes(fill = as.factor(subtype))) +
- labs(title = "Deaths by Political Lean", x = "", y = "") +
- scale_y_continuous(labels = scales::comma) +
- scale_fill_manual(values = c("Very\nDem" = "#0015BC",
- "Dem" = "#8bb1ff",
- "Swing\nCounties" = "#FFFFFF",
- "GOP" = "#ff8b98",
- "Very\nGOP" = "#E9141D")) +
- facet_wrap(~ subtype, nrow = 5, ncol = 1, strip.position = "right")
- print(g_regional_curves_deaths)
- ################################################################################
- ### Draw the inset USA map
- ################################################################################
- county_map <-
- county_laea %>%
- rename(fips = GEOID) %>%
- mutate(fips = if_else(fips == "46113", "46102", fips)) %>%
- mutate(fips = if_else(fips == "02270", "02158", fips)) %>%
- left_join(pres_2016, by = "fips")
- g_map_usa_regions <-
- ggplot(data = county_map) +
- theme_void() +
- theme(legend.position = "none") +
- geom_sf(aes(fill = subtype), color = "black", size = 0) +
- scale_fill_manual(values = c("Very\nDem" = "#0015BC",
- "Dem" = "#8bb1ff",
- "Swing\nCounties" = NA,
- "GOP" = "#ff8b98",
- "Very\nGOP" = "#E9141D")) +
- geom_sf(data = state_laea, size = 0.1, color = "black", fill = NA)
- print(g_map_usa_regions)
- ################################################################################
- ### Draw the main graph (stacked line graph of cases)
- ################################################################################
- caption <-
- paste("Data Source: The New York Times\n",
- "Current as of ",
- data %>% tail(n = 1) %>% pull("date") %>% format("%B %d, %Y"),
- sep = "")
- total_cases <- spreadsheet %>% filter(date == (spreadsheet %>% tail(n = 1) %>% pull("date"))) %>% pull("cases") %>% sum()
- total_deaths <- spreadsheet %>% filter(date == (spreadsheet %>% tail(n = 1) %>% pull("date"))) %>% pull("deaths") %>% sum()
- cases_growing_at <-
- spreadsheet %>%
- select(date, cases) %>%
- group_by(date) %>%
- summarize(cases = sum(cases)) %>%
- tail(n = 7) %>%
- pull(cases) %>%
- mean() %>%
- round() %>%
- format(big.mark = ",", scientific = FALSE)
- deaths_growing_at <-
- spreadsheet %>%
- select(date, deaths) %>%
- group_by(date) %>%
- summarize(deaths = sum(deaths)) %>%
- tail(n = 7) %>%
- pull(deaths) %>%
- mean() %>%
- round() %>%
- format(big.mark = ",", scientific = FALSE)
- cases_up_rate <-
- spreadsheet %>%
- select(date, cases) %>%
- group_by(date) %>%
- summarize(cases = sum(cases)) %>%
- ungroup() %>%
- mutate(new_cases_sma = SMA(cases, n = 7)) %>%
- tail(n = 7) %>%
- arrange(date) %>%
- filter(row_number() %in% c(1, n())) %>%
- pull(new_cases_sma)
- deaths_up_rate <-
- spreadsheet %>%
- select(date, deaths) %>%
- group_by(date) %>%
- summarize(deaths = sum(deaths)) %>%
- ungroup() %>%
- mutate(new_deaths_sma = SMA(deaths, n = 7)) %>%
- tail(n = 7) %>%
- arrange(date) %>%
- filter(row_number() %in% c(1, n())) %>%
- pull(new_deaths_sma)
- cases_up_rate <- round(100 * (cases_up_rate[2] - cases_up_rate[1]) / cases_up_rate[1], 2)
- cases_up_rate_txt <- "Up"
- if (cases_up_rate < 0) { cases_up_rate_txt <- "Down"}
- deaths_up_rate <- round(100 * (deaths_up_rate[2] - deaths_up_rate[1]) / deaths_up_rate[1], 2)
- deaths_up_rate_txt <- "Up"
- if (deaths_up_rate < 0) { deaths_up_rate_txt <- "Down"}
- subtitle_cases <-
- paste(total_cases %>% format(big.mark = ",", scientific = FALSE),
- " New Cases (",
- round(100000 * total_cases / 327533795, 1),
- " per 100,000 people)\n",
- "Average ",cases_growing_at, " new cases/day last 7 days\n",
- cases_up_rate_txt, " ", abs(cases_up_rate), "% from 7 days ago",
- sep = "")
- subtitle_deaths <-
- paste(total_deaths %>% format(big.mark = ",", scientific = FALSE),
- " New Deaths (",
- round(100000 * total_deaths / 327533795, 1),
- " per 100,000 people)\n",
- "Average ",deaths_growing_at, " new deaths/day last 7 days\n",
- deaths_up_rate_txt, " ", abs(deaths_up_rate), "% from 7 days ago",
- sep = "")
- g_cases_stacked <-
- ggplot(data = data, aes(x = as.Date(date), y = cases, fill = subtype)) +
- theme_linedraw() +
- theme(legend.title = element_blank()) +
- theme(legend.position = "none") +
- geom_area(color="black", size = 0.4, alpha = 0.8) +
- scale_fill_manual(values = c("Very\nDem" = "#0015BC",
- "Dem" = "#8bb1ff",
- "Swing\nCounties" = NA, #"goldenrod2",
- "GOP" = "#ff8b98",
- "Very\nGOP" = "#E9141D")) +
- scale_y_continuous(labels = scales::comma) +
- scale_x_date(date_breaks = "1 month", date_labels = "%b") +
- labs(title = "Daily COVID-19 Cases by winner of 2016 US Presidential Election binned by population quintile",
- subtitle = subtitle_cases,
- x = "Date",
- y = "Daily Cases",
- caption = caption)
- print(g_cases_stacked)
- g_deaths_stacked <-
- ggplot(data = data, aes(x = as.Date(date), y = deaths, fill = subtype)) +
- theme_linedraw() +
- theme(legend.title = element_blank()) +
- theme(legend.position = "none") +
- geom_area(color="black", size = 0.4, alpha = 0.8) +
- scale_fill_manual(values = c("Very\nDem" = "#0015BC",
- "Dem" = "#8bb1ff",
- "Swing\nCounties" = NA, #"goldenrod2",
- "GOP" = "#ff8b98",
- "Very\nGOP" = "#E9141D")) +
- scale_y_continuous(labels = scales::comma) +
- scale_x_date(date_breaks = "1 month", date_labels = "%b") +
- labs(title = "Daily COVID-19 Deaths by winner of 2016 US Presidential Election binned by population quintile",
- subtitle = subtitle_deaths,
- x = "Date",
- y = "Daily Deaths",
- caption = caption)
- print(g_deaths_stacked)
- ################################################################################
- ### Draw the inset stacked graph percent chart in the upper-right
- ################################################################################
- per_data <-
- data %>%
- group_by(date, subtype) %>%
- summarise(n_cases = sum(cases),
- n_deaths = sum(deaths)) %>%
- mutate(per_cases = n_cases / sum(n_cases)) %>%
- mutate(per_deaths = n_deaths / sum(n_deaths)) %>%
- select(-n_cases, -n_deaths) %>%
- filter(date >= as.Date("2020-03-16"))
- g_cases_stacked_per <-
- ggplot(data = per_data, aes(x = as.Date(date), y = per_cases, fill = subtype)) +
- theme_linedraw() +
- theme(legend.title = element_blank(),
- legend.position = "none",
- plot.background = element_rect(fill = "transparent",colour = NA)
- ) +
- geom_area(color="black", size = 0.4, alpha = .8) +
- scale_y_continuous(labels = scales::percent) +
- geom_hline(yintercept = 0.5, linetype = "dashed") +
- scale_fill_manual(values = c("Very\nDem" = "#0015BC",
- "Dem" = "#8bb1ff",
- "Swing\nCounties" = NA,
- "GOP" = "#ff8b98",
- "Very\nGOP" = "#E9141D")) +
- labs(title = "Proportion of National Daily Cases",
- #subtitle = "Dotted line represents fraction of voters in respective parties",
- x = "", y = "")
- print(g_cases_stacked_per)
- g_deaths_stacked_per <-
- ggplot(data = per_data, aes(x = as.Date(date), y = per_deaths, fill = subtype)) +
- theme_linedraw() +
- theme(legend.title = element_blank(),
- legend.position = "none",
- plot.background = element_rect(fill = "transparent",colour = NA)
- ) +
- geom_area(color="black", size = 0.4, alpha = .8) +
- scale_y_continuous(labels = scales::percent) +
- geom_hline(yintercept = 0.5, linetype = "dashed") +
- scale_fill_manual(values = c("Very\nDem" = "#0015BC",
- "Dem" = "#8bb1ff",
- "Swing\nCounties" = NA,
- "GOP" = "#ff8b98",
- "Very\nGOP" = "#E9141D")) +
- labs(title = "Proportion of National Daily Deaths",
- #subtitle = "Dotted line represents fraction of voters in respective parties",
- x = "", y = "")
- print(g_deaths_stacked_per)
- ################################################################################
- ### Put it all together
- ################################################################################
- g_final_cases <-
- g_cases_stacked +
- annotation_custom(ggplotGrob(g_cases_stacked_per),
- xmin = as.Date(data %>% tail(n = 1) %>% pull("date")) - 95 - 60,
- xmax = as.Date(data %>% tail(n = 1) %>% pull("date")) - 95,
- ymin = 67000,
- ymax = 38000) +
- annotation_custom(ggplotGrob(g_map_usa_regions),
- xmin = as.Date(data %>% head(n = 1) %>% pull("date")) - 5,
- xmax = as.Date(data %>% head(n = 1) %>% pull("date")) - 5 + 45,
- ymax = 67000,
- ymin = 42000) +
- annotation_custom(ggplotGrob(g_regional_curves_cases),
- xmin = as.Date(data %>% head(n = 1) %>% pull("date")) - 10,
- xmax = as.Date(data %>% head(n = 1) %>% pull("date")) - 10 + 33,
- ymin = 7000,
- ymax = 40000)
- print(g_final_cases)
- g_final_deaths <-
- g_deaths_stacked +
- annotation_custom(ggplotGrob(g_deaths_stacked_per),
- xmin = as.Date(data %>% tail(n = 1) %>% pull("date")) -60 - 60,
- xmax = as.Date(data %>% tail(n = 1) %>% pull("date")) -60,
- ymin = 1000,
- ymax = 1800) +
- annotation_custom(ggplotGrob(g_map_usa_regions),
- xmin = as.Date(data %>% head(n = 1) %>% pull("date")) - 5,
- xmax = as.Date(data %>% head(n = 1) %>% pull("date")) - 5 + 45,
- ymin = 1100,
- ymax = 1800) +
- annotation_custom(ggplotGrob(g_regional_curves_deaths),
- xmin = as.Date(data %>% head(n = 1) %>% pull("date")) - 10,
- xmax = as.Date(data %>% head(n = 1) %>% pull("date")) - 10 + 33,
- ymin = 100,
- ymax = 1050)
- print(g_final_deaths)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement