US Patent Rate per 100k with nonlinear fit

### Compute the ratio of patents granted per 100k US population, as a measure
### of innovation over time.  <[email protected]>

library(tidyverse)
library(rvest)
library(scales)
library(broom)
library(minpack.lm)
library(gt)
library(cowplot)

### US Population since 1900
us_pop_pres <-
  "https://multpl.com/united-states-population/table/by-year" %>%
  read_html() %>%
  html_nodes("table") %>%
  .[[1]] %>%
  html_table() %>%
  filter(grepl("Jul", Date)) %>%
  rename(Year = Date) %>%
  mutate(
    Year  = gsub("Jul 1, ", "", Year) %>% as.integer(),
    Value = gsub(" million", "", Value) %>% as.numeric(),
    Date  = as.Date(paste(Year, "-01-01", sep = "")),
    Value = Value * 1000000
  ) %>%
  select(Date, Value)

# US population before 1900 from Wikipedia
# https://en.wikipedia.org/wiki/Demographic_history_of_the_United_States
us_pop_hist <-
  tribble(
    ~year, ~Value,
    1610,   350,
    1620,   2302,
    1630,   4646,
    1640,   26634,
    1650,   50368,
    1660,   75058,
    1670,   111935,
    1680,   151507,
    1690,   210372,
    1700,   250888,
    1710,   331711,
    1720,   466185,
    1730,   629445,
    1740,   905563,
    1750,   1170760,
    1760,   1593625,
    1770,   2148076,
    1780,   2780369,
    1790,   3929214,
    1800,   5308483,
    1810,   7239881,
    1820,   9638453,
    1830,   12866020,
    1840,   17069453,
    1850,   23191876,
    1860,   31443321,
    1870,   38558371,
    1880,   50189209,
    1890,   62979766
  ) %>%
  mutate(Date = as.Date(paste(year, "-01-01", sep = ""))) %>%
  select(Date, Value)

### Combine past/present history into one dataset
us_pop <-
  us_pop_pres %>%
  bind_rows(us_pop_hist) %>%
  arrange(Date) %>%
  rename(date = Date, us_pop = Value)

# Info on number of patents from US Patent & Trademark Office
patents <-
  "https://www.uspto.gov/web/offices/ac/ido/oeip/taf/h_counts.htm" %>%
  read_html() %>%
  html_elements("table") %>%
  .[[3]] %>%
  html_table() %>%
  janitor::clean_names() %>%
  mutate_all(~sub(",", "", .)) %>%
  select(-notes) %>%
  rename(
    year = 1,
    utility_patent_applications = 2,
    design_patent_applications = 3,
    plant_patent_applications = 4,
    utility_patents = 5,
    design_patents = 6,
    plant_patents = 7,
    patents_to_foreign_residents = 8
  ) %>%
  mutate(
    year = ifelse(year == "1836 (c)", "1836", year),
    design_patents = ifelse(design_patents == "(b)", "", design_patents)
    ) %>%
  mutate_all(~sub("n/a", "", .)) %>%
  mutate_all(as.numeric) %>%
  replace(is.na(.), 0) %>%
  mutate(
    date = as.Date(paste(year, "-01-01", sep = "")),
    total_patents = utility_patents + design_patents + plant_patents
    ) %>%
  select(date, total_patents)

# Combine patents and population into one table so we can compute the rate of
# patents per 100k population
combo <-
  us_pop %>%
  full_join(patents, by = "date") %>%
  filter(!is.na(total_patents), !is.na(us_pop)) %>%
  mutate(
    year = year(date),
    patent_rate_100k = 100000 * total_patents / us_pop)

# Do a non-linear fit and derive a fit.   Starting points originally derived by eye-balling it.
fit <-
  combo %>%
  nlsLM(patent_rate_100k ~ a + (b * year) + exp(c + d * year) + amplitude * sin((2 * pi / period) * (year + phase)),
        data = .,
        start = list(a = -178.75411547, b = 0.10855252, c = -179.64201187, d = 0.09114342, amplitude = 50, period = 68.9, phase = 40))

a <- fit %>% tidy() %>% filter(term == "a") %>% pull(estimate)
b <- fit %>% tidy() %>% filter(term == "b") %>% pull(estimate)
c <- fit %>% tidy() %>% filter(term == "c") %>% pull(estimate)
d <- fit %>% tidy() %>% filter(term == "d") %>% pull(estimate)

amplitude <- fit %>% tidy() %>% filter(term == "amplitude") %>% pull(estimate)
period    <- fit %>% tidy() %>% filter(term == "period")    %>% pull(estimate)
phase     <- fit %>% tidy() %>% filter(term == "phase")     %>% pull(estimate)

combo <-
  combo %>%
  mutate(

    # Fit the non-cyclical parts so we can subtract out
    fit_lin_exp     = a + b*year + exp(c + d * year),
    detrend_lin_exp = patent_rate_100k - fit_lin_exp,

    # Fit the cyclical parts so we can also subtract them to look at residuals
    fit_sin         = amplitude * sin((2 * pi / period) * (year + phase)),
    detrend_sin     = detrend_lin_exp - fit_sin
    )

# Create a table of the fit info to append to our graph.   "gt" doesn't allow
# converting tables to grobs, so I save as a png and add to the graph in GIMP.
fit_info <-
  fit %>%
  tidy() %>%
  gt() %>%
  tab_header(title = "patent_rate_100k ~ (a + b * year) + exp(c + d * year) + amplitude * sin((2 * pi / period) * (year + phase)") %>%
  fmt_number(decimals = 3)
print(fit_info)

# Graph the raw patent per 100k rate, showing the trend line we will subtract
g_patents <-
  ggplot(data = combo) +
  theme_bw() +
  geom_line(aes(x = date, y = patent_rate_100k)) +
  geom_line(aes(x = date, y = fit_lin_exp + fit_sin), color = "firebrick2") +
  scale_x_date(breaks = pretty_breaks(10)) +
  scale_y_continuous(breaks = pretty_breaks(10)) +
  labs(x = "", y = "Patent Rate per 100k population", title = "US Patent Rate per 100k population")
print(g_patents)

# Subtract out the non-cyclical linear/exponential components and graph the
# z-score of the detrended patent rate
g_patents_detrend <-
  ggplot(data = combo) +
  theme_bw() +
  geom_line(aes(x = date, y = (detrend_lin_exp - mean(detrend_lin_exp))/sd(detrend_lin_exp))) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "steelblue2") +
  scale_x_date(breaks = pretty_breaks(10), limits = c(as.Date("1790-01-01"), Sys.Date())) +
  scale_y_continuous(breaks = pretty_breaks(10)) +
  labs(x = "", y = "Z-Score",
       title = "Z-Score of linear/exponential-detrended US Patent Rate per 100k population")
print(g_patents_detrend)

plot_grid(g_patents, g_patents_detrend, ncol = 1, align = "hv")