Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- ### Compute the ratio of patents granted per 100k US population, as a measure
- ### of innovation over time. <Mathew.Binkley@Vanderbilt.edu>
- library(tidyverse)
- library(rvest)
- library(scales)
- library(broom)
- library(minpack.lm)
- library(gt)
- library(cowplot)
- ### US Population since 1900
- us_pop_pres <-
- "https://multpl.com/united-states-population/table/by-year" %>%
- read_html() %>%
- html_nodes("table") %>%
- .[[1]] %>%
- html_table() %>%
- filter(grepl("Jul", Date)) %>%
- rename(Year = Date) %>%
- mutate(
- Year = gsub("Jul 1, ", "", Year) %>% as.integer(),
- Value = gsub(" million", "", Value) %>% as.numeric(),
- Date = as.Date(paste(Year, "-01-01", sep = "")),
- Value = Value * 1000000
- ) %>%
- select(Date, Value)
- # US population before 1900 from Wikipedia
- # https://en.wikipedia.org/wiki/Demographic_history_of_the_United_States
- us_pop_hist <-
- tribble(
- ~year, ~Value,
- 1610, 350,
- 1620, 2302,
- 1630, 4646,
- 1640, 26634,
- 1650, 50368,
- 1660, 75058,
- 1670, 111935,
- 1680, 151507,
- 1690, 210372,
- 1700, 250888,
- 1710, 331711,
- 1720, 466185,
- 1730, 629445,
- 1740, 905563,
- 1750, 1170760,
- 1760, 1593625,
- 1770, 2148076,
- 1780, 2780369,
- 1790, 3929214,
- 1800, 5308483,
- 1810, 7239881,
- 1820, 9638453,
- 1830, 12866020,
- 1840, 17069453,
- 1850, 23191876,
- 1860, 31443321,
- 1870, 38558371,
- 1880, 50189209,
- 1890, 62979766
- ) %>%
- mutate(Date = as.Date(paste(year, "-01-01", sep = ""))) %>%
- select(Date, Value)
- ### Combine past/present history into one dataset
- us_pop <-
- us_pop_pres %>%
- bind_rows(us_pop_hist) %>%
- arrange(Date) %>%
- rename(date = Date, us_pop = Value)
- # Info on number of patents from US Patent & Trademark Office
- patents <-
- "https://www.uspto.gov/web/offices/ac/ido/oeip/taf/h_counts.htm" %>%
- read_html() %>%
- html_elements("table") %>%
- .[[3]] %>%
- html_table() %>%
- janitor::clean_names() %>%
- mutate_all(~sub(",", "", .)) %>%
- select(-notes) %>%
- rename(
- year = 1,
- utility_patent_applications = 2,
- design_patent_applications = 3,
- plant_patent_applications = 4,
- utility_patents = 5,
- design_patents = 6,
- plant_patents = 7,
- patents_to_foreign_residents = 8
- ) %>%
- mutate(
- year = ifelse(year == "1836 (c)", "1836", year),
- design_patents = ifelse(design_patents == "(b)", "", design_patents)
- ) %>%
- mutate_all(~sub("n/a", "", .)) %>%
- mutate_all(as.numeric) %>%
- replace(is.na(.), 0) %>%
- mutate(
- date = as.Date(paste(year, "-01-01", sep = "")),
- total_patents = utility_patents + design_patents + plant_patents
- ) %>%
- select(date, total_patents)
- # Combine patents and population into one table so we can compute the rate of
- # patents per 100k population
- combo <-
- us_pop %>%
- full_join(patents, by = "date") %>%
- filter(!is.na(total_patents), !is.na(us_pop)) %>%
- mutate(
- year = year(date),
- patent_rate_100k = 100000 * total_patents / us_pop)
- # Do a non-linear fit and derive a fit. Starting points originally derived by eye-balling it.
- fit <-
- combo %>%
- nlsLM(patent_rate_100k ~ a + (b * year) + exp(c + d * year) + amplitude * sin((2 * pi / period) * (year + phase)),
- data = .,
- start = list(a = -178.75411547, b = 0.10855252, c = -179.64201187, d = 0.09114342, amplitude = 50, period = 68.9, phase = 40))
- a <- fit %>% tidy() %>% filter(term == "a") %>% pull(estimate)
- b <- fit %>% tidy() %>% filter(term == "b") %>% pull(estimate)
- c <- fit %>% tidy() %>% filter(term == "c") %>% pull(estimate)
- d <- fit %>% tidy() %>% filter(term == "d") %>% pull(estimate)
- amplitude <- fit %>% tidy() %>% filter(term == "amplitude") %>% pull(estimate)
- period <- fit %>% tidy() %>% filter(term == "period") %>% pull(estimate)
- phase <- fit %>% tidy() %>% filter(term == "phase") %>% pull(estimate)
- combo <-
- combo %>%
- mutate(
- # Fit the non-cyclical parts so we can subtract out
- fit_lin_exp = a + b*year + exp(c + d * year),
- detrend_lin_exp = patent_rate_100k - fit_lin_exp,
- # Fit the cyclical parts so we can also subtract them to look at residuals
- fit_sin = amplitude * sin((2 * pi / period) * (year + phase)),
- detrend_sin = detrend_lin_exp - fit_sin
- )
- # Create a table of the fit info to append to our graph. "gt" doesn't allow
- # converting tables to grobs, so I save as a png and add to the graph in GIMP.
- fit_info <-
- fit %>%
- tidy() %>%
- gt() %>%
- tab_header(title = "patent_rate_100k ~ (a + b * year) + exp(c + d * year) + amplitude * sin((2 * pi / period) * (year + phase)") %>%
- fmt_number(decimals = 3)
- print(fit_info)
- # Graph the raw patent per 100k rate, showing the trend line we will subtract
- g_patents <-
- ggplot(data = combo) +
- theme_bw() +
- geom_line(aes(x = date, y = patent_rate_100k)) +
- geom_line(aes(x = date, y = fit_lin_exp + fit_sin), color = "firebrick2") +
- scale_x_date(breaks = pretty_breaks(10)) +
- scale_y_continuous(breaks = pretty_breaks(10)) +
- labs(x = "", y = "Patent Rate per 100k population", title = "US Patent Rate per 100k population")
- print(g_patents)
- # Subtract out the non-cyclical linear/exponential components and graph the
- # z-score of the detrended patent rate
- g_patents_detrend <-
- ggplot(data = combo) +
- theme_bw() +
- geom_line(aes(x = date, y = (detrend_lin_exp - mean(detrend_lin_exp))/sd(detrend_lin_exp))) +
- geom_hline(yintercept = 0, linetype = "dashed", color = "steelblue2") +
- scale_x_date(breaks = pretty_breaks(10), limits = c(as.Date("1790-01-01"), Sys.Date())) +
- scale_y_continuous(breaks = pretty_breaks(10)) +
- labs(x = "", y = "Z-Score",
- title = "Z-Score of linear/exponential-detrended US Patent Rate per 100k population")
- print(g_patents_detrend)
- plot_grid(g_patents, g_patents_detrend, ncol = 1, align = "hv")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement