Python-based SQL queries: HARD QUERY - Temporary Table - Exercise 5

# 1. Setup - Import, client, dataset_ref, dataset

from google.cloud import bigquery
# Create a "Client" object
client = bigquery.Client()
# Construct a reference to the "chicago_taxi_trips" dataset
dataset_ref = client.dataset("chicago_taxi_trips", project="bigquery-public-data")
# API request - fetch the dataset
dataset = client.get_dataset(dataset_ref)


# 2. Check the names of the tables to find the right table

tables = list(client.list_tables(dataset))
for table in tables:
    print(table.table_id)
print("Only 1 table with name: taxi_trips")
table_name = "taxi_trips"


# 3a. Select the table - table_ref, table
table_ref = dataset_ref.table("taxi_trips")
table = client.get_table(table_ref)
# 3b. Inspect the table by viewing the top 5 rows of the table
client.list_rows(table, max_results=5).to_dataframe()


# 4. Query 1 - Group by

rides_per_year_query = """
                        SELECT EXTRACT(YEAR FROM trip_start_timestamp) AS year, COUNT(1) AS num_trips
                        FROM `bigquery-public-data.chicago_taxi_trips.taxi_trips`
                        GROUP BY year
                        ORDER BY year
                        """

# Set up the query (cancel the query if it would use too much of your quota)
safe_config = bigquery.QueryJobConfig(maximum_bytes_billed=10**10)
rides_per_year_query_job = client.query(rides_per_year_query, job_config=safe_config)

# API request - run the query, and return a pandas DataFrame
rides_per_year_result = rides_per_year_query_job.to_dataframe()
print(rides_per_year_result)


# 5. Query 2 - Similar to the 1st one - Group by

rides_per_month_query = """
                       SELECT EXTRACT(MONTH FROM trip_start_timestamp) AS month, COUNT(1) AS num_trips
                       FROM `bigquery-public-data.chicago_taxi_trips.taxi_trips`
                       WHERE EXTRACT(YEAR FROM trip_start_timestamp) = 2016
                       GROUP BY month
                       ORDER BY month
                       """

# Set up the query
safe_config = bigquery.QueryJobConfig(maximum_bytes_billed=10**10)
rides_per_month_query_job = client.query(rides_per_month_query, job_config=safe_config)

# API request - run the query, and return a pandas DataFrame
rides_per_month_result = rides_per_month_query_job.to_dataframe()
print(rides_per_month_result)


# 5. Hard Query - Temporary Table

speeds_query = """
               WITH RelevantRides AS
               (
                   SELECT trip_start_timestamp, trip_miles, trip_seconds
                   FROM `bigquery-public-data.chicago_taxi_trips.taxi_trips`
                   WHERE trip_start_timestamp > '2016-01-01' AND
                         trip_start_timestamp < '2016-04-01' AND
                         trip_seconds>0 AND
                         trip_miles>0
               )
               SELECT EXTRACT(HOUR FROM trip_start_timestamp) AS hour_of_day, COUNT(1) AS num_trips, 3600*SUM(trip_miles)/SUM(trip_seconds) AS avg_mph
               FROM RelevantRides
               GROUP BY hour_of_day
               ORDER BY hour_of_day
               """

# Set up the query
safe_config = bigquery.QueryJobConfig(maximum_bytes_billed=10**10)
speeds_query_job = client.query(speeds_query, job_config=safe_config)

# API request - run the query, and return a pandas DataFrame
speeds_result = speeds_query_job.to_dataframe()
print(speeds_result)