Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # 1. Setup - Import, client, dataset_ref, dataset
- from google.cloud import bigquery
- # Create a "Client" object
- client = bigquery.Client()
- # Construct a reference to the "chicago_taxi_trips" dataset
- dataset_ref = client.dataset("chicago_taxi_trips", project="bigquery-public-data")
- # API request - fetch the dataset
- dataset = client.get_dataset(dataset_ref)
- # 2. Check the names of the tables to find the right table
- tables = list(client.list_tables(dataset))
- for table in tables:
- print(table.table_id)
- print("Only 1 table with name: taxi_trips")
- table_name = "taxi_trips"
- # 3a. Select the table - table_ref, table
- table_ref = dataset_ref.table("taxi_trips")
- table = client.get_table(table_ref)
- # 3b. Inspect the table by viewing the top 5 rows of the table
- client.list_rows(table, max_results=5).to_dataframe()
- # 4. Query 1 - Group by
- rides_per_year_query = """
- SELECT EXTRACT(YEAR FROM trip_start_timestamp) AS year, COUNT(1) AS num_trips
- FROM `bigquery-public-data.chicago_taxi_trips.taxi_trips`
- GROUP BY year
- ORDER BY year
- """
- # Set up the query (cancel the query if it would use too much of your quota)
- safe_config = bigquery.QueryJobConfig(maximum_bytes_billed=10**10)
- rides_per_year_query_job = client.query(rides_per_year_query, job_config=safe_config)
- # API request - run the query, and return a pandas DataFrame
- rides_per_year_result = rides_per_year_query_job.to_dataframe()
- print(rides_per_year_result)
- # 5. Query 2 - Similar to the 1st one - Group by
- rides_per_month_query = """
- SELECT EXTRACT(MONTH FROM trip_start_timestamp) AS month, COUNT(1) AS num_trips
- FROM `bigquery-public-data.chicago_taxi_trips.taxi_trips`
- WHERE EXTRACT(YEAR FROM trip_start_timestamp) = 2016
- GROUP BY month
- ORDER BY month
- """
- # Set up the query
- safe_config = bigquery.QueryJobConfig(maximum_bytes_billed=10**10)
- rides_per_month_query_job = client.query(rides_per_month_query, job_config=safe_config)
- # API request - run the query, and return a pandas DataFrame
- rides_per_month_result = rides_per_month_query_job.to_dataframe()
- print(rides_per_month_result)
- # 5. Hard Query - Temporary Table
- speeds_query = """
- WITH RelevantRides AS
- (
- SELECT trip_start_timestamp, trip_miles, trip_seconds
- FROM `bigquery-public-data.chicago_taxi_trips.taxi_trips`
- WHERE trip_start_timestamp > '2016-01-01' AND
- trip_start_timestamp < '2016-04-01' AND
- trip_seconds>0 AND
- trip_miles>0
- )
- SELECT EXTRACT(HOUR FROM trip_start_timestamp) AS hour_of_day, COUNT(1) AS num_trips, 3600*SUM(trip_miles)/SUM(trip_seconds) AS avg_mph
- FROM RelevantRides
- GROUP BY hour_of_day
- ORDER BY hour_of_day
- """
- # Set up the query
- safe_config = bigquery.QueryJobConfig(maximum_bytes_billed=10**10)
- speeds_query_job = client.query(speeds_query, job_config=safe_config)
- # API request - run the query, and return a pandas DataFrame
- speeds_result = speeds_query_job.to_dataframe()
- print(speeds_result)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement