Advertisement
makispaiktis

Python-based SQL queries: HARD QUERY - Temporary Table - Exercise 5

Jun 9th, 2023 (edited)
980
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 3.40 KB | None | 0 0
  1. # 1. Setup - Import, client, dataset_ref, dataset
  2.  
  3. from google.cloud import bigquery
  4. # Create a "Client" object
  5. client = bigquery.Client()
  6. # Construct a reference to the "chicago_taxi_trips" dataset
  7. dataset_ref = client.dataset("chicago_taxi_trips", project="bigquery-public-data")
  8. # API request - fetch the dataset
  9. dataset = client.get_dataset(dataset_ref)
  10.  
  11.  
  12.  
  13. # 2. Check the names of the tables to find the right table
  14.  
  15. tables = list(client.list_tables(dataset))
  16. for table in tables:
  17.     print(table.table_id)
  18. print("Only 1 table with name: taxi_trips")
  19. table_name = "taxi_trips"
  20.  
  21.  
  22.  
  23. # 3a. Select the table - table_ref, table
  24. table_ref = dataset_ref.table("taxi_trips")
  25. table = client.get_table(table_ref)
  26. # 3b. Inspect the table by viewing the top 5 rows of the table
  27. client.list_rows(table, max_results=5).to_dataframe()
  28.  
  29.  
  30.  
  31.  
  32. # 4. Query 1 - Group by
  33.  
  34. rides_per_year_query = """
  35.                        SELECT EXTRACT(YEAR FROM trip_start_timestamp) AS year, COUNT(1) AS num_trips
  36.                        FROM `bigquery-public-data.chicago_taxi_trips.taxi_trips`
  37.                        GROUP BY year
  38.                        ORDER BY year
  39.                        """
  40.  
  41. # Set up the query (cancel the query if it would use too much of your quota)
  42. safe_config = bigquery.QueryJobConfig(maximum_bytes_billed=10**10)
  43. rides_per_year_query_job = client.query(rides_per_year_query, job_config=safe_config)
  44.  
  45. # API request - run the query, and return a pandas DataFrame
  46. rides_per_year_result = rides_per_year_query_job.to_dataframe()
  47. print(rides_per_year_result)
  48.  
  49.  
  50.  
  51.  
  52. # 5. Query 2 - Similar to the 1st one - Group by
  53.  
  54. rides_per_month_query = """
  55.                       SELECT EXTRACT(MONTH FROM trip_start_timestamp) AS month, COUNT(1) AS num_trips
  56.                       FROM `bigquery-public-data.chicago_taxi_trips.taxi_trips`
  57.                       WHERE EXTRACT(YEAR FROM trip_start_timestamp) = 2016
  58.                       GROUP BY month
  59.                       ORDER BY month
  60.                       """
  61.  
  62. # Set up the query
  63. safe_config = bigquery.QueryJobConfig(maximum_bytes_billed=10**10)
  64. rides_per_month_query_job = client.query(rides_per_month_query, job_config=safe_config)
  65.  
  66. # API request - run the query, and return a pandas DataFrame
  67. rides_per_month_result = rides_per_month_query_job.to_dataframe()
  68. print(rides_per_month_result)
  69.  
  70.  
  71.  
  72. # 5. Hard Query - Temporary Table
  73.  
  74. speeds_query = """
  75.               WITH RelevantRides AS
  76.               (
  77.                   SELECT trip_start_timestamp, trip_miles, trip_seconds
  78.                   FROM `bigquery-public-data.chicago_taxi_trips.taxi_trips`
  79.                   WHERE trip_start_timestamp > '2016-01-01' AND
  80.                         trip_start_timestamp < '2016-04-01' AND
  81.                         trip_seconds>0 AND
  82.                         trip_miles>0
  83.               )
  84.               SELECT EXTRACT(HOUR FROM trip_start_timestamp) AS hour_of_day, COUNT(1) AS num_trips, 3600*SUM(trip_miles)/SUM(trip_seconds) AS avg_mph
  85.               FROM RelevantRides
  86.               GROUP BY hour_of_day
  87.               ORDER BY hour_of_day
  88.               """
  89.  
  90. # Set up the query
  91. safe_config = bigquery.QueryJobConfig(maximum_bytes_billed=10**10)
  92. speeds_query_job = client.query(speeds_query, job_config=safe_config)
  93.  
  94. # API request - run the query, and return a pandas DataFrame
  95. speeds_result = speeds_query_job.to_dataframe()
  96. print(speeds_result)
  97.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement