# Python-based SQL queries - Group by, Count - Exercise 3

# **********************************************************************
# **********************************************************************
# Step 1 - Setup
# **********************************************************************
# **********************************************************************


# Import
from google.cloud import bigquery

# Open a connection to BigQuery through a "Client" object
client = bigquery.Client()

# Build references to the "hacker_news" dataset (hosted in the
# "bigquery-public-data" project) and to its "comments" table
dataset_ref = client.dataset("hacker_news", project="bigquery-public-data")
table_ref = dataset_ref.table("comments")

# API requests - fetch the dataset first, then the table
dataset = client.get_dataset(dataset_ref)
table = client.get_table(table_ref)

# Preview the first five rows of the "comments" table as a DataFrame
client.list_rows(
    table,
    max_results=5,
).to_dataframe()


# **********************************************************************
# **********************************************************************
# Step 2 - Group by
# **********************************************************************
# **********************************************************************

# Query to select prolific commenters (authors with more than 10,000
# rows in the "comments" table) together with their post counts
prolific_commenters_query = """
                            SELECT author, COUNT(1) as NumPosts
                            FROM `bigquery-public-data.hacker_news.comments`
                            GROUP BY author
                            HAVING count(1)>10000
                            """

# Set up the query (cancel the query if it would use too much of
# your quota, with the limit set to 10 GB, i.e. 10**10 bytes)
safe_config = bigquery.QueryJobConfig(maximum_bytes_billed=10**10)
query_job = client.query(prolific_commenters_query, job_config=safe_config)

# API request - run the query, and return a pandas DataFrame
prolific_commenters = query_job.to_dataframe()

# View top few rows of results
print(prolific_commenters.head())


# **********************************************************************
# **********************************************************************
# Step 3 - Count
# **********************************************************************
# **********************************************************************

# Cap on the number of bytes this query may bill (10**10 bytes)
safe_config = bigquery.QueryJobConfig(maximum_bytes_billed=10**10)

# Query to count the comments whose "deleted" column is True
deleted_query = """
                SELECT COUNT(deleted) AS Deleted_Comments
                FROM `bigquery-public-data.hacker_news.comments`
                WHERE deleted = True
                """

# Submit the query with the byte cap applied
query_job = client.query(
    deleted_query,
    job_config=safe_config,
)

# API request - run the query, collect the result as a DataFrame, and show it
deleted_commenters = query_job.to_dataframe()
print(deleted_commenters.head())