tsvtln

urban_areas

Feb 8th, 2025
Python 5.93 KB | Source Code
import requests
from bs4 import BeautifulSoup, Tag
import json
import hashlib
import os
from typing import Optional

# URL for the ASEAN Wikipedia page.
URL = 'https://en.wikipedia.org/wiki/ASEAN'
OUTPUT_FILE = 'countries_data.json'


def fetch_content(url: str) -> BeautifulSoup:
    # Fetch the content from the URL and return a BeautifulSoup object.
    response = requests.get(url)
    response.raise_for_status()  # Raise an error if the request fails.
    return BeautifulSoup(response.content, 'html.parser')

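
# Note: Wikipedia sometimes throttles clients that send the default requests
# User-Agent. This optional variant (a sketch, not called by main()) sends a
# descriptive header; the exact string is illustrative, not a required value.
def fetch_content_with_user_agent(url: str) -> BeautifulSoup:
    headers = {"User-Agent": "urban-areas-scraper/0.1 (contact: you@example.com)"}
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()
    return BeautifulSoup(response.content, 'html.parser')
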

def find_urban_areas_table(soup: BeautifulSoup) -> Optional[Tag]:
    # Find the "Urban areas" h3 heading (located with the browser's F12
    # inspector), then return the first table element that follows it.
    header = soup.find(lambda tag: tag.name == "h3" and "Urban areas" in tag.get_text())
    if header:
        return header.find_next("table")
    return None

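
# Fallback sketch: if the text match above ever breaks (Wikipedia's heading
# markup changes over time), the section can also be located by its anchor id.
# The id "Urban_areas" is an assumption based on the page's section title.
def find_urban_areas_table_by_id(soup: BeautifulSoup) -> Optional[Tag]:
    anchor = soup.find(id="Urban_areas")
    return anchor.find_next("table") if anchor else None
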

def parse_table(table: Tag) -> dict:
    """
    Parses the Urban areas table and returns a dictionary in the following structure:

    {
        "Country1": {
            "cities": [
                {
                    "core_city": <city_name>,
                    "population": <population>,
                    "area": <area>,
                    "density": <population density (population/area) rounded to 2 decimals>
                },
                ...
            ],
            "total_population": <sum of all cities' populations>,
            "total_area": <sum of all cities' areas>,
            "density": <overall density (total_population/total_area) rounded to 2 decimals>
        },
        ...
    }

    The function identifies the column indexes for:
      - Country
      - Core city
      - Population
      - Area

    It then extracts and cleans the data from each row, calculates the density
    for each metropolitan area, and groups the results under their respective
    country.
    """
    countries_dictionary = {}

    # get all rows from the table
    rows = table.find_all("tr")
    # get the header cells from the first row
    header = [cell.get_text(strip=True) for cell in rows[0].find_all(["th", "td"])]

    # locate the column index for each field we need
    idx_country = next(i for i, h in enumerate(header) if "Country" in h)
    idx_core_city = next(i for i, h in enumerate(header) if "Core city" in h or "City" in h)
    idx_population = next(i for i, h in enumerate(header) if "Population" in h)
    idx_area = next(i for i, h in enumerate(header) if "Area" in h)

    # go through each data row, skipping the header row
    for row in rows[1:]:
        # include both table header and table data cells in each row
        cells = row.find_all(["th", "td"])
        if len(cells) < max(idx_country, idx_core_city, idx_population, idx_area) + 1:
            continue  # skip rows that don't have enough cells

        country = cells[idx_country].get_text(strip=True)
        core_city = cells[idx_core_city].get_text(strip=True)

        # clean the population text by stripping commas and footnotes (split on "[")
        pop_text = cells[idx_population].get_text(strip=True).split("[")[0].replace(",", "")
        try:
            population = int(pop_text)
        except ValueError:
            continue

        # clean the area text by stripping commas and footnotes (split on "[")
        area_text = cells[idx_area].get_text(strip=True).split("[")[0].replace(",", "")
        try:
            area = float(area_text)
        except ValueError:
            continue

        # calculate the population density for the metropolitan area
        density = population / area if area > 0 else None

        city_data = {
            "core_city": core_city,
            "population": population,
            "area": area,
            "density": round(density, 2) if density is not None else None
        }

        if country not in countries_dictionary:
            countries_dictionary[country] = {"cities": []}
        countries_dictionary[country]["cities"].append(city_data)

    # for every country, calculate the totals and the overall density
    for country, data in countries_dictionary.items():
        total_population = sum(city["population"] for city in data["cities"])
        total_area = sum(city["area"] for city in data["cities"])
        overall_density = total_population / total_area if total_area > 0 else None
        data["total_population"] = total_population
        data["total_area"] = total_area
        data["density"] = round(overall_density, 2) if overall_density is not None else None

    return countries_dictionary

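
# Worked example of the math above, with made-up numbers purely for
# illustration: a city with population=1_000_000 and area=250.0 gets density
# 1_000_000 / 250.0 = 4000.0; two such cities under one country give
# total_population=2_000_000, total_area=500.0, and an overall density of
# 2_000_000 / 500.0 = 4000.0.
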

def generate_hash(data: dict) -> str:
    # Generate an MD5 hash of the data so two snapshots can be compared cheaply.
    data_str = json.dumps(data, sort_keys=True)
    return hashlib.md5(data_str.encode()).hexdigest()

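
# Because json.dumps(..., sort_keys=True) serializes dicts in a canonical key
# order, the hash ignores insertion order, e.g.:
#     generate_hash({"a": 1, "b": 2}) == generate_hash({"b": 2, "a": 1})  # True
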

def load_previous_data(filename: str) -> dict:
    # Load the previously saved JSON 'db' file; return an empty dict if the
    # file is missing or corrupt.
    if not os.path.exists(filename):
        return {}
    try:
        with open(filename, 'r') as f:
            return json.load(f)
    except json.JSONDecodeError:
        return {}


def save_data(filename: str, data: dict) -> None:
    # Save the collected data in JSON format to the 'db' file.
    with open(filename, 'w') as f:
        json.dump(data, f, indent=4)

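
# Optional hardening sketch (not called by main()): write to a temporary file
# and rename it over the target, so a crash mid-dump can't leave a truncated
# 'db' behind. os.replace overwrites atomically on the same filesystem.
def save_data_atomic(filename: str, data: dict) -> None:
    tmp_name = filename + ".tmp"
    with open(tmp_name, 'w') as f:
        json.dump(data, f, indent=4)
    os.replace(tmp_name, filename)
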

def main():
    soup = fetch_content(URL)
    table = find_urban_areas_table(soup)
    if table is None:
        raise SystemExit("Could not find the 'Urban areas' table on the page.")

    # parse the table to build the countries dictionary
    countries_dictionary = parse_table(table)

    # pretty-print the dictionary
    print(json.dumps(countries_dictionary, indent=4))

    # load previously saved data
    previous_data = load_previous_data(OUTPUT_FILE)

    # compare the new data with the previous data using MD5 hashes
    new_hash = generate_hash(countries_dictionary)
    old_hash = generate_hash(previous_data)

    # save the new data only if it differs from the old data
    if new_hash != old_hash:
        save_data(OUTPUT_FILE, countries_dictionary)
        print("Data updated and saved.")
    else:
        print("No changes detected. File not updated.")


if __name__ == '__main__':
    main()