YaBoiSwayZ

Regul8r v1

Jun 22nd, 2024
import requests
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib.pyplot as plt
import sqlite3
import datetime
import logging
from concurrent.futures import ThreadPoolExecutor
import argparse
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import json
import os

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Constants
URL = "https://mimer.svk.se/PrimaryRegulation/PrimaryRegulationIndex"
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}
HEADER_MAPPING_FILE = "header_mappings.json"
HEADER_HISTORY_FILE = "header_history.json"

# Default expected headers
DEFAULT_EXPECTED_HEADERS = {
    "date and time": "Date and Time",
    "fcr-n price (eur/mw)": "FCR-N Price (EUR/MW)",
    "fcr-d up price (eur/mw)": "FCR-D Up Price (EUR/MW)",
    "fcr-d down price (eur/mw)": "FCR-D Down Price (EUR/MW)"
}
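# Note: header_mappings.json is expected to mirror DEFAULT_EXPECTED_HEADERS
# (lowercase header text -> display name), e.g. {"fcr-n price (eur/mw)": "FCR-N Price (EUR/MW)"}.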

def load_json_file(file_path, default_data):
    if os.path.exists(file_path):
        with open(file_path, "r") as file:
            return json.load(file)
    return default_data

def save_json_file(file_path, data):
    with open(file_path, "w") as file:
        json.dump(data, file, indent=4)

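# The retrying session below retries transient server errors (HTTP 500/502/504)
# with exponential backoff, so a single flaky response does not abort the whole run.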
def requests_retry_session(
    retries=3, backoff_factor=0.3, status_forcelist=(500, 502, 504), session=None
):
    session = session or requests.Session()
    retry = Retry(
        total=retries,
        read=retries,
        connect=retries,
        backoff_factor=backoff_factor,
        status_forcelist=status_forcelist,
    )
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    return session

def fetch_webpage(url, headers):
    try:
        logging.info(f"Fetching webpage: {url}")
        response = requests_retry_session().get(url, headers=headers)
        response.raise_for_status()
        logging.info(f"Successfully fetched webpage: {url}")
        return response
    except requests.exceptions.RequestException as e:
        logging.error(f"Error fetching the webpage: {e}")
        raise

def parse_html(content):
    logging.info("Parsing HTML content.")
    soup = BeautifulSoup(content, "html.parser")
    table = soup.find("table")
    if not table:
        logging.error("Table not found on the webpage.")
        raise ValueError("Table not found on the webpage.")
    logging.info("Table found and parsed successfully.")
    return table

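# Illustrative note: header_history.json records when each column header was first
# and last seen, roughly {"<header>": {"first_detected": "<ISO timestamp>", "last_detected": "<ISO timestamp>"}}.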
def get_column_indices(table, header_mappings, header_history):
    logging.info("Identifying column indices based on headers.")
    headers = table.find_all("th")
    header_texts = [header.text.strip().lower() for header in headers]

    logging.info(f"Actual headers found: {header_texts}")

    indices = {}
    for position, header in enumerate(header_texts):
        if header in header_mappings:
            indices[header] = position
        else:
            logging.info(f"New header '{header}' found. Adding to header mappings and history.")
            display_name = header.replace("_", " ").title()
            header_mappings[header] = display_name
            indices[header] = position
            if header not in header_history:
                header_history[header] = {"first_detected": datetime.datetime.now().isoformat(), "last_detected": datetime.datetime.now().isoformat()}
            else:
                header_history[header]["last_detected"] = datetime.datetime.now().isoformat()

    save_json_file(HEADER_MAPPING_FILE, header_mappings)
    save_json_file(HEADER_HISTORY_FILE, header_history)

    return indices

def extract_data_for_date(table, date, indices):
    logging.info(f"Extracting data for date: {date}")
    data = []
    date_str = date.strftime("%Y-%m-%d")

    for row in table.find_all("tr")[1:]:
        cells = row.find_all("td")
        if len(cells) <= max(indices.values()):
            logging.warning(f"Unexpected table structure on date {date_str}, skipping row.")
            continue
        date_time = cells[indices["date and time"]].text.strip()
        if date_time.startswith(date_str):
            try:
                # Prices may use a decimal comma; normalize to a dot before float conversion.
                fcr_n_price = float(cells[indices["fcr-n price (eur/mw)"]].text.strip().replace(",", ".")) if "fcr-n price (eur/mw)" in indices else None
                fcr_d_up_price = float(cells[indices["fcr-d up price (eur/mw)"]].text.strip().replace(",", ".")) if "fcr-d up price (eur/mw)" in indices else None
                fcr_d_down_price = float(cells[indices["fcr-d down price (eur/mw)"]].text.strip().replace(",", ".")) if "fcr-d down price (eur/mw)" in indices else None
                row_data = [date_time, fcr_n_price, fcr_d_up_price, fcr_d_down_price]
                for header, index in indices.items():
                    if header not in DEFAULT_EXPECTED_HEADERS:
                        column_value = cells[index].text.strip().replace(",", ".")
                        row_data.append(float(column_value) if column_value else None)
                data.append(row_data)
            except (IndexError, ValueError) as e:
                logging.warning(f"Error parsing row data for date {date_str}: {e}")
                continue
    logging.info(f"Data extraction for date {date} completed with {len(data)} records.")
    return data

def extract_data(start_date, end_date):
    try:
        response = fetch_webpage(URL, HEADERS)
        table = parse_html(response.content)
        header_mappings = load_json_file(HEADER_MAPPING_FILE, DEFAULT_EXPECTED_HEADERS)
        header_history = load_json_file(HEADER_HISTORY_FILE, {})
        indices = get_column_indices(table, header_mappings, header_history)
    except Exception as e:
        logging.error(f"Error during data extraction: {e}")
        return pd.DataFrame()

    data = []
    with ThreadPoolExecutor() as executor:
        futures = [
            executor.submit(extract_data_for_date, table, current_date, indices)
            for current_date in pd.date_range(start_date, end_date)
        ]

        for future in futures:
            try:
                result = future.result()
                data.extend(result)
                logging.info(f"Data extracted for current date range segment with {len(result)} records.")
            except Exception as e:
                logging.warning(f"Error processing data for a date range: {e}")

    # Build column names in the same order the rows are assembled:
    # the four expected columns first, then any extra headers found on the page.
    all_headers = list(DEFAULT_EXPECTED_HEADERS.values())
    all_headers += [header_mappings[header] for header in indices if header not in DEFAULT_EXPECTED_HEADERS]
    df = pd.DataFrame(data, columns=all_headers)
    logging.info(f"Total records extracted: {len(df)}")
    return df

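# Cleaning rule used below: rows with missing values are dropped, and for every
# numeric column a value outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] marks the row as an outlier.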
def validate_and_clean_data(df):
    if df.empty:
        logging.warning("No data to validate and clean.")
        return df

    logging.info("Validating and cleaning data.")
    initial_length = len(df)
    df.dropna(inplace=True)
    for column in df.columns[1:]:
        Q1 = df[column].quantile(0.25)
        Q3 = df[column].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        df = df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]
    logging.info(f"Data cleaned: removed {initial_length - len(df)} outliers and missing values.")
    return df

def analyze_and_visualize_data(df):
    if df.empty:
        logging.warning("No data available for analysis and visualization.")
        return

    logging.info("Analyzing and visualizing data.")
    summary = df.describe()
    logging.info(f"Data summary:\n{summary}")

    # Plot from a copy indexed by timestamp so the caller's DataFrame keeps
    # "Date and Time" as a regular column for the database export.
    plot_df = df.set_index("Date and Time")
    plot_df.plot(subplots=True, figsize=(10, 8))
    plt.tight_layout()
    plt.show()

def save_to_database(df, db_filename):
    if df.empty:
        logging.warning("No data available to save to database.")
        return

    logging.info(f"Saving data to database: {db_filename}")
    conn = sqlite3.connect(db_filename)
    df.to_sql("primary_regulation", conn, if_exists="replace", index=False)
    conn.close()
    logging.info("Data saved to database successfully.")

def main(start_date, end_date, db_filename):
    try:
        df = extract_data(start_date, end_date)
        df = validate_and_clean_data(df)
        analyze_and_visualize_data(df)
        save_to_database(df, db_filename)
    except Exception as e:
        logging.error(f"An error occurred: {e}")

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Fetch and save primary regulation data.')
    parser.add_argument('--start-date', type=str, required=True, help='Start date in YYYY-MM-DD format')
    parser.add_argument('--end-date', type=str, required=True, help='End date in YYYY-MM-DD format')
    parser.add_argument('--db-filename', type=str, default="primary_regulation_data.db", help='Output SQLite database filename')

    args = parser.parse_args()

    start_date = datetime.datetime.strptime(args.start_date, "%Y-%m-%d").date()
    end_date = datetime.datetime.strptime(args.end_date, "%Y-%m-%d").date()

    main(start_date, end_date, args.db_filename)
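
# Example invocation (hypothetical script name and dates, shown for illustration only):
#   python regul8r.py --start-date 2024-06-01 --end-date 2024-06-07 --db-filename primary_regulation_data.db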