# 5. Data Analysis Parliament

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np


# 1. Keeping the columns I need
def _report_shape(frame):
    """Print the row/column counts and the column names of ``frame``.

    Returns the ``(shape, column_names)`` pair so the caller can refresh the
    module-level ``shape`` and ``all_columns`` variables after each step.
    """
    dims = frame.shape
    names = list(frame.columns)
    print(f"Dataset shape = ({dims[0]} rows-records) x ({dims[1]} columns-features)")
    print("Columns names:", names, '\n\n')
    return dims, names


# 1a. Basics: load the raw CSV and report what it contains
filepath = "/content/sample_data/parliament.csv"
df = pd.read_csv(filepath)
print("----> Original dataset")
shape, all_columns = _report_shape(df)

# 1b. Drop the unnecessary columns (the count is taken from the list itself)
cols_to_drop = ['Profile Image', 'Slug', 'Electoral Districts']
print(f"----> Dropping {len(cols_to_drop)} columns: {cols_to_drop}")
df = df.drop(columns=cols_to_drop)
shape, all_columns = _report_shape(df)

# 1c. Concatenate 2 columns into 1 (using 'agg' function)
cols_to_concatenate = ['First Name', 'Last Name']
print(f'----> Concatenating {len(cols_to_concatenate)} columns ({cols_to_concatenate}) into 1 single column')
# ' '.join is applied row-wise, so every cell in these columns must be a string.
df['Name'] = df[cols_to_concatenate].agg(' '.join, axis=1)
df = df.drop(columns=cols_to_concatenate)
shape, all_columns = _report_shape(df)

# 1d. Reordering the columns into my desired order
print("----> Reordering the dataframe columns into my desired order")
desired_order = ['Name', 'Current Position', 'Parliament Group Name', 'District', 'Total Revenue', 'Year of Use', 'Parliament Group ID', 'Government Position ID', 'Government Position Name', 'EU', 'ID']
# Selecting by an explicit list both reorders the columns and raises a
# KeyError if any expected column is absent from the CSV.
df = df[desired_order]
shape, all_columns = _report_shape(df)
print(df.head())


# 2. Check for missing data
# 2a. Split the columns by dtype: int64/float64 are treated as numerical,
#     object as categorical. Columns of any other dtype land in neither list.
total_cols = shape[1]
numerical_cols = [name for name, dtype in df.dtypes.items() if dtype in ['int64', 'float64']]
categorical_cols = [name for name, dtype in df.dtypes.items() if dtype == "object"]
my_cols = numerical_cols + categorical_cols
print(f"There are {len(numerical_cols)}/{total_cols} NUMERICAL columns:\nNumerical cols = {numerical_cols}\n")
print(f"There are {len(categorical_cols)}/{total_cols} CATEGORICAL columns:\nCategorical cols = {categorical_cols}\n")

# 2b. Collect every column that holds at least one null value
cols_with_missing = [name for name, has_null in df.isnull().any().items() if has_null]
print(f"There are {len(cols_with_missing)}/{total_cols} columns WITH MISSING VALUES:\n {cols_with_missing}")

# 2c. Break the columns with missing data down by kind (order follows the
#     numerical / categorical lists built in 2a)
missing_lookup = set(cols_with_missing)
numerical_with_missing = [name for name in numerical_cols if name in missing_lookup]
categorical_with_missing = [name for name in categorical_cols if name in missing_lookup]
selected_with_missing = [name for name in my_cols if name in missing_lookup]
n_num_missing = len(numerical_with_missing)
n_cat_missing = len(categorical_with_missing)
print(f"Namely: {n_num_missing}/{len(numerical_cols)} numerical columns and {n_cat_missing}/{len(categorical_cols)} categorical columns with missing data")
# NOTE: the columns listed in cols_with_missing are the candidates for imputation.


# 3. Find out the average total revenue for a political party in 2021
# NOTE: this import stays here because later sections of the script also use floor.
from math import floor

# 3a. Filtering
year_to_check = 2021
new_df = df[df['Year of Use'] == year_to_check]
print(f"I will check {len(new_df)}/{shape[0]} rows with filter:\nYear of Use = {year_to_check}")

# 3b. Count how many political parties there are
# nunique() ignores NaN while unique() keeps it, so `parties` may contain a
# NaN entry that is skipped explicitly below.
num_parties = df['Parliament Group Name'].nunique()
parties = df['Parliament Group Name'].unique()
print(f"There are {num_parties} parties in original dataset\n\n")

# 3c. Find the average total revenue per party for the selected year.
# One groupby replaces a separate DataFrame filter per party; iterating over
# `parties` keeps the original first-appearance ordering of the result.
mean_by_party = new_df.groupby('Parliament Group Name')['Total Revenue'].mean()
d = dict()
for party in parties:
    if pd.isna(party):
        continue  # rows without a party name
    average = mean_by_party.get(party)  # None when the party has no rows in that year
    if pd.notna(average):  # also skips parties whose revenue is entirely NaN
        d[party] = floor(average)

# 3d. Reinspect the political parties with data for total revenue (not NaN values)
num_parties = len(d)
parties = list(d.keys())
print(f"There are {num_parties} parties with (non-NaN) data to compare:\n{parties}\n\n")

# 3e. Sort the dictionary 'd' by values (highest average revenue first)
d_sorted = sorted(d.items(), key=lambda item: item[1], reverse=True)
d = dict(d_sorted)
for key, value in d.items():
    print(f"{key} ----> {value}")


# 3f. Now, I am interested in political parties of Greek parliament - I will display them with abbreviations
parties_of_interest = ['Ελληνική Λύση', 'Συνασπισμός Ριζοσπαστικής Αριστεράς', 'Νέα Δημοκρατία', 'ΜέΡΑ25', 'Λαϊκός Σύνδεσμος - Χρυσή Αυγή', 'Το Ποτάμι', 'ΠΑΣΟΚ-Κίνημα Αλλαγής', 'Δημοκρατική Συμπαράταξη (ΠΑ.ΣΟ.Κ. - ΔΗΜ.ΑΡ.)', 'Κομμουνιστικό Κόμμα Ελλάδας', 'Ανεξάρτητοι']
abbreviations = ['Ελλ. Λύση', 'ΣΥΡΙΖΑ', 'ΝΔ', 'ΜέΡΑ25', 'Χρυσή Αυγή', 'Το Ποτάμι', 'ΠΑΣΟΚ-ΚΙΝΑΛ', 'ΔΗΜΑΡ', 'ΚΚΕ', 'Ανεξάρτητοι']
# The two lists are parallel: abbreviations[i] is the short label of parties_of_interest[i].
abbs = dict(zip(parties_of_interest, abbreviations))

# Keep only the parties of interest, keyed by abbreviation; the descending
# revenue order of `d` carries over.
data = {abbs[party]: revenue for party, revenue in d.items() if party in abbs}

# 3g. Plot
plt.figure(figsize=(12, 9))
keys = list(data.keys())
values = list(data.values())
plt.bar(keys, values)
plt.title(f"Average total revenue per party ({year_to_check})")
# Show the chart now; otherwise later plots are drawn onto these same axes.
plt.show()


# 5. HEATMAP
# A heatmap needs numeric cells, so only the numerical columns are used.
# NOTE(review): this plots the raw per-row values; if a correlation heatmap
# was intended, new.corr() would be the input instead - confirm.
new = df[numerical_cols]
print(new.head())
# Draw on a dedicated figure and show it, so the heatmap is neither painted
# over an earlier chart nor overdrawn by the plots that follow.
plt.figure()
sns.heatmap(new)
plt.show()


# 6. Discover the most "rich" year
cols_to_playwith = ['Year of Use', 'Total Revenue']
new_df = df[cols_to_playwith]
# Preview the two-column frame built above (not the heatmap frame `new`).
print(new_df.head(),  '\n\n')

# Mean revenue per year in one pass. groupby sorts the years ascending and
# ignores rows whose year is NaN; dropna() removes years with no revenue data
# at all, for which floor() would otherwise raise ValueError on NaN.
yearly_mean = new_df.groupby('Year of Use')['Total Revenue'].mean().dropna()
years = yearly_mean.index.tolist()
num_years = len(years)
print(f"I will examine {num_years} years: {years}")

avg_list = [floor(average) for average in yearly_mean]
print(avg_list)

# Use a fresh figure so the line is not drawn over an earlier chart.
plt.figure()
plt.plot(years, avg_list)
plt.title("Average revenue vs year (including every political party)")
plt.show()

# Distribution of total revenue over the whole dataset, then split per year.
sns.histplot(data=df, x='Total Revenue', kde=True)
plt.show()
sns.kdeplot(data=df, x='Total Revenue', hue='Year of Use')
plt.show()