Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
from math import floor

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# 1. Keeping the columns I need


def _report_shape(frame):
    """Print the row/column counts and column names of *frame*; return its shape."""
    n_rows, n_cols = frame.shape
    print(f"Dataset shape = ({n_rows} rows-records) x ({n_cols} columns-features)")
    print("Columns names:", list(frame.columns), '\n\n')
    return frame.shape


# 1a. Basics: load the CSV and report what was read
filepath = "/content/sample_data/parliament.csv"
df = pd.read_csv(filepath)
print("----> Original dataset")
shape = _report_shape(df)
all_columns = list(df.columns)

# 1b. Drop the unnecessary columns
cols_to_drop = ['Profile Image', 'Slug', 'Electoral Districts']
print(f"----> Dropping {len(cols_to_drop)} columns: {cols_to_drop}")
df = df.drop(columns=cols_to_drop)
shape = _report_shape(df)
all_columns = list(df.columns)

# 1c. Concatenate 2 columns into 1 (using 'agg' function)
cols_to_concatenate = ['First Name', 'Last Name']
print(f'----> Concatenating {len(cols_to_concatenate)} columns ({cols_to_concatenate}) into 1 single column')
# str.join raises TypeError on a missing (NaN) or non-string name part, so blank
# out the gaps and force text before joining each row.
name_parts = df[cols_to_concatenate].fillna('').astype(str)
df['Name'] = name_parts.agg(' '.join, axis=1)
df = df.drop(columns=cols_to_concatenate)
shape = _report_shape(df)
all_columns = list(df.columns)

# 1d. Reordering the columns into my desired order
print("----> Reordering the dataframe columns into my desired order")
desired_order = [
    'Name',
    'Current Position',
    'Parliament Group Name',
    'District',
    'Total Revenue',
    'Year of Use',
    'Parliament Group ID',
    'Government Position ID',
    'Government Position Name',
    'EU',
    'ID',
]
df = df[desired_order]
# `shape` is reused by the later sections for their "x/total" messages.
shape = _report_shape(df)
all_columns = list(df.columns)
print(df.head())
# 2. Check for missing data
# 2a. Split the columns by dtype: 64-bit numeric ones vs. object (text) ones
column_dtypes = list(df.dtypes.items())
numerical_cols = [name for name, dtype in column_dtypes if dtype in ('int64', 'float64')]
categorical_cols = [name for name, dtype in column_dtypes if dtype == "object"]
my_cols = numerical_cols + categorical_cols
print(f"There are {len(numerical_cols)}/{shape[1]} NUMERICAL columns:\nNumerical cols = {numerical_cols}\n")
print(f"There are {len(categorical_cols)}/{shape[1]} CATEGORICAL columns:\nCategorical cols = {categorical_cols}\n")

# 2b. Columns holding at least one missing value (kept in dataframe order)
has_missing = df.isnull().any()
cols_with_missing = df.columns[has_missing].tolist()
print(f"There are {len(cols_with_missing)}/{shape[1]} columns WITH MISSING VALUES:\n {cols_with_missing}")

# 2c. The same inventory, split into numerical and categorical columns
missing_lookup = set(cols_with_missing)
numerical_with_missing = [name for name in numerical_cols if name in missing_lookup]
categorical_with_missing = [name for name in categorical_cols if name in missing_lookup]
selected_with_missing = [name for name in my_cols if name in missing_lookup]
print(f"Namely: {len(numerical_with_missing)}/{len(numerical_cols)} numerical columns and {len(categorical_with_missing)}/{len(categorical_cols)} categorical columns with missing data")
# NOTE: the columns listed in `cols_with_missing` are the candidates for imputation.
# 3. Find out the average total revenue for a political party in 2021
# 3a. Filtering
year_to_check = 2021
new_df = df[df['Year of Use'] == year_to_check]
print(f"I will check {len(new_df)}/{shape[0]} rows with filter:\nYear of Use = {year_to_check}")

# 3b. Count how many political parties there are (nunique ignores missing names)
num_parties = df['Parliament Group Name'].nunique()
parties = df['Parliament Group Name'].unique()
print(f"There are {num_parties} parties in original dataset\n\n")

# 3c. Find the average total revenue per party for the selected year.
# Rows without a party name and parties without any revenue figure are skipped;
# pd.isna / pd.notna test for missing values directly instead of comparing text to "nan".
d = dict()
for party in parties:
    if pd.isna(party):
        continue
    party_rows = new_df['Parliament Group Name'] == party
    average = new_df.loc[party_rows, 'Total Revenue'].mean()
    if pd.notna(average):
        d[party] = floor(average)

# 3d. Reinspect the political parties with data for total revenue (not NaN values)
num_parties = len(d)
parties = list(d.keys())
print(f"There are {num_parties} parties with (non-NaN) data to compare:\n{parties}\n\n")

# 3e. Sort the dictionary 'd' by values, richest party first
d_sorted = sorted(d.items(), key=lambda x: x[1], reverse=True)
d = dict(d_sorted)
for key, value in d.items():
    print(f"{key} ----> {value}")

# 3f. Now, I am interested in political parties of Greek parliament - I will display them with abbreviations
parties_of_interest = ['Ελληνική Λύση', 'Συνασπισμός Ριζοσπαστικής Αριστεράς', 'Νέα Δημοκρατία', 'ΜέΡΑ25', 'Λαϊκός Σύνδεσμος - Χρυσή Αυγή', 'Το Ποτάμι', 'ΠΑΣΟΚ-Κίνημα Αλλαγής', 'Δημοκρατική Συμπαράταξη (ΠΑ.ΣΟ.Κ. - ΔΗΜ.ΑΡ.)', 'Κομμουνιστικό Κόμμα Ελλάδας', 'Ανεξάρτητοι']
abbreviations = ['Ελλ. Λύση', 'ΣΥΡΙΖΑ', 'ΝΔ', 'ΜέΡΑ25', 'Χρυσή Αυγή', 'Το Ποτάμι', 'ΠΑΣΟΚ-ΚΙΝΑΛ', 'ΔΗΜΑΡ', 'ΚΚΕ', 'Ανεξάρτητοι']
abbs = dict(zip(parties_of_interest, abbreviations))
# Keep the revenue ordering of `d`, relabelled with the short party names.
data = {abbs[party]: revenue for party, revenue in d.items() if party in abbs}

# 3g. Plot
plt.figure(figsize=(12, 9))
keys = list(data.keys())
values = list(data.values())
plt.bar(keys, values)
# Flush the bar chart now; otherwise later plots are drawn onto these same axes.
plt.show()
# 5. HEATMAP
# A heatmap needs numeric cells, so only the numerical columns are passed in.
new = df[numerical_cols]
print(new.head())
# Draw on a dedicated figure and flush it, so the heatmap neither lands on a
# previously drawn plot nor gets overdrawn by the next one.
plt.figure()
sns.heatmap(new)
plt.show()
# 6. Discover the most "rich" year
cols_to_playwith = ['Year of Use', 'Total Revenue']
new_df = df[cols_to_playwith]
# Preview the two-column frame built above (not the heatmap frame `new`).
print(new_df.head(), '\n\n')

# Average revenue per year. groupby sorts the years ascending and ignores rows
# whose year is missing; dropna removes years with no revenue figure at all,
# because floor() raises ValueError on NaN.
yearly_avg = new_df.groupby('Year of Use')['Total Revenue'].mean().dropna()
years = yearly_avg.index.tolist()
num_years = len(years)
print(f"I will examine {num_years} years: {years}")
avg_list = [floor(average) for average in yearly_avg]
print(avg_list)

# Start a fresh figure so the line is not drawn over an earlier plot.
plt.figure()
plt.plot(years, avg_list)
plt.title("Average revenue vs year (including every political party)")
plt.show()

# Distribution of total revenue: overall, then split by year.
sns.histplot(data=df, x='Total Revenue', kde=True)
plt.show()
sns.kdeplot(data=df, x='Total Revenue', hue='Year of Use')
plt.show()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement