Advertisement
makispaiktis

5. Data Analysis Parliament

Jul 22nd, 2024
171
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 6.45 KB | None | 0 0
  1. import matplotlib.pyplot as plt
  2. import seaborn as sns
  3. import pandas as pd
  4. import numpy as np
  5.  
  6.  
  7. # 1. Keeping the columns I need
  8. # 1a. Basics
  9. filepath = "/content/sample_data/parliament.csv"
  10. df = pd.read_csv(filepath)
  11. shape = df.shape
  12. print("----> Original dataset")
  13. print(f"Dataset shape = ({shape[0]} rows-records) x ({shape[1]} columns-features)")
  14. all_columns = list(df.columns)
  15. print("Columns names:", all_columns, '\n\n')
  16.  
  17. # 1b. Drop 2 unnecessary columns
  18. cols_to_drop = ['Profile Image', 'Slug', 'Electoral Districts']
  19. print(f"----> Dropping {len(cols_to_drop)} columns: {cols_to_drop}")
  20. df.drop(columns=cols_to_drop, inplace=True)
  21. shape = df.shape
  22. all_columns = list(df.columns)
  23. print(f"Dataset shape = ({shape[0]} rows-records) x ({shape[1]} columns-features)")
  24. print("Columns names:", all_columns, '\n\n')
  25.  
  26. # 1c. Concatenate 2 columns into 1 (using 'agg' function)
  27. cols_to_concatenate = ['First Name', 'Last Name']
  28. print(f'----> Concatenating {len(cols_to_concatenate)} columns ({cols_to_concatenate}) into 1 single column')
  29. df['Name'] = df[cols_to_concatenate].agg(' '.join, axis=1)
  30. df.drop(columns=cols_to_concatenate, inplace=True)
  31. shape = df.shape
  32. all_columns = list(df.columns)
  33. print(f"Dataset shape = ({shape[0]} rows-records) x ({shape[1]} columns-features)")
  34. print("Columns names:", all_columns, '\n\n')
  35.  
  36. # 1d. Reordering the columns into my desired order
  37. print("----> Reordering the dataframe columns into my desired order")
  38. desired_order = ['Name', 'Current Position', 'Parliament Group Name', 'District', 'Total Revenue', 'Year of Use', 'Parliament Group ID', 'Government Position ID', 'Government Position Name', 'EU', 'ID']
  39. df = df[desired_order]
  40. shape = df.shape
  41. all_columns = list(df.columns)
  42. print(f"Dataset shape = ({shape[0]} rows-records) x ({shape[1]} columns-features)")
  43. print("Columns names:", all_columns, '\n\n')
  44. print(df.head())
  45.  
  46.  
  47.  
  48.  
  49. # 2. Check for missing data
  50. # 2a. Separate columns into numerical and categorical ones
  51. numerical_cols = [cname for cname in df.columns if df[cname].dtype in ['int64', 'float64']]
  52. categorical_cols = [cname for cname in df.columns if df[cname].dtype == "object"]
  53. my_cols = numerical_cols + categorical_cols
  54. print(f"There are {len(numerical_cols)}/{shape[1]} NUMERICAL columns:\nNumerical cols = {numerical_cols}\n")
  55. print(f"There are {len(categorical_cols)}/{shape[1]} CATEGORICAL columns:\nCategorical cols = {categorical_cols}\n")
  56.  
  57. # 2b. Find out in how many columns there are missing data
  58. cols_with_missing = [col for col in df.columns if df[col].isnull().any()]
  59. print(f"There are {len(cols_with_missing)}/{shape[1]} columns WITH MISSING VALUES:\n {cols_with_missing}")
  60.  
  61. # 2c. Separate into numerical and categorical ones with missing data
  62. numerical_with_missing = [col for col in numerical_cols if col in cols_with_missing]
  63. categorical_with_missing = [col for col in categorical_cols if col in cols_with_missing]
  64. selected_with_missing = [col for col in my_cols if col in cols_with_missing]
  65. print(f"Namely: {len(numerical_with_missing)}/{len(numerical_cols)} numerical columns and {len(categorical_with_missing)}/{len(categorical_cols)} categorical columns with missing data")
  66. # print(f"\nThese {len(cols_with_missing)} columns with missing data need IMPUTATION.....")
  67.  
  68.  
  69.  
  70. # 3. Find out the average total revenue for a political party in 2021
  71. from math import floor
  72.  
  73. # 3a. Filtering
  74. year_to_check = 2021
  75. new_df = df[df['Year of Use'] == year_to_check]
  76. print(f"I will check {len(new_df)}/{shape[0]} rows with filter:\nYear of Use = {year_to_check}")
  77.  
  78. # 3b. Count how many political parties there are
  79. num_parties = df['Parliament Group Name'].nunique()
  80. parties = df['Parliament Group Name'].unique()
  81. print(f"There are {num_parties} parties in original dataset\n\n")
  82.  
  83. # 3c. Find the average total revenue - Remove the NaN value
  84. d = dict()
  85. for party in parties:
  86.     new = new_df[new_df['Parliament Group Name'] == party]
  87.     average = new['Total Revenue'].mean()
  88.     if str(party).lower() != "nan" and str(average).lower() != "nan":
  89.         d[party] = floor(average)
  90.  
  91. # 4d. Reinspect the political parties with data for total revenue (not NaN values)
  92. num_parties = len(d)
  93. parties = list(d.keys())
  94. print(f"There are {num_parties} parties with (non-NaN) data to compare:\n{parties}\n\n")
  95.  
  96. # 4e. Sort the dictionary 'd' by values
  97. d_sorted = sorted(d.items(), key=lambda x:x[1], reverse=True)
  98. d = dict(d_sorted)
  99. for key, value in d.items():
  100.     print(f"{key} ----> {value}")
  101.  
  102.  
  103.  
  104.  
  105. # 4f. Now, I am interested in political parties of Greek parliament - I will display them with abbreviations
  106. parties_of_interest = ['Ελληνική Λύση', 'Συνασπισμός Ριζοσπαστικής Αριστεράς', 'Νέα Δημοκρατία', 'ΜέΡΑ25', 'Λαϊκός Σύνδεσμος - Χρυσή Αυγή', 'Το Ποτάμι', 'ΠΑΣΟΚ-Κίνημα Αλλαγής', 'Δημοκρατική Συμπαράταξη (ΠΑ.ΣΟ.Κ. - ΔΗΜ.ΑΡ.)', 'Κομμουνιστικό Κόμμα Ελλάδας', 'Ανεξάρτητοι']
  107. abbreviations = ['Ελλ. Λύση', 'ΣΥΡΙΖΑ', 'ΝΔ', 'ΜέΡΑ25', 'Χρυσή Αυγή', 'Το Ποτάμι', 'ΠΑΣΟΚ-ΚΙΝΑΛ', 'ΔΗΜΑΡ', 'ΚΚΕ', 'Ανεξάρτητοι']
  108. abbs = {parties_of_interest[i] : abbreviations[i] for i in range(len(parties_of_interest))}
  109.  
  110. data = dict()
  111. for key, value in d.items():
  112.     if key in parties_of_interest:
  113.         party_abb = abbs[key]
  114.         revenue = value
  115.         data[party_abb] = revenue
  116.  
  117. # 4g. Plot
  118. plt.figure(figsize=(12, 9))
  119. keys = list(data.keys())
  120. values = list(data.values())
  121. plt.bar(keys, values)
  122.  
  123.  
  124.  
  125. # 5. HEATMAP
  126. # Heatmap is ALL ABOUT numerical data, so I will have to play with numerical cols
  127. new = df[numerical_cols]
  128. print(new.head())
  129. sns.heatmap(new)
  130.  
  131.  
  132.  
  133. # 6. Discover the most "rich" year
  134. cols_to_playwith = ['Year of Use', 'Total Revenue']
  135. new_df = df[cols_to_playwith]
  136. print(new.head(),  '\n\n')
  137.  
  138. num_years = new_df['Year of Use'].nunique()
  139. years = new_df['Year of Use'].unique()
  140. years = sorted(years)
  141. print(f"I will examine {num_years} years: {years}")
  142.  
  143. avg_list = list()
  144. for year in years:
  145.     new = new_df[new_df['Year of Use'] == year]
  146.     average = new['Total Revenue'].mean()
  147.     avg_list.append(floor(average))
  148. print(avg_list)
  149.  
  150. plt.plot(years, avg_list)
  151. plt.title("Average revenue vs year (including every political party)")
  152. plt.show()
  153.  
  154. sns.histplot(data=df, x='Total Revenue', kde=True)
  155. plt.show()
  156. sns.kdeplot(data=df, x='Total Revenue', hue='Year of Use')
  157. plt.show()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement