Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import math

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import norm
- # Replace 'B5 segmentSummary (1).csv' with the actual CSV file path
- file_path = 'B5 segmentSummary (1).csv'
- # Read the CSV file into a Pandas DataFrame
- df = pd.read_csv(file_path)
- # Check the available column names
- print(df.columns)
- # Example: Replace 'categorical_column' with the correct column name containing categorical data
- categorical_column = df.columns[0] # Change this to the correct column name
- # Define the range of column positions you want to analyze (columns 6 through 10)
- start_column_position = 6 # Corresponds to column 6
- end_column_position = 9 # Corresponds to column 10
- # Create a list of unique categories in the categorical column
- categories = df[categorical_column].unique()
- # Create a Seaborn color palette for differentiation
- colors = sns.color_palette('husl', n_colors=end_column_position - start_column_position + 1)
- # Iterate through each category and create separate figures with six Raincloud plots and six Gaussian distributions per page
- for category in categories:
- category_data = df[df[categorical_column] == category]
- num_plots = end_column_position - start_column_position + 1
- num_pages = math.ceil(num_plots / 6) # Determine the number of pages needed
- for page in range(num_pages):
- plt.figure(figsize=(18, 12))
- for i in range(6):
- plot_num = page * 6 + i
- if plot_num >= num_plots:
- break
- col = df.columns[start_column_position + plot_num]
- # Raincloud plot
- plt.subplot(2, 6, i + 1)
- sns.violinplot(data=category_data, y=col, color=colors[plot_num])
- sns.boxplot(data=category_data, y=col, color='white', width=0.2)
- plt.title(f'Raincloud Plot\n{category} - {col}')
- # Gaussian distribution plot
- plt.subplot(2, 6, i + 7)
- target_column = category_data[col]
- mean = target_column.mean()
- std_dev = target_column.std()
- x_range = np.linspace(target_column.min(), target_column.max(), 1000)
- fitted_data = norm.pdf(x_range, mean, std_dev)
- plt.plot(x_range, fitted_data, 'b-', linewidth=2, label='Fitted Gaussian')
- plt.hist(target_column, bins=30, density=True, alpha=0.7, color='gray', label='Data Histogram')
- plt.xlabel('Values')
- plt.ylabel('Frequency')
- plt.title(f'Gaussian Fit\n{category} - {col}')
- plt.axvline(mean, color='red', linestyle='dashed', linewidth=2, label='Mean')
- plt.legend()
- plt.tight_layout(rect=[0, 0.03, 1, 0.95]) # Adjust the layout to avoid title overlap
- plt.suptitle(f'{category} Plots - Page {page + 1}', fontsize=16)
- plt.show()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement