# Case Protein 1-9 Violin/Distribution

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm
import math

# Path to the input CSV; replace with the actual file location.
file_path = 'B5 segmentSummary (1).csv'

# Read the CSV file into a Pandas DataFrame
df = pd.read_csv(file_path)

# Print the available column names so the positions chosen below can be checked
print(df.columns)

# Column whose values define the groups to plot (the first column by default)
categorical_column = df.columns[0]  # Change this to the correct column name

# Inclusive range of 0-based column positions to analyze.
# Positions 6..9 select the 7th through 10th columns of the CSV (four columns).
start_column_position = 6
end_column_position = 9

# Fail early with a readable message instead of an IndexError in the middle of
# plotting when the CSV has fewer columns than the range above expects.
if not 0 <= start_column_position <= end_column_position < len(df.columns):
    raise ValueError(
        f'Column positions {start_column_position}-{end_column_position} are '
        f'outside the {len(df.columns)} columns found in {file_path!r}'
    )

# Distinct values of the categorical column, in order of first appearance
categories = df[categorical_column].unique()

# One color per analyzed column so each column keeps the same color on every figure
colors = sns.color_palette('husl', n_colors=end_column_position - start_column_position + 1)

# For each category, draw paginated figures with up to six columns per page:
# the top row holds a violin plot with an overlaid box plot for each column,
# the bottom row a histogram with a fitted normal curve for the same column.
plots_per_page = 6
num_plots = end_column_position - start_column_position + 1
num_pages = math.ceil(num_plots / plots_per_page)  # Number of pages needed per category


def _plot_gaussian_fit(ax, values, title):
    """Draw a histogram of *values* on *ax* with a fitted normal curve.

    Args:
        ax: Matplotlib axes to draw on.
        values: Numeric pandas Series with missing values already removed.
        title: Title text for the axes.

    The curve is skipped when no usable standard deviation exists, and the
    histogram, mean line and legend are skipped when *values* is empty, so a
    degenerate column yields an empty titled panel instead of NaN artifacts.
    """
    mean = values.mean()
    std_dev = values.std()

    # The normal PDF is undefined for a zero or NaN spread (constant data or
    # fewer than two samples), so only draw the curve for a usable fit.
    if np.isfinite(std_dev) and std_dev > 0:
        x_range = np.linspace(values.min(), values.max(), 1000)
        fitted_data = norm.pdf(x_range, mean, std_dev)
        ax.plot(x_range, fitted_data, 'b-', linewidth=2, label='Fitted Gaussian')

    if not values.empty:
        # density=True normalizes the histogram so it shares a scale with the PDF.
        ax.hist(values, bins=30, density=True, alpha=0.7, color='gray', label='Data Histogram')
        ax.axvline(mean, color='red', linestyle='dashed', linewidth=2, label='Mean')
        ax.legend()

    ax.set_xlabel('Values')
    ax.set_ylabel('Frequency')
    ax.set_title(title)


for category in categories:
    category_data = df[df[categorical_column] == category]

    # Nothing to plot when the selection is empty, e.g. for a NaN category,
    # which never compares equal to itself.
    if category_data.empty:
        continue

    for page in range(num_pages):
        fig = plt.figure(figsize=(18, 12))
        first_plot = page * plots_per_page
        last_plot = min(first_plot + plots_per_page, num_plots)

        for i, plot_num in enumerate(range(first_plot, last_plot)):
            col = df.columns[start_column_position + plot_num]

            # Top row: violin plot with a narrow white box plot drawn over it
            ax_violin = fig.add_subplot(2, plots_per_page, i + 1)
            sns.violinplot(data=category_data, y=col, color=colors[plot_num], ax=ax_violin)
            sns.boxplot(data=category_data, y=col, color='white', width=0.2, ax=ax_violin)
            ax_violin.set_title(f'Raincloud Plot\n{category} - {col}')

            # Bottom row: histogram and normal fit, directly below the violin plot
            ax_fit = fig.add_subplot(2, plots_per_page, i + 1 + plots_per_page)
            _plot_gaussian_fit(
                ax_fit,
                category_data[col].dropna(),
                f'Gaussian Fit\n{category} - {col}',
            )

        fig.tight_layout(rect=[0, 0.03, 1, 0.95])  # Leave room at the top for the figure title
        fig.suptitle(f'{category} Plots - Page {page + 1}', fontsize=16)
        plt.show()
        # Release the figure so open figures do not accumulate across pages
        # and categories when show() does not block or destroy them.
        plt.close(fig)