# Classification using XAI

import pandas as pd
import numpy as np
import time
import csv
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, fbeta_score, matthews_corrcoef
from sklearn.metrics import jaccard_score, cohen_kappa_score, hamming_loss, zero_one_loss, mean_absolute_error
from sklearn.metrics import mean_squared_error, balanced_accuracy_score, r2_score
from sklearn.base import BaseEstimator
from imblearn.over_sampling import RandomOverSampler

# Import your ML models from sklearn (e.g., DecisionTreeClassifier, etc.)
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# Utility functions for handling errors and recording results
def store_results(algorithm, metrics, csv_file):
    """Append a single result row to ``csv_file``.

    The row is the algorithm name followed by the entries of ``metrics``
    (a list). The file is opened in append mode, so rows written by earlier
    calls are preserved.
    """
    with open(csv_file, 'a', newline='') as handle:
        row = [algorithm] + metrics
        csv.writer(handle).writerow(row)

def execute_model(model_func, X_train, y_train, X_test, y_test, k_fold, algo_name, csv_file):
    """Train one model on a single split, score it, and record the results.

    Args:
        model_func: Zero-argument callable returning an unfitted estimator
            exposing ``fit`` and ``predict``.
        X_train, y_train: Training features and labels.
        X_test, y_test: Held-out features and labels used for scoring.
        k_fold: Not used by this function; kept so existing call sites
            continue to work.
        algo_name: Label written as the first column of the result row and
            used in console output.
        csv_file: Path of the CSV file the result row is appended to.

    On success a row with 17 values is appended: train time, predict time and
    15 scores. If anything raises while building, fitting, predicting or
    scoring, a row of 17 ``-1`` values is appended instead and the error is
    printed; the exception is not re-raised so the remaining models still run.
    """
    # Number of values in a result row; the failure row must match it.
    n_metrics = 17

    try:
        # perf_counter() is a monotonic, high-resolution clock, so measured
        # intervals are not distorted by system clock adjustments.
        start_train_time = time.perf_counter()
        model = model_func()
        model.fit(X_train, y_train)
        train_time = time.perf_counter() - start_train_time

        start_predict_time = time.perf_counter()
        y_pred = model.predict(X_test)
        predict_time = time.perf_counter() - start_predict_time

        # Classification scores (weighted averaging where an average is needed).
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='weighted')
        recall = recall_score(y_test, y_pred, average='weighted')
        f1 = f1_score(y_test, y_pred, average='weighted')
        fbeta = fbeta_score(y_test, y_pred, beta=1, average='weighted')
        mcc = matthews_corrcoef(y_test, y_pred)
        jaccard = jaccard_score(y_test, y_pred, average='weighted')
        kappa = cohen_kappa_score(y_test, y_pred)
        h_loss = hamming_loss(y_test, y_pred)
        z_o_loss = zero_one_loss(y_test, y_pred)

        # Regression-style scores; these treat the labels as numbers.
        mae = mean_absolute_error(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        bal_acc = balanced_accuracy_score(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        # Store results
        metrics = [
            train_time, predict_time, accuracy, precision, recall, f1, fbeta,
            mcc, jaccard, kappa, h_loss, z_o_loss, mae, mse, rmse, bal_acc, r2,
        ]
        store_results(algo_name, metrics, csv_file)

        print(f'{algo_name}: Train Time: {train_time}s, Predict Time: {predict_time}s, Total Time: {train_time + predict_time}s')

    except Exception as e:
        # Deliberate best-effort: record -1 for every metric so the CSV keeps
        # one row per (fold, model) even when a model fails.
        metrics = [-1] * n_metrics
        store_results(algo_name, metrics, csv_file)
        print(f'{algo_name} failed: {e}')

# Preprocessing and dataset handling
def preprocess_data(input_file, dataset_percent):
    """Load a CSV dataset and prepare features and target for classification.

    Steps, in order: fill missing numeric values with the column median,
    label-encode object-typed columns, take the last column as the target,
    balance the classes by random oversampling, scale the features to
    ``[0, 1]``, and finally keep a stratified ``dataset_percent`` percent of
    the rows.

    Args:
        input_file: Path of the CSV file to read.
        dataset_percent: Percentage of the (oversampled) rows to keep;
            must satisfy ``0 < dataset_percent <= 100``.

    Returns:
        A tuple ``(X, y)`` of scaled features and target labels.

    Raises:
        ValueError: If ``dataset_percent`` is outside ``(0, 100]``.
    """
    if not 0 < dataset_percent <= 100:
        raise ValueError(
            f'dataset_percent must be in the range (0, 100], got {dataset_percent}'
        )

    data = pd.read_csv(input_file)

    # Handle missing values by replacing them with the column median.
    # numeric_only=True is required: without it, recent pandas versions raise
    # a TypeError as soon as the frame contains a non-numeric column.
    data.fillna(data.median(numeric_only=True), inplace=True)

    # Convert categorical data
    for col in data.select_dtypes(include=['object']).columns:
        data[col] = LabelEncoder().fit_transform(data[col])

    # Separate features and target (the target is the last column)
    X = data.iloc[:, :-1]
    y = data.iloc[:, -1]

    # Balance the dataset by random oversampling.
    # This is a placeholder; replace with a GAN-based resampling method.
    oversampler = RandomOverSampler()
    X, y = oversampler.fit_resample(X, y)

    # Apply MinMax scaling
    scaler = MinMaxScaler()
    X = scaler.fit_transform(X)

    # Reduce dataset size based on dataset_percent. train_test_split rejects
    # a float train_size of 1.0, so the full dataset is returned as-is.
    if dataset_percent < 100:
        X, _, y, _ = train_test_split(X, y, train_size=dataset_percent / 100, stratify=y)

    return X, y

# Define your explainable AI models
def decision_tree_model():
    """Return a new, unfitted DecisionTreeClassifier with default settings."""
    classifier = DecisionTreeClassifier()
    return classifier

def random_forest_model():
    """Return a new, unfitted RandomForestClassifier with default settings."""
    classifier = RandomForestClassifier()
    return classifier

def svm_model():
    """Return a new, unfitted SVC with default settings."""
    classifier = SVC()
    return classifier

# Main function to execute k-fold cross-validation
def run_classification(input_file, csv_file, k_fold, dataset_percent):
    """Run every model on each fold of a shuffled k-fold split.

    Args:
        input_file: Path of the CSV dataset passed to ``preprocess_data``.
        csv_file: Path of the CSV file that receives one result row per
            (fold, model) pair.
        k_fold: Number of folds for ``KFold``.
        dataset_percent: Percentage of the dataset to use, forwarded to
            ``preprocess_data``.
    """
    X, y = preprocess_data(input_file, dataset_percent)

    # KFold yields positional indices. A pandas Series indexed with an
    # integer array does a label lookup instead, which fails once the rows
    # have been subsampled, so convert both to plain NumPy arrays first.
    X = np.asarray(X)
    y = np.asarray(y)

    kf = KFold(n_splits=k_fold, shuffle=True)

    models = (
        ('Decision Tree', decision_tree_model),
        ('Random Forest', random_forest_model),
        ('SVM', svm_model),
    )

    # Loop through each fold
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # execute_model handles model failures itself, so one failing model
        # does not stop the others.
        for algo_name, model_func in models:
            execute_model(model_func, X_train, y_train, X_test, y_test, k_fold, algo_name, csv_file)

# Run parameters
k_fold = 10  # Number of folds
dataset_percent = 10  # Percentage of dataset to use
csv_file = 'results.csv'  # CSV file to store the results
input_file = 'input.csv'  # Input dataset file

# Only start the run when executed as a script, so importing this module
# does not trigger training as a side effect.
if __name__ == "__main__":
    run_classification(input_file, csv_file, k_fold, dataset_percent)