mayankjoin3

classification using xai

Oct 10th, 2024
import pandas as pd
import numpy as np
import time
import csv
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from imblearn.over_sampling import SMOTE, RandomOverSampler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, fbeta_score, matthews_corrcoef
from sklearn.metrics import jaccard_score, cohen_kappa_score, hamming_loss, zero_one_loss, mean_absolute_error
from sklearn.metrics import mean_squared_error, balanced_accuracy_score, r2_score

# Import your ML models from sklearn (e.g., DecisionTreeClassifier, etc.)
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# Utility function for recording results
def store_results(algorithm, metrics, csv_file):
    # Append one row per (algorithm, fold) to the results CSV
    with open(csv_file, 'a', newline='') as file:
        writer = csv.writer(file)
        writer.writerow([algorithm] + metrics)

def execute_model(model_func, X_train, y_train, X_test, y_test, algo_name, csv_file):
    try:
        start_train_time = time.time()
        model = model_func()
        model.fit(X_train, y_train)
        train_time = time.time() - start_train_time

        start_predict_time = time.time()
        y_pred = model.predict(X_test)
        predict_time = time.time() - start_predict_time

        # Compute metrics (zero_division=0 avoids warnings when a class is never predicted)
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
        recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
        f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
        fbeta = fbeta_score(y_test, y_pred, beta=1, average='weighted', zero_division=0)
        mcc = matthews_corrcoef(y_test, y_pred)
        jaccard = jaccard_score(y_test, y_pred, average='weighted', zero_division=0)
        kappa = cohen_kappa_score(y_test, y_pred)
        h_loss = hamming_loss(y_test, y_pred)
        z_o_loss = zero_one_loss(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        bal_acc = balanced_accuracy_score(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        # Store results
        metrics = [train_time, predict_time, accuracy, precision, recall, f1, fbeta, mcc, jaccard, kappa, h_loss, z_o_loss, mae, mse, rmse, bal_acc, r2]
        store_results(algo_name, metrics, csv_file)

        print(f'{algo_name}: Train Time: {train_time:.3f}s, Predict Time: {predict_time:.3f}s, Total Time: {train_time + predict_time:.3f}s')

    except Exception as e:
        # On failure, store -1 for all 17 metric slots so rows stay aligned
        metrics = [-1] * 17
        store_results(algo_name, metrics, csv_file)
        print(f'{algo_name} failed: {e}')

# Preprocessing and dataset handling
def preprocess_data(input_file, dataset_percent):
    data = pd.read_csv(input_file)

    # Handle missing numeric values by replacing them with the column median
    data.fillna(data.median(numeric_only=True), inplace=True)

    # Convert categorical data (astype(str) also turns any remaining NaNs into a category)
    for col in data.select_dtypes(include=['object']).columns:
        data[col] = LabelEncoder().fit_transform(data[col].astype(str))

    # Separate features and target (the target is assumed to be the last column)
    X = data.iloc[:, :-1]
    y = data.iloc[:, -1]

    # Balance the dataset; RandomOverSampler is a placeholder for a GAN- or
    # SMOTE-based resampling method
    oversampler = RandomOverSampler()
    X, y = oversampler.fit_resample(X, y)

    # Apply MinMax scaling
    scaler = MinMaxScaler()
    X = scaler.fit_transform(X)

    # Reduce dataset size based on dataset_percent (stratified subsample)
    X, _, y, _ = train_test_split(X, y, train_size=dataset_percent / 100, stratify=y)

    # Return numpy arrays so positional indexing with KFold indices works
    return X, np.asarray(y)

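# A minimal alternative sketch (not in the original paste): the comment in
# preprocess_data names SMOTE as the intended balancing method. This drop-in
# helper uses the SMOTE import above instead of RandomOverSampler. SMOTE
# synthesises minority-class samples by interpolating between nearest
# neighbours, so it needs fully numeric features, which holds here after
# label encoding.
def balance_with_smote(X, y):
    smote = SMOTE(random_state=42)  # random_state is an assumed choice
    return smote.fit_resample(X, y)
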
# Define the classification models (the XAI step itself is sketched separately below)
def decision_tree_model():
    return DecisionTreeClassifier()

def random_forest_model():
    return RandomForestClassifier()

def svm_model():
    return SVC()

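# Hedged XAI sketch: the paste's title says "classification using xai", but the
# script itself never calls an explainer. A common choice (an assumption, not
# part of the original code) is the shap package; TreeExplainer matches the
# decision tree and random forest models above. Requires `pip install shap`.
def explain_tree_model(model, X_sample):
    import shap  # imported lazily so the rest of the script runs without shap
    explainer = shap.TreeExplainer(model)          # exact explainer for tree models
    shap_values = explainer.shap_values(X_sample)  # per-feature attributions
    shap.summary_plot(shap_values, X_sample)       # global feature-importance plot
    return shap_values
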
# Main function to execute k-fold cross-validation
def run_classification(input_file, csv_file, k_fold, dataset_percent):
    X, y = preprocess_data(input_file, dataset_percent)

    # Write a header row so the results CSV is self-describing
    header = ['algorithm', 'train_time', 'predict_time', 'accuracy', 'precision',
              'recall', 'f1', 'fbeta', 'mcc', 'jaccard', 'kappa', 'hamming_loss',
              'zero_one_loss', 'mae', 'mse', 'rmse', 'balanced_accuracy', 'r2']
    with open(csv_file, 'w', newline='') as file:
        csv.writer(file).writerow(header)

    kf = KFold(n_splits=k_fold, shuffle=True)

    # Loop through each fold
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Execute models with try-except blocks
        execute_model(decision_tree_model, X_train, y_train, X_test, y_test, 'Decision Tree', csv_file)
        execute_model(random_forest_model, X_train, y_train, X_test, y_test, 'Random Forest', csv_file)
        execute_model(svm_model, X_train, y_train, X_test, y_test, 'SVM', csv_file)

# Set parameters and run
k_fold = 10  # Number of folds
dataset_percent = 10  # Percentage of dataset to use
csv_file = 'results.csv'  # CSV file to store the results
input_file = 'input.csv'  # Input dataset file

run_classification(input_file, csv_file, k_fold, dataset_percent)

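# Optional follow-up (an addition, not in the original paste): execute_model
# appends one row per fold, so per-algorithm averages can be read back from
# the results CSV using the header row written by run_classification.
results = pd.read_csv(csv_file)
print(results.groupby('algorithm')[['accuracy', 'f1', 'mcc']].mean())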