Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import csv
import time

import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler
from sklearn.base import BaseEstimator
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, fbeta_score, matthews_corrcoef
from sklearn.metrics import jaccard_score, cohen_kappa_score, hamming_loss, zero_one_loss, mean_absolute_error
from sklearn.metrics import mean_squared_error, balanced_accuracy_score, r2_score
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# Import your ML models from sklearn (e.g., DecisionTreeClassifier, etc.)
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
# Utility functions to handle errors and record results
def store_results(algorithm, metrics, csv_file):
    """Append one result row to a CSV file.

    The row consists of ``algorithm`` followed by every value in ``metrics``.
    The file is opened in append mode, so it is created if it does not exist
    and earlier rows are preserved. No header row is written.

    Args:
        algorithm: Label written in the first column.
        metrics: Iterable of values written in the remaining columns
            (a list, tuple, or array all work).
        csv_file: Path of the CSV file to append to.
    """
    # newline='' lets the csv module control line endings itself.
    with open(csv_file, 'a', newline='', encoding='utf-8') as file:
        # Unpacking accepts any iterable, not only a list.
        csv.writer(file).writerow([algorithm, *metrics])
def execute_model(model_func, X_train, y_train, X_test, y_test, k_fold, algo_name, csv_file):
    """Fit one model on a train/test split, score it, and append the result row.

    The row written through ``store_results`` is ``algo_name`` followed by 17
    values, in this order: train time, predict time, accuracy, weighted
    precision, weighted recall, weighted F1, weighted F-beta (beta=1), Matthews
    correlation, weighted Jaccard, Cohen's kappa, Hamming loss, zero-one loss,
    MAE, MSE, RMSE, balanced accuracy, R^2.

    If building, fitting, predicting, or scoring raises, a row of seventeen
    ``-1`` values is written instead and the error is printed; the exception is
    not re-raised, so the remaining models still run.

    Args:
        model_func: Zero-argument callable returning an estimator that exposes
            ``fit`` and ``predict``.
        X_train: Training features.
        y_train: Training labels.
        X_test: Test features.
        y_test: Test labels.
        k_fold: Accepted for interface compatibility; not used in this function.
        algo_name: Label written in the first CSV column and used in console
            messages.
        csv_file: Path of the CSV file results are appended to.
    """
    n_metrics = 17  # length of the metrics row assembled below

    try:
        # perf_counter is monotonic and high-resolution, unlike wall-clock time.time().
        # Model construction is timed together with fitting.
        start_train_time = time.perf_counter()
        model = model_func()
        model.fit(X_train, y_train)
        train_time = time.perf_counter() - start_train_time

        start_predict_time = time.perf_counter()
        y_pred = model.predict(X_test)
        predict_time = time.perf_counter() - start_predict_time

        # NOTE(review): MAE/MSE/RMSE/R^2 treat class labels as numbers; they
        # presumably assume integer-encoded labels -- confirm they are wanted.
        mse = mean_squared_error(y_test, y_pred)
        metrics = [
            train_time,
            predict_time,
            accuracy_score(y_test, y_pred),
            precision_score(y_test, y_pred, average='weighted'),
            recall_score(y_test, y_pred, average='weighted'),
            f1_score(y_test, y_pred, average='weighted'),
            fbeta_score(y_test, y_pred, beta=1, average='weighted'),
            matthews_corrcoef(y_test, y_pred),
            jaccard_score(y_test, y_pred, average='weighted'),
            cohen_kappa_score(y_test, y_pred),
            hamming_loss(y_test, y_pred),
            zero_one_loss(y_test, y_pred),
            mean_absolute_error(y_test, y_pred),
            mse,
            np.sqrt(mse),
            balanced_accuracy_score(y_test, y_pred),
            r2_score(y_test, y_pred),
        ]
    except Exception as e:
        # Best-effort benchmark: record a sentinel row and keep going.
        store_results(algo_name, [-1] * n_metrics, csv_file)
        print(f'{algo_name} failed: {e}')
    else:
        # Recording happens outside the try so that a failure to write the
        # results file is not misreported as a model failure.
        store_results(algo_name, metrics, csv_file)
        print(f'{algo_name}: Train Time: {train_time}s, Predict Time: {predict_time}s, Total Time: {train_time + predict_time}s')
# Preprocessing and dataset handling
def preprocess_data(input_file, dataset_percent, random_state=None):
    """Load a CSV dataset and return a balanced, scaled, subsampled (X, y).

    Steps: read the CSV, impute numeric columns with their median,
    label-encode text columns, take the last column as the target, balance the
    classes by random oversampling, min-max scale the features, and keep a
    stratified ``dataset_percent`` percent of the rows.

    NOTE(review): oversampling and scaling are fitted on the whole dataset
    before any train/test split. Later cross-validation folds therefore share
    duplicated rows and scaling statistics, which can make scores optimistic.

    Args:
        input_file: Path of the CSV file to read. The last column is the target.
        dataset_percent: Percentage of the balanced dataset to keep, in
            ``(0, 100]``.
        random_state: Optional seed passed to the oversampler and the
            subsampling split, for reproducible output.

    Returns:
        Tuple ``(X, y)`` where ``X`` is the scaled feature matrix and ``y``
        holds the matching target values, indexed ``0..n-1``.

    Raises:
        ValueError: If ``dataset_percent`` is not in ``(0, 100]``.
    """
    if not 0 < dataset_percent <= 100:
        raise ValueError(
            f'dataset_percent must be in (0, 100], got {dataset_percent!r}'
        )

    data = pd.read_csv(input_file)

    # Impute numeric columns with their median. numeric_only=True is required
    # because recent pandas raises TypeError when asked for the median of text
    # columns.
    data = data.fillna(data.median(numeric_only=True))

    # Convert categorical data. Casting to str first turns missing values into
    # their own "nan" category and avoids the mixed str/float comparison that
    # LabelEncoder cannot sort.
    for col in data.select_dtypes(include=['object']).columns:
        data[col] = LabelEncoder().fit_transform(data[col].astype(str))

    # Separate features and target
    X = data.iloc[:, :-1]
    y = data.iloc[:, -1]

    # Balance classes by random oversampling.
    # This is a placeholder; replace with a GAN-based resampling method.
    oversampler = RandomOverSampler(random_state=random_state)
    X, y = oversampler.fit_resample(X, y)

    # Apply MinMax scaling
    scaler = MinMaxScaler()
    X = scaler.fit_transform(X)

    # Reduce dataset size based on dataset_percent. train_test_split rejects
    # train_size=1.0, so the full dataset is returned as-is at 100 percent.
    if dataset_percent < 100:
        X, _, y, _ = train_test_split(
            X,
            y,
            train_size=dataset_percent / 100,
            stratify=y,
            random_state=random_state,
        )

    # After the shuffled split a Series keeps its old index labels; reset them
    # so positional indexing of X and label indexing of y agree.
    if isinstance(y, pd.Series):
        y = y.reset_index(drop=True)
    return X, y
# Define your explainable AI models
def decision_tree_model():
    """Return a new DecisionTreeClassifier built with default arguments."""
    return DecisionTreeClassifier()
def random_forest_model():
    """Return a new RandomForestClassifier built with default arguments."""
    return RandomForestClassifier()
def svm_model():
    """Return a new SVC built with default arguments."""
    return SVC()
# Main function to execute k-fold cross-validation
def run_classification(input_file, csv_file, k_fold, dataset_percent):
    """Benchmark every model on each fold of a shuffled k-fold split.

    The dataset is loaded through ``preprocess_data``. For every fold, the
    decision tree, random forest, and SVM factories are each passed to
    ``execute_model``, which appends one row per model per fold to
    ``csv_file``.

    Args:
        input_file: Path of the input CSV dataset.
        csv_file: Path of the CSV file results are appended to.
        k_fold: Number of cross-validation folds.
        dataset_percent: Percentage of the dataset to use.
    """
    X, y = preprocess_data(input_file, dataset_percent)

    # KFold yields positional indices. Plain arrays guarantee positional
    # lookup; a pandas Series indexed with y[idx] would look up labels.
    X = np.asarray(X)
    y = np.asarray(y)

    models = (
        ('Decision Tree', decision_tree_model),
        ('Random Forest', random_forest_model),
        ('SVM', svm_model),
    )

    kf = KFold(n_splits=k_fold, shuffle=True)
    # Loop through each fold
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        # execute_model handles and records per-model failures itself.
        for algo_name, model_func in models:
            execute_model(model_func, X_train, y_train, X_test, y_test, k_fold, algo_name, csv_file)
# Set parameters and run
k_fold = 10  # Number of folds
dataset_percent = 10  # Percentage of dataset to use
csv_file = 'results.csv'  # CSV file to store the results
input_file = 'input.csv'  # Input dataset file

# Runs the full benchmark when this file is executed.
run_classification(input_file, csv_file, k_fold, dataset_percent)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement