Advertisement
mayankjoin3

ml_code_add_algos

Mar 10th, 2025
214
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 6.36 KB | None | 0 0
  1. import numpy as np
  2. import pandas as pd
  3. import seaborn as sns
  4. import matplotlib.pyplot as plt
  5. from sklearn.model_selection import train_test_split, KFold
  6. from sklearn.tree import DecisionTreeClassifier
  7. from sklearn.neighbors import KNeighborsClassifier
  8. from sklearn.linear_model import LogisticRegression
  9. from imblearn.under_sampling import RandomUnderSampler
  10. from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
  11.                              balanced_accuracy_score, confusion_matrix,
  12.                              matthews_corrcoef, cohen_kappa_score, log_loss,
  13.                              mean_squared_error, mean_absolute_error, r2_score)
  14. import time
  15. import os
  16.  
  17. # Set the number of K folds as a global variable
  18. K_FOLDS = 2
  19.  
  20. # Read the dataset from CSV file
  21. df = pd.read_csv('data.csv')
  22.  
  23. # Take 20% of the data
  24. df = df.sample(frac=0.2, random_state=42)
  25.  
  26. # Rename the last column as 'label'
  27. df.rename(columns={df.columns[-1]: 'label'}, inplace=True)
  28.  
  29. X = df.drop(columns=['label']).values
  30. y = df['label'].values
  31.  
  32. timing_results = []
  33.  
  34.  
  35. # Define classifiers and metrics
  36. classifiers = {
  37.     'DecisionTree': DecisionTreeClassifier(),
  38.     'KNN': KNeighborsClassifier(),
  39.     'LogisticRegression': LogisticRegression(),
  40. }
  41. # Store results
  42. results = []
  43.  
  44. kf = KFold(n_splits=K_FOLDS, shuffle=True, random_state=42)
  45.  
  46. # Create a directory to save confusion matrices
  47. os.makedirs("confusion_matrices", exist_ok=True)
  48.  
  49. # Helper function for confusion matrix metrics
  50. def confusion_matrix_metrics(cm, classes):
  51.     metrics = {}
  52.     for idx, class_label in enumerate(classes):
  53.         TP = cm[idx, idx]  # True Positives for this class
  54.         FP = cm[:, idx].sum() - TP  # False Positives for this class
  55.         FN = cm[idx, :].sum() - TP  # False Negatives for this class
  56.         TN = cm.sum() - (TP + FP + FN)  # True Negatives for this class
  57.  
  58.         metrics[class_label] = {
  59.             'TPR': TP / (TP + FN + 1e-10) if (TP + FN) > 0 else 0,
  60.             'TNR': TN / (TN + FP + 1e-10) if (TN + FP) > 0 else 0,
  61.             'FPR': FP / (FP + TN + 1e-10) if (FP + TN) > 0 else 0,
  62.             'FNR': FN / (FN + TP + 1e-10) if (FN + TP) > 0 else 0
  63.         }
  64.     return metrics
  65.  
  66. # Iterate over classifiers
  67. for clf_name, clf in classifiers.items():
  68.     fold_idx = 1
  69.     for train_index, test_index in kf.split(X):
  70.         # Split the data
  71.         X_train, X_test = X[train_index], X[test_index]
  72.         y_train, y_test = y[train_index], y[test_index]
  73.  
  74.         # Record start time
  75.         start_train_time = time.time()
  76.         clf.fit(X_train, y_train)
  77.         train_time = time.time() - start_train_time
  78.  
  79.         start_test_time = time.time()
  80.         y_pred = clf.predict(X_test)
  81.         test_time = time.time() - start_test_time
  82.  
  83.         timing_results.append({
  84.             'Classifier': clf_name,
  85.             'Fold': fold_idx,
  86.             'Training Time (s)': train_time,
  87.             'Testing Time (s)': test_time,
  88.             'Total Time (s)': train_time + test_time
  89.         })
  90.  
  91.         # Compute metrics
  92.         unique_classes = np.unique(y)
  93.         cm = confusion_matrix(y_test, y_pred, labels=unique_classes)
  94.         cm_metrics = confusion_matrix_metrics(cm, unique_classes)
  95.  
  96.         class_metrics_list = []
  97.  
  98.         for class_label in unique_classes:
  99.             class_mask = (y_test == class_label)
  100.             if class_mask.sum() == 0:
  101.                 # Skip classes with no instances in the test set for this fold
  102.                 class_specific_metrics = {
  103.                     'Classifier': clf_name,
  104.                     'Fold': fold_idx,
  105.                     'Class': class_label,
  106.                     'Accuracy': np.nan,
  107.                     'Precision': np.nan,
  108.                     'Recall': np.nan,
  109.                     'F1 Score': np.nan,
  110.                     'Balanced Accuracy': np.nan,
  111.                     'True Positive Rate (TPR)': np.nan,
  112.                     'True Negative Rate (TNR)': np.nan,
  113.                     'False Positive Rate (FPR)': np.nan,
  114.                     'False Negative Rate (FNR)': np.nan,
  115.                     'Training Time (s)': train_time,
  116.                     'Testing Time (s)': test_time
  117.                 }
  118.             else:
  119.                 class_specific_metrics = {
  120.                     'Classifier': clf_name,
  121.                     'Fold': fold_idx,
  122.                     'Class': class_label,
  123.                     'Accuracy': accuracy_score(y_test[class_mask], y_pred[class_mask]) if np.any(class_mask) else np.nan,
  124.                     'Precision': precision_score(y_test[class_mask], y_pred[class_mask], average='weighted', zero_division=0) if np.any(class_mask) else np.nan,
  125.                     'Recall': recall_score(y_test[class_mask], y_pred[class_mask], average='weighted') if np.any(class_mask) else np.nan,
  126.                     'F1 Score': f1_score(y_test[class_mask], y_pred[class_mask], average='weighted') if np.any(class_mask) else np.nan,
  127.                     'Balanced Accuracy': balanced_accuracy_score(y_test[class_mask], y_pred[class_mask]) if np.any(class_mask) else np.nan,
  128.                     'True Positive Rate (TPR)': cm_metrics[class_label]['TPR'],
  129.                     'True Negative Rate (TNR)': cm_metrics[class_label]['TNR'],
  130.                     'False Positive Rate (FPR)': cm_metrics[class_label]['FPR'],
  131.                     'False Negative Rate (FNR)': cm_metrics[class_label]['FNR'],
  132.                     'Training Time (s)': train_time,
  133.                     'Testing Time (s)': test_time
  134.                 }
  135.  
  136.             class_metrics_list.append(class_specific_metrics)
  137.  
  138.         # Plot and save confusion matrix
  139.         plt.figure(figsize=(8, 6))
  140.         sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=unique_classes, yticklabels=unique_classes)
  141.         plt.title(f"{clf_name} - Fold {fold_idx} Confusion Matrix")
  142.         plt.xlabel("Predicted")
  143.         plt.ylabel("True")
  144.         plt.savefig(f"confusion_matrices/{clf_name}_fold_{fold_idx}.png")
  145.         plt.close()
  146.  
  147.         # Append results for this fold
  148.         results.extend(class_metrics_list)
  149.         fold_idx += 1
  150.  
  151. timing_df = pd.DataFrame(timing_results)
  152. timing_df.to_csv("time.csv", index=False)
  153.  
  154. # Create a DataFrame for results
  155. results_df = pd.DataFrame(results)
  156. print("Classification Metrics Across Folds:")
  157. print(results_df)
  158.  
  159. # Save results to CSV
  160. results_df.to_csv("metrics.csv", index=False)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement