import os
import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder, label_binarize
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, roc_curve, auc, roc_auc_score, precision_recall_curve,
    average_precision_score, matthews_corrcoef, cohen_kappa_score,
    mean_squared_error, mean_absolute_error, log_loss,
    hamming_loss, jaccard_score, balanced_accuracy_score
)
class UNSWRandomForestClassifier:
    """Random Forest pipeline for the UNSW-NB15 intrusion-detection dataset."""

    def __init__(self, n_estimators=100, random_state=42):
        self.n_estimators = n_estimators
        self.random_state = random_state
        self.model = RandomForestClassifier(
            n_estimators=n_estimators,
            random_state=random_state,
            n_jobs=-1  # use all available CPU cores
        )
    def load_and_preprocess_data(self, file_path):
        """Load the UNSW-NB15 CSV, encode the target, and scale the features."""
        data = pd.read_csv(file_path)
        # Assumption about the file layout: the official UNSW-NB15 splits also
        # carry 'id' and 'attack_cat' columns alongside the binary 'label';
        # drop them if present so they cannot leak into the feature matrix.
        X = data.drop(columns=['label', 'id', 'attack_cat'], errors='ignore')
        y = data['label']
        # One-hot encode categorical columns (proto/service/state in
        # UNSW-NB15) so StandardScaler receives purely numeric input.
        X = pd.get_dummies(X)
        self.feature_names_ = X.columns.tolist()
        le = LabelEncoder()
        y = le.fit_transform(y)
        scaler = StandardScaler()
        X = scaler.fit_transform(X)
        return X, y
    def compute_comprehensive_metrics(self, y_true, y_pred, y_prob, fold_number):
        """Collect global, per-class, ranking, and error metrics for one fold."""
        n_classes = len(np.unique(y_true))
        metrics = {'Fold': fold_number}

        # Global metrics
        metrics.update({
            'Accuracy': accuracy_score(y_true, y_pred),
            'Balanced Accuracy': balanced_accuracy_score(y_true, y_pred),
            'Matthews Correlation Coefficient': matthews_corrcoef(y_true, y_pred),
            'Cohen Kappa Score': cohen_kappa_score(y_true, y_pred)
        })

        # Per-class metrics from one-vs-rest confusion matrices
        for i in range(n_classes):
            y_true_binary = (y_true == i)
            y_pred_binary = (y_pred == i)
            tn, fp, fn, tp = confusion_matrix(y_true_binary, y_pred_binary).ravel()
            metrics[f'Class_{i}_Precision'] = precision_score(y_true_binary, y_pred_binary, zero_division=0)
            metrics[f'Class_{i}_Recall'] = recall_score(y_true_binary, y_pred_binary, zero_division=0)
            metrics[f'Class_{i}_F1_Score'] = f1_score(y_true_binary, y_pred_binary, zero_division=0)
            # Rates derived from the confusion matrix; guard against empty
            # denominators for rare classes.
            metrics[f'Class_{i}_TPR'] = tp / (tp + fn) if (tp + fn) else 0.0
            metrics[f'Class_{i}_TNR'] = tn / (tn + fp) if (tn + fp) else 0.0
            metrics[f'Class_{i}_FPR'] = fp / (fp + tn) if (fp + tn) else 0.0
            metrics[f'Class_{i}_FNR'] = fn / (fn + tp) if (fn + tp) else 0.0
            metrics[f'Class_{i}_PPV'] = tp / (tp + fp) if (tp + fp) else 0.0
            metrics[f'Class_{i}_NPV'] = tn / (tn + fn) if (tn + fn) else 0.0

        # ROC and PR metrics. UNSW-NB15's 'label' target is binary, where
        # roc_auc_score expects the positive-class probabilities; fall back
        # to one-vs-rest macro averaging for a multiclass target.
        if n_classes == 2:
            metrics.update({
                'AUC (Macro)': roc_auc_score(y_true, y_prob[:, 1]),
                'AUC-PR (Macro)': average_precision_score(y_true, y_prob[:, 1])
            })
        else:
            y_true_bin = label_binarize(y_true, classes=np.unique(y_true))
            metrics.update({
                'AUC (Macro)': roc_auc_score(y_true_bin, y_prob, multi_class='ovr', average='macro'),
                'AUC-PR (Macro)': average_precision_score(y_true_bin, y_prob, average='macro')
            })

        # Error metrics (treat encoded labels as numeric predictions)
        metrics.update({
            'Mean Squared Error': mean_squared_error(y_true, y_pred),
            'Mean Absolute Error': mean_absolute_error(y_true, y_pred),
            'Root Mean Squared Error': np.sqrt(mean_squared_error(y_true, y_pred)),
            'Log Loss': log_loss(y_true, y_prob)
        })

        # Other metrics
        metrics.update({
            'Hamming Loss': hamming_loss(y_true, y_pred),
            'Jaccard Score (Macro)': jaccard_score(y_true, y_pred, average='macro')
        })

        return pd.DataFrame([metrics])
    def train_and_evaluate(self, X, y, n_splits=5, output_csv_path='metrics_output.csv'):
        """Run stratified k-fold cross-validation, appending per-fold metrics to a CSV."""
        skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=self.random_state)
        fold_metrics_list = []
        fold_feature_importances = []
        total_training_time = 0
        total_testing_time = 0

        # Create the output directory only when the path actually has one;
        # os.makedirs('') raises for a bare filename.
        output_dir = os.path.dirname(output_csv_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        for fold, (train_index, val_index) in enumerate(skf.split(X, y), 1):
            X_train, X_val = X[train_index], X[val_index]
            y_train, y_val = y[train_index], y[val_index]

            start_train_time = time.time()
            self.model.fit(X_train, y_train)
            training_time = time.time() - start_train_time
            total_training_time += training_time

            start_test_time = time.time()
            y_pred = self.model.predict(X_val)
            y_prob = self.model.predict_proba(X_val)
            testing_time = time.time() - start_test_time
            total_testing_time += testing_time

            fold_metrics = self.compute_comprehensive_metrics(y_val, y_pred, y_prob, fold)
            fold_metrics_list.append(fold_metrics)
            # Append each fold's row; write the header only when the file is new.
            fold_metrics.to_csv(output_csv_path, mode='a',
                                header=not os.path.exists(output_csv_path), index=False)
            fold_feature_importances.append(self.model.feature_importances_)

        all_fold_metrics = pd.concat(fold_metrics_list, ignore_index=True)
        # Average every metric over the folds, excluding the fold index itself.
        avg_metrics = all_fold_metrics.drop(columns='Fold').mean()
        avg_feature_importance = np.mean(fold_feature_importances, axis=0)
        return avg_metrics, avg_feature_importance
    def plot_one_vs_all_auc_roc(self, X, y):
        """Plot one-vs-rest ROC curves on a stratified held-out 30% split."""
        n_classes = len(np.unique(y))
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.3, random_state=42, stratify=y)
        self.model.fit(X_train, y_train)
        y_score = self.model.predict_proba(X_test)

        plt.figure(figsize=(10, 8))
        # Five colors cover up to five classes; extend the list if needed.
        colors = ['blue', 'red', 'green', 'orange', 'purple']
        for i, color in zip(range(n_classes), colors[:n_classes]):
            fpr, tpr, _ = roc_curve(y_test == i, y_score[:, i])
            roc_auc = auc(fpr, tpr)
            plt.plot(fpr, tpr, color=color,
                     label=f'ROC curve (class {i}, AUC = {roc_auc:.2f})')
        plt.plot([0, 1], [0, 1], 'k--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Receiver Operating Characteristic (ROC) - One-vs-All')
        plt.legend(loc="lower right")
        plt.tight_layout()
        plt.show()
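
# --- Hedged addition, not part of the original paste -------------------------
# The script computes averaged feature importances but never visualizes them,
# even though seaborn is imported. This minimal sketch (hypothetical helper
# name) pairs the importances returned by train_and_evaluate with the column
# names recorded during preprocessing; it reuses the script's existing
# numpy/seaborn/matplotlib imports.
def plot_feature_importance(feature_names, importances, top_n=10):
    """Horizontal bar plot of the top_n averaged feature importances."""
    order = np.argsort(importances)[::-1][:top_n]
    plt.figure(figsize=(10, 6))
    sns.barplot(x=np.asarray(importances)[order],
                y=np.asarray(feature_names)[order])
    plt.xlabel('Mean importance across folds')
    plt.title(f'Top {top_n} Random Forest Feature Importances')
    plt.tight_layout()
    plt.show()

# Example usage, after train_and_evaluate in main():
#   plot_feature_importance(feature_names, feature_importance)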
def main():
    # Placeholder paths; point file_path at a local UNSW-NB15 CSV.
    file_path = 'path/to/unsw_nb15_dataset.csv'
    output_csv_path = 'results/metrics_output.csv'

    rf_classifier = UNSWRandomForestClassifier(n_estimators=100)
    X, y = rf_classifier.load_and_preprocess_data(file_path)
    # Column names recorded during preprocessing, aligned with X's columns.
    feature_names = rf_classifier.feature_names_

    avg_metrics, feature_importance = rf_classifier.train_and_evaluate(
        X, y,
        n_splits=5,
        output_csv_path=output_csv_path
    )

    print("Average Metrics across 5-Fold Cross-Validation:")
    for metric, value in avg_metrics.items():
        print(f"{metric}: {value}")

    # Pair the averaged importances with their column names.
    top = np.argsort(feature_importance)[::-1][:10]
    print("\nTop 10 features by mean importance:")
    for idx in top:
        print(f"{feature_names[idx]}: {feature_importance[idx]:.4f}")

    rf_classifier.plot_one_vs_all_auc_roc(X, y)


if __name__ == "__main__":
    main()
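
# --- Hedged addition, not part of the original paste -------------------------
# A synthetic smoke test: exercises the full cross-validation pipeline without
# the UNSW-NB15 CSV by generating a small multiclass dataset with sklearn's
# make_classification. Useful for checking that the metric computation runs
# end to end before pointing main() at real data. Call it manually, e.g. from
# a REPL: smoke_test()
def smoke_test():
    from sklearn.datasets import make_classification
    X, y = make_classification(n_samples=600, n_features=20,
                               n_informative=10, n_classes=3,
                               random_state=0)
    clf = UNSWRandomForestClassifier(n_estimators=50)
    avg_metrics, _ = clf.train_and_evaluate(
        X, y, n_splits=3, output_csv_path='smoke_metrics.csv')
    print(avg_metrics)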