Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Path of the CSV dataset to benchmark; change it to point at your own file.
# The result-file names further down are derived from this value.
input_file = "CIC_IOT_2023_combined.csv"
- import os
- os.environ["CUDA_VISIBLE_DEVICES"] = "-1" # Disable GPU
- import pandas as pd
- import numpy as np
- import time
- from sklearn.model_selection import train_test_split, KFold
- from sklearn.preprocessing import LabelEncoder, MinMaxScaler
- from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, fbeta_score, matthews_corrcoef
- from sklearn.metrics import jaccard_score, cohen_kappa_score, hamming_loss, zero_one_loss, mean_absolute_error
- from sklearn.metrics import mean_squared_error, r2_score, balanced_accuracy_score
- from sklearn.tree import DecisionTreeClassifier
- from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier, StackingClassifier
- from sklearn.neighbors import KNeighborsClassifier
- from sklearn.svm import SVC, OneClassSVM
- from sklearn.naive_bayes import GaussianNB
- from sklearn.linear_model import LogisticRegression, SGDClassifier, Perceptron, ElasticNet
- from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
- from sklearn.neural_network import MLPClassifier
- from xgboost import XGBClassifier
- from lightgbm import LGBMClassifier
- from catboost import CatBoostClassifier
- from sklearn.cluster import KMeans, AgglomerativeClustering
- from sklearn.gaussian_process import GaussianProcessClassifier
- from sklearn.neighbors import NearestCentroid
- from sklearn.mixture import GaussianMixture
- from sklearn.ensemble import IsolationForest
- from sklearn.pipeline import Pipeline
- from sklearn.neural_network import BernoulliRBM
- from keras.models import Sequential
- from keras.layers import Dense, Conv2D, MaxPooling2D, Flatten, LSTM, GRU, SimpleRNN
- from keras.utils import to_categorical
- import traceback
- import csv
- import warnings
- from collections import defaultdict
- from sklearn.semi_supervised import SelfTrainingClassifier
# Custom imports (optional extras, currently disabled)
# from pgmpy.models import BayesianNetwork  # For Bayesian Networks
# import geneticalgorithm as ga  # For Genetic Algorithm-based Classifier (hypothetical)
# Add fuzzy logic and CRF imports if you have specific packages
# (e.g., `python-crfsuite` for CRF)

warnings.filterwarnings("ignore")  # Suppress warnings for cleaner output

# Initialize dataset parameters
k_fold = 5  # Number of cross-validation folds; change as needed
dataset_percent = 100  # Percentage of the dataset to use; change as needed

# Derive the result-file names from the input file's stem.  Using splitext
# (instead of str.replace('.csv', ...)) guarantees the outputs never resolve
# to the input path itself when the name has no lowercase '.csv' suffix.
_input_stem = os.path.splitext(input_file)[0]
output_file = _input_stem + '_results.csv'
class_metrics_file = _input_stem + '_class_results.csv'

# Metric columns shared by the overall and the per-class result files.
_metric_columns = [
    'Accuracy',
    'Precision',
    'Recall',
    'F1 Score',
    'Fbeta Score',
    'Matthews Correlation Coefficient',
    'Jaccard Score',
    'Cohen Kappa Score',
    'Hamming Loss',
    'Zero One Loss',
    'Mean Absolute Error',
    'Mean Squared Error',
    'Root Mean Squared Error',
    'Balanced Accuracy',
    'R2 Score',
]

# Header of the overall results CSV (one row per algorithm and fold).
csv_columns = ['Algorithm', 'Fold', 'Train Time (s)', 'Test Time (s)'] + _metric_columns

# Header of the per-class results CSV (one row per algorithm, fold and class).
class_metrics_columns = ['Algorithm', 'Fold', 'Class'] + _metric_columns
def compute_classwise_metrics(y_true, y_pred):
    """Compute per-class metrics for every class present in ``y_true``.

    Precision, recall, F1, F-beta and Jaccard are computed on the original
    labels restricted to the class via ``labels=[class_index]``; the other
    metrics are computed on binary "is this class" indicator vectors.

    Args:
        y_true: Encoded true labels (array-like of integer class indices).
        y_pred: Encoded predicted labels (array-like, same length).

    Returns:
        A ``defaultdict`` mapping each class name (looked up in the
        module-level ``class_names`` by encoded index) to a dict of
        metric name -> value rounded to 3 decimal places.
    """
    # Coerce to arrays so that list inputs support element-wise comparison.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    class_metrics = defaultdict(dict)
    for class_index in np.unique(y_true):
        true_class_name = class_names[class_index]
        y_true_class = (y_true == class_index).astype(int)
        y_pred_class = (y_pred == class_index).astype(int)
        only_this_class = {'labels': [class_index], 'average': 'weighted', 'zero_division': 1}

        mse = mean_squared_error(y_true_class, y_pred_class)
        raw_scores = {
            'Accuracy': accuracy_score(y_true_class, y_pred_class),
            'Precision': precision_score(y_true, y_pred, **only_this_class),
            'Recall': recall_score(y_true, y_pred, **only_this_class),
            'F1 Score': f1_score(y_true, y_pred, **only_this_class),
            'Fbeta Score': fbeta_score(y_true, y_pred, beta=1.0, **only_this_class),
            'Matthews Correlation Coefficient': matthews_corrcoef(y_true_class, y_pred_class),
            'Jaccard Score': jaccard_score(y_true, y_pred, **only_this_class),
            'Cohen Kappa Score': cohen_kappa_score(y_true_class, y_pred_class),
            'Hamming Loss': hamming_loss(y_true_class, y_pred_class),
            'Zero One Loss': zero_one_loss(y_true_class, y_pred_class),
            'Mean Absolute Error': mean_absolute_error(y_true_class, y_pred_class),
            'Mean Squared Error': mse,
            # Take the root of the unrounded MSE; rooting the rounded value
            # would turn small errors (MSE < 0.0005) into an RMSE of 0.
            'Root Mean Squared Error': np.sqrt(mse),
            'Balanced Accuracy': balanced_accuracy_score(y_true_class, y_pred_class),
            'R2 Score': r2_score(y_true_class, y_pred_class),
        }
        # Round once, at the very end, so no metric is derived from a rounded one.
        class_metrics[true_class_name] = {
            metric_name: round(value, 3) for metric_name, value in raw_scores.items()
        }
    return class_metrics
# Function to handle metric calculation
def compute_metrics(y_true, y_pred):
    """Return the overall evaluation metrics for one set of predictions.

    Multi-class scores (precision, recall, F1, F-beta, Jaccard) use
    weighted averaging with ``zero_division=1``.

    Args:
        y_true: True labels.
        y_pred: Predicted labels.

    Returns:
        A dict mapping metric name to its (unrounded) value.
    """
    weighted = {'average': 'weighted', 'zero_division': 1}
    results = {
        'Accuracy': accuracy_score(y_true, y_pred),
        'Precision': precision_score(y_true, y_pred, **weighted),
        'Recall': recall_score(y_true, y_pred, **weighted),
        'F1 Score': f1_score(y_true, y_pred, **weighted),
        'Fbeta Score': fbeta_score(y_true, y_pred, beta=1.0, **weighted),
        'Matthews Correlation Coefficient': matthews_corrcoef(y_true, y_pred),
        'Jaccard Score': jaccard_score(y_true, y_pred, **weighted),
        'Cohen Kappa Score': cohen_kappa_score(y_true, y_pred),
        'Hamming Loss': hamming_loss(y_true, y_pred),
        'Zero One Loss': zero_one_loss(y_true, y_pred),
        'Mean Absolute Error': mean_absolute_error(y_true, y_pred),
        'Mean Squared Error': mean_squared_error(y_true, y_pred),
    }
    # RMSE is derived from the MSE that was just stored.
    results['Root Mean Squared Error'] = np.sqrt(results['Mean Squared Error'])
    results['Balanced Accuracy'] = balanced_accuracy_score(y_true, y_pred)
    results['R2 Score'] = r2_score(y_true, y_pred)
    return results
# Function to handle each algorithm execution
def run_algorithm(algo_name, model, X_train, y_train, X_test, y_test, fold):
    """Fit one model on a fold, score it, and append the scores to the CSVs.

    One row is appended to ``output_file`` and, for classifiers, one row per
    class is appended to ``class_metrics_file``.  Metrics that were not
    computed are written as -1.  Any exception is caught: the traceback is
    printed and a row of -1 values is appended to ``output_file`` so the
    remaining algorithms still run.

    Args:
        algo_name: Name written to the 'Algorithm' column; the name
            'ElasticNet' selects regression metrics instead of
            classification metrics.
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train: Training features.
        y_train: Training labels.
        X_test: Test features.
        y_test: Test labels.
        fold: Fold number written to the 'Fold' column.
    """
    try:
        # perf_counter is monotonic, so the measured durations cannot be
        # distorted by system clock adjustments the way time.time() can.
        start_train = time.perf_counter()
        model.fit(X_train, y_train)
        train_time = time.perf_counter() - start_train

        start_test = time.perf_counter()
        y_pred = model.predict(X_test)
        test_time = time.perf_counter() - start_test

        # Some estimators may return predictions as an (n, 1) column;
        # flatten it so it lines up element-wise with the 1-D labels.
        y_pred = np.asarray(y_pred)
        if y_pred.ndim == 2 and y_pred.shape[1] == 1:
            y_pred = y_pred.ravel()

        if algo_name == 'ElasticNet':  # Handle ElasticNet as a regression model
            mse = mean_squared_error(y_test, y_pred)
            metrics = {
                'Mean Absolute Error': mean_absolute_error(y_test, y_pred),
                'Mean Squared Error': mse,
                'Root Mean Squared Error': np.sqrt(mse),
                'R2 Score': r2_score(y_test, y_pred),
            }
            # Continuous predictions cannot be scored per class, so no
            # per-class rows are written for the regression model.
            class_metrics = {}
        else:
            metrics = compute_metrics(y_test, y_pred)
            class_metrics = compute_classwise_metrics(y_test, y_pred)
        metrics.update({'Train Time (s)': train_time, 'Test Time (s)': test_time})

        # Log overall results; columns after 'Algorithm' and 'Fold' come from `metrics`.
        with open(output_file, 'a', newline='') as f:
            writer = csv.writer(f)
            writer.writerow([algo_name, fold] + [metrics.get(m, -1) for m in csv_columns[2:]])

        # Log per-class results; columns after 'Algorithm', 'Fold', 'Class' come from each class dict.
        with open(class_metrics_file, 'a', newline='') as f:
            writer = csv.writer(f)
            for class_label, cm in class_metrics.items():
                writer.writerow([algo_name, fold, class_label] + [cm.get(m, -1) for m in class_metrics_columns[3:]])

        print(f"{algo_name} | Fold: {fold} | Train Time: {train_time:.2f}s | Test Time: {test_time:.2f}s")
    except Exception:
        # Benchmark boundary: report the failure and keep going with the next model.
        print(f"Error in {algo_name}: {traceback.format_exc()}")
        # Log error case
        with open(output_file, 'a', newline='') as f:
            writer = csv.writer(f)
            writer.writerow([algo_name, fold] + [-1 for _ in csv_columns[2:]])
# Load dataset: every column except the last is a feature, the last is the label.
df = pd.read_csv(input_file)
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Encode categorical features (one-hot) and map the labels to integer codes.
X = pd.get_dummies(X)
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)
class_names = label_encoder.classes_  # encoded index -> original label

# Min-Max scaling
# NOTE(review): the scaler is fitted on the whole dataset before the folds
# are split, so test-fold value ranges influence the training features.
# Consider fitting it per fold on the training split only.
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

# Take a subset of the dataset if dataset_percent is less than 100.
# The "test" part of the stratified split is kept as the working subset;
# the fixed seed makes the subset reproducible, like the K-Fold split below.
if dataset_percent < 100:
    _, X, _, y = train_test_split(
        X, y, test_size=dataset_percent / 100, stratify=y, random_state=42
    )

# Prepare CSV headers if the result files are not present yet
# (existing files are appended to by later runs).
for _path, _header in ((output_file, csv_columns), (class_metrics_file, class_metrics_columns)):
    if not os.path.exists(_path):
        with open(_path, 'w', newline='') as f:
            csv.writer(f).writerow(_header)

# K-Fold Cross Validation
kf = KFold(n_splits=k_fold, shuffle=True, random_state=42)
# Deep Learning (CNN, RNN, LSTM, GRU, Autoencoders)
def create_cnn(input_shape, num_classes=10):
    """Build and compile a small convolutional classifier.

    Architecture: Conv2D(32, 3x3, relu) -> MaxPooling2D(2x2) -> Flatten ->
    Dense(128, relu) -> Dense(num_classes, softmax).

    Args:
        input_shape: Shape of one input sample, passed to the Conv2D layer.
        num_classes: Number of units in the softmax output layer.
            Defaults to 10, the previously hard-coded value.

    Returns:
        The model, compiled with Adam and categorical cross-entropy.
    """
    model = Sequential([
        Conv2D(32, (3, 3), activation='relu', input_shape=input_shape),
        MaxPooling2D(pool_size=(2, 2)),
        Flatten(),
        Dense(128, activation='relu'),
        Dense(num_classes, activation='softmax'),
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model
def create_rnn(input_shape, num_classes=10):
    """Build and compile a simple recurrent classifier.

    Architecture: SimpleRNN(128, return_sequences=True) -> Flatten ->
    Dense(num_classes, softmax).

    Args:
        input_shape: Shape of one input sample, passed to the SimpleRNN layer.
        num_classes: Number of units in the softmax output layer.
            Defaults to 10, the previously hard-coded value.

    Returns:
        The model, compiled with Adam and categorical cross-entropy.
    """
    model = Sequential([
        SimpleRNN(128, input_shape=input_shape, return_sequences=True),
        Flatten(),
        Dense(num_classes, activation='softmax'),
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model
def create_lstm(input_shape, num_classes=10):
    """Build and compile an LSTM classifier.

    Architecture: LSTM(128, return_sequences=True) -> Flatten ->
    Dense(num_classes, softmax).

    Args:
        input_shape: Shape of one input sample, passed to the LSTM layer.
        num_classes: Number of units in the softmax output layer.
            Defaults to 10, the previously hard-coded value.

    Returns:
        The model, compiled with Adam and categorical cross-entropy.
    """
    model = Sequential([
        LSTM(128, input_shape=input_shape, return_sequences=True),
        Flatten(),
        Dense(num_classes, activation='softmax'),
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model
def create_gru(input_shape, num_classes=10):
    """Build and compile a GRU classifier.

    Architecture: GRU(128, return_sequences=True) -> Flatten ->
    Dense(num_classes, softmax).

    Args:
        input_shape: Shape of one input sample, passed to the GRU layer.
        num_classes: Number of units in the softmax output layer.
            Defaults to 10, the previously hard-coded value.

    Returns:
        The model, compiled with Adam and categorical cross-entropy.
    """
    model = Sequential([
        GRU(128, input_shape=input_shape, return_sequences=True),
        Flatten(),
        Dense(num_classes, activation='softmax'),
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model
def create_autoencoder(input_shape):
    """Build and compile a single-hidden-layer dense autoencoder.

    A 128-unit ReLU layer is followed by a sigmoid layer whose width is
    ``input_shape[0]``, so the output has the same size as the input vector.

    Args:
        input_shape: Shape tuple of one input sample; its first entry
            sets the width of the output layer.

    Returns:
        The model, compiled with Adam and mean-squared-error loss.
    """
    output_width = input_shape[0]
    autoencoder = Sequential([
        Dense(128, input_shape=input_shape, activation='relu'),
        Dense(output_width, activation='sigmoid'),
    ])
    autoencoder.compile(optimizer='adam', loss='mse')
    return autoencoder
# List of algorithms
# Maps the name written to the 'Algorithm' column to an unfitted estimator.
# Entries run in insertion order within every fold; commented-out entries
# are disabled and can be re-enabled by uncommenting them.
# NOTE(review): 'K-Means' and 'Isolation Forest' are not supervised
# classifiers -- confirm that their predict() output is comparable to the
# encoded labels before trusting their scores.
algorithms = {
    'Naive Bayes': GaussianNB(),
    'LDA': LinearDiscriminantAnalysis(),
    'QDA': QuadraticDiscriminantAnalysis(),
    # 'SVM': SVC(kernel='linear', max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(),
    # 'SGD Classifier': SGDClassifier(),
    'KNN': KNeighborsClassifier(),
    # 'ElasticNet': ElasticNet(),
    # 'Perceptron': Perceptron(),
    'Logistic Regression': LogisticRegression(),
    'Bagging': BaggingClassifier(),
    'K-Means': KMeans(n_clusters=3),
    'Nearest Centroid Classifier': NearestCentroid(),
    'XGBoost': XGBClassifier(),
    'AdaBoost': AdaBoostClassifier(),
    # 'RNN': create_rnn((28, 28)),
    'RBM + Logistic Regression': Pipeline(
        steps=[
            ('rbm', BernoulliRBM(n_components=100, learning_rate=0.06, n_iter=10, random_state=42)),
            ('logistic', LogisticRegression()),
        ]
    ),
    'Voting Classifier': VotingClassifier(
        estimators=[
            ('lr', LogisticRegression()),
            ('rf', RandomForestClassifier()),
            ('gnb', GaussianNB()),
        ],
        voting='hard',
    ),
    'Random Forest': RandomForestClassifier(n_estimators=10),
    # 'Gradient Boosting': GradientBoostingClassifier(n_estimators=10),
    'Stacking Classifier': StackingClassifier(
        estimators=[
            ('log_reg', LogisticRegression()),
            ('knn', KNeighborsClassifier(n_neighbors=3)),
        ],
        final_estimator=LogisticRegression(),
        n_jobs=-1,
    ),
    # 'MLP Classifier': MLPClassifier(),
    # 'GRU': create_gru((28, 28)),
    # 'LSTM': create_lstm((28, 28)),
    # 'CNN': create_cnn((28, 28, 1)),
    # 'Autoencoder': create_autoencoder((28,)),
    'LightGBM': LGBMClassifier(),
    'CatBoost': CatBoostClassifier(),
    'Self-Training': SelfTrainingClassifier(LogisticRegression()),
    'Isolation Forest': IsolationForest(),
    # 'One-Class SVM': OneClassSVM(kernel='linear', max_iter=1000),
    # 'Deep Belief Network': "Implement DBN",  # Placeholder for DBN
    # 'Restricted Boltzmann Machine': "Implement RBM",  # Placeholder for RBM
    # 'Genetic Algorithm': ga.GeneticAlgorithm(),  # Placeholder for Genetic Algorithm-based
    # 'Bayesian Network': BayesianNetwork([('A', 'B'), ('B', 'C')]),  # Example Bayesian Network
    # 'Fuzzy Logic': "Implement Fuzzy Logic",  # Placeholder for Fuzzy Logic systems
    # 'Conditional Random Field (CRF)': "Implement CRF",  # Placeholder for CRF
}
# Running algorithms in k-fold
# Every registered algorithm is fitted and scored once per fold; fold
# numbers start at 1.
for fold, (train_idx, test_idx) in enumerate(kf.split(X), start=1):
    X_train = X[train_idx]
    y_train = y[train_idx]
    X_test = X[test_idx]
    y_test = y[test_idx]
    # NOTE: the deep-learning models (CNN/RNN/LSTM/GRU/Autoencoder) are
    # disabled.  Their former special case reshaped the features with
    # reshape(-1, 28, 28, 1) and one-hot encoded the labels with
    # to_categorical(..., num_classes=10) before calling run_algorithm.
    for algo_name, model in algorithms.items():
        run_algorithm(
            algo_name,
            model,
            X_train,
            y_train,
            X_test,
            y_test,
            fold,
        )
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement