import time
import csv
import os
import traceback
import warnings

import pandas as pd
import numpy as np
from sklearn.ensemble import (ExtraTreesClassifier, HistGradientBoostingClassifier,
                              RandomForestClassifier, BaggingClassifier,
                              GradientBoostingClassifier, AdaBoostClassifier,
                              VotingClassifier, StackingClassifier, IsolationForest)
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             fbeta_score, matthews_corrcoef, jaccard_score,
                             cohen_kappa_score, hamming_loss, zero_one_loss,
                             mean_absolute_error, mean_squared_error, r2_score,
                             balanced_accuracy_score)
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.svm import SVC, OneClassSVM
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression, SGDClassifier, Perceptron, ElasticNet
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier, BernoulliRBM
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.mixture import GaussianMixture
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_iterative_imputer  # noqa: F401 -- activates IterativeImputer
from sklearn.impute import IterativeImputer
from sklearn.semi_supervised import SelfTrainingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
# Keras imports kept from the original; no deep-learning model is wired up below
from keras.models import Sequential
from keras.layers import Dense, Conv2D, MaxPooling2D, Flatten, LSTM, GRU, SimpleRNN
from keras.utils import to_categorical

warnings.filterwarnings("ignore")

# Dataset parameters -- change the filename to point at your copy of the data
input_file = 'CIC_IOT_2023_combined.csv'
k_fold = 5             # number of stratified folds
dataset_percent = 100  # percentage of the dataset to use (stratified subsample)
# Results CSV file and its columns
output_file = input_file.replace('.csv', '_results.csv')
csv_columns = ['Algorithm', 'Fold', 'Train Time (s)', 'Test Time (s)', 'Accuracy',
               'Precision', 'Recall', 'F1 Score', 'Fbeta Score',
               'Matthews Correlation Coefficient', 'Jaccard Score', 'Cohen Kappa Score',
               'Hamming Loss', 'Zero One Loss', 'Mean Absolute Error',
               'Mean Squared Error', 'Root Mean Squared Error',
               'Balanced Accuracy', 'R2 Score']
# Compute the full set of classification metrics for one fold
def compute_metrics(y_true, y_pred):
    metrics = {}
    metrics['Accuracy'] = accuracy_score(y_true, y_pred)
    metrics['Precision'] = precision_score(y_true, y_pred, average='weighted', zero_division=1)
    metrics['Recall'] = recall_score(y_true, y_pred, average='weighted', zero_division=1)
    metrics['F1 Score'] = f1_score(y_true, y_pred, average='weighted', zero_division=1)
    metrics['Fbeta Score'] = fbeta_score(y_true, y_pred, beta=1.0, average='weighted', zero_division=1)
    metrics['Matthews Correlation Coefficient'] = matthews_corrcoef(y_true, y_pred)
    metrics['Jaccard Score'] = jaccard_score(y_true, y_pred, average='weighted', zero_division=1)
    metrics['Cohen Kappa Score'] = cohen_kappa_score(y_true, y_pred)
    metrics['Hamming Loss'] = hamming_loss(y_true, y_pred)
    metrics['Zero One Loss'] = zero_one_loss(y_true, y_pred)
    metrics['Mean Absolute Error'] = mean_absolute_error(y_true, y_pred)
    metrics['Mean Squared Error'] = mean_squared_error(y_true, y_pred)
    metrics['Root Mean Squared Error'] = np.sqrt(metrics['Mean Squared Error'])
    metrics['Balanced Accuracy'] = balanced_accuracy_score(y_true, y_pred)
    metrics['R2 Score'] = r2_score(y_true, y_pred)
    return metrics
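
# Minimal sanity-check sketch for compute_metrics (toy labels, not taken from
# the dataset): 3 of 4 predictions match, so Accuracy should come out as 0.75.
_toy = compute_metrics(np.array([0, 1, 1, 0]), np.array([0, 1, 0, 0]))
assert abs(_toy['Accuracy'] - 0.75) < 1e-9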
# Train one model, time it, score it, and append one row to the results CSV
def run_algorithm(algo_name, model, X_train, y_train, X_test, y_test, fold):
    try:
        start_train = time.time()
        model.fit(X_train, y_train)
        train_time = time.time() - start_train

        start_test = time.time()
        y_pred = model.predict(X_test)
        test_time = time.time() - start_test

        if algo_name == 'ElasticNet':
            # ElasticNet is a regressor, so only the regression metrics apply;
            # the remaining columns fall back to -1 via metrics.get() below
            metrics = {}
            metrics['Mean Absolute Error'] = mean_absolute_error(y_test, y_pred)
            metrics['Mean Squared Error'] = mean_squared_error(y_test, y_pred)
            metrics['Root Mean Squared Error'] = np.sqrt(metrics['Mean Squared Error'])
            metrics['R2 Score'] = r2_score(y_test, y_pred)
        else:
            # Full classification metrics
            metrics = compute_metrics(y_test, y_pred)

        metrics.update({'Train Time (s)': train_time, 'Test Time (s)': test_time})

        # Log results to CSV; -1 marks metrics that do not apply
        with open(output_file, 'a', newline='') as f:
            writer = csv.writer(f)
            writer.writerow([algo_name, fold] + [metrics.get(m, -1) for m in csv_columns[2:]])
        print(f"{algo_name} | Fold: {fold} | Train Time: {train_time:.2f}s | Test Time: {test_time:.2f}s")
    except Exception:
        # Log the failure as a row of -1s so the results file stays aligned
        with open(output_file, 'a', newline='') as f:
            writer = csv.writer(f)
            writer.writerow([algo_name, fold] + [-1 for _ in csv_columns[2:]])
        print(f"Error in {algo_name}: {traceback.format_exc()}")
# Load dataset: all columns except the last are features, the last is the label
df = pd.read_csv(input_file)
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Encode categorical features (a fresh fit per column)
label_encoder = LabelEncoder()
for column in X.columns:
    if X[column].dtype == 'object' or X[column].dtype.name == 'category':
        X[column] = label_encoder.fit_transform(X[column])
y = LabelEncoder().fit_transform(y)
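
# Diagnostic sketch (added): class balance after label encoding, worth a look
# before stratified splitting on an imbalanced intrusion-detection dataset.
_labels, _counts = np.unique(y, return_counts=True)
print('Class distribution:', dict(zip(_labels.tolist(), _counts.tolist())))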
# Iterative (model-based) imputation for missing values
imputer = IterativeImputer()
X = imputer.fit_transform(X)

# Min-Max scaling to [0, 1]
scaler = MinMaxScaler()
X = scaler.fit_transform(X)
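
# Cheap post-scaling check (added): every feature should now lie in [0, 1].
print(f'Feature matrix: {X.shape}, min={X.min():.3f}, max={X.max():.3f}')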
# Take a stratified subset of the dataset if dataset_percent is less than 100
if dataset_percent < 100:
    X, _, y, _ = train_test_split(X, y, train_size=dataset_percent / 100,
                                  stratify=y, random_state=42)

# Write the CSV header once, only if the results file does not exist yet
if not os.path.exists(output_file):
    with open(output_file, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(csv_columns)
# Stratified k-fold cross-validation
kf = StratifiedKFold(n_splits=k_fold, shuffle=True, random_state=42)

# Algorithms to benchmark. Note that K-Means and Isolation Forest are
# unsupervised: their predict() outputs (cluster ids / {-1, 1} anomaly flags)
# are scored against the class labels, so treat those rows as rough indicators.
algorithms = {
    'Naive Bayes': GaussianNB(),
    'LDA': LinearDiscriminantAnalysis(),
    'QDA': QuadraticDiscriminantAnalysis(),
    'Decision Tree': DecisionTreeClassifier(),
    'SGD Classifier': SGDClassifier(),
    'KNN': KNeighborsClassifier(),
    'ElasticNet': ElasticNet(),  # regressor; scored with regression metrics only
    'Perceptron': Perceptron(),
    'Logistic Regression': LogisticRegression(),
    'Bagging': BaggingClassifier(),
    'K-Means': KMeans(n_clusters=3),
    'Nearest Centroid Classifier': NearestCentroid(),
    'XGBoost': XGBClassifier(),
    'AdaBoost': AdaBoostClassifier(),
    # n_components below is an assumed value (the original was garbled mid-line);
    # tune it for your data
    'RBM + Logistic Regression': Pipeline(steps=[
        ('rbm', BernoulliRBM(n_components=100, learning_rate=0.06, n_iter=10, random_state=42)),
        ('logistic', LogisticRegression())]),
    'Voting Classifier': VotingClassifier(estimators=[
        ('lr', LogisticRegression()),
        ('rf', RandomForestClassifier()),
        ('gnb', GaussianNB())], voting='hard'),
    'Random Forest': RandomForestClassifier(n_estimators=10),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=10),
    'Stacking Classifier': StackingClassifier(estimators=[
        ('log_reg', LogisticRegression()),
        ('knn', KNeighborsClassifier(n_neighbors=3))],
        final_estimator=LogisticRegression(), n_jobs=-1),
    'LightGBM': LGBMClassifier(),
    'CatBoost': CatBoostClassifier(verbose=0),  # verbose=0 silences per-iteration logging
    'Self-Training': SelfTrainingClassifier(LogisticRegression()),
    'Isolation Forest': IsolationForest(),
    'Extra Trees Classifier': ExtraTreesClassifier(n_estimators=100),
    'HistGradientBoostingClassifier': HistGradientBoostingClassifier(max_iter=100, validation_fraction=None),
}
# Run every algorithm on every fold
for fold, (train_idx, test_idx) in enumerate(kf.split(X, y), 1):
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]
    for algo_name, model in algorithms.items():
        run_algorithm(algo_name, model, X_train, y_train, X_test, y_test, fold)

print("All algorithms have been executed. Results are saved in", output_file)

# Sort the results by F1 Score (descending) and overwrite the results file
df = pd.read_csv(output_file)
df_sorted = df.sort_values(by='F1 Score', ascending=False)
df_sorted.to_csv(output_file, index=False)

# Print the first few rows to verify
print(df_sorted.head())
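
# Optional summary sketch (added): mean F1 per algorithm across folds, using
# the column names written above. Failed runs were logged as -1, which would
# drag the averages down, so they are filtered out first.
_ok = df_sorted[df_sorted['F1 Score'] >= 0]
print(_ok.groupby('Algorithm')['F1 Score'].mean().sort_values(ascending=False).head(10))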