Security_datasets_Code_2 - NO FS NO Smote
mayankjoin3 | Nov 9th, 2024

input_file = 'CIC_IOT_2023_combined.csv'  # Input dataset

import os
import csv
import time
import traceback
import warnings
from collections import defaultdict

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, fbeta_score, matthews_corrcoef
from sklearn.metrics import jaccard_score, cohen_kappa_score, hamming_loss, zero_one_loss, mean_absolute_error
from sklearn.metrics import mean_squared_error, r2_score, balanced_accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier, StackingClassifier, IsolationForest
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.svm import SVC, OneClassSVM
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression, SGDClassifier, Perceptron, ElasticNet
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier, BernoulliRBM
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.mixture import GaussianMixture
from sklearn.pipeline import Pipeline
from sklearn.semi_supervised import SelfTrainingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from keras.models import Sequential
from keras.layers import Dense, Conv2D, MaxPooling2D, Flatten, LSTM, GRU, SimpleRNN
from keras.utils import to_categorical

# Custom imports
# from pgmpy.models import BayesianNetwork  # For Bayesian Networks
# import geneticalgorithm as ga  # For Genetic Algorithm-based Classifier (hypothetical)
# Add fuzzy logic and CRF imports if you have specific packages (e.g., `python-crfsuite` for CRF)

warnings.filterwarnings("ignore")  # Suppress warnings for cleaner output

# Initialize dataset parameters
k_fold = 5  # Number of cross-validation folds; change as needed
dataset_percent = 100  # Percentage of the dataset to use; change as needed

# Initialize results CSV files and columns
output_file = input_file.replace('.csv', '_results.csv')
class_metrics_file = input_file.replace('.csv', '_class_results.csv')
csv_columns = ['Algorithm', 'Fold', 'Train Time (s)', 'Test Time (s)', 'Accuracy', 'Precision', 'Recall', 'F1 Score',
               'Fbeta Score', 'Matthews Correlation Coefficient', 'Jaccard Score', 'Cohen Kappa Score',
               'Hamming Loss', 'Zero One Loss', 'Mean Absolute Error', 'Mean Squared Error', 'Root Mean Squared Error',
               'Balanced Accuracy', 'R2 Score']

# Per-class metrics CSV columns
class_metrics_columns = ['Algorithm', 'Fold', 'Class', 'Accuracy', 'Precision', 'Recall', 'F1 Score',
               'Fbeta Score', 'Matthews Correlation Coefficient', 'Jaccard Score', 'Cohen Kappa Score',
               'Hamming Loss', 'Zero One Loss', 'Mean Absolute Error', 'Mean Squared Error', 'Root Mean Squared Error',
               'Balanced Accuracy', 'R2 Score']

def compute_classwise_metrics(y_true, y_pred):
    """Compute one-vs-rest metrics for each class.

    Relies on the global `class_names` produced by the LabelEncoder further below.
    """
    class_metrics = defaultdict(dict)
    classes = np.unique(y_true)

    for class_index in classes:
        true_class_name = class_names[class_index]
        # Binarize: 1 for the current class, 0 for everything else
        y_true_class = (y_true == class_index).astype(int)
        y_pred_class = (y_pred == class_index).astype(int)

        # Metrics for this class, rounded to 3 decimal places
        class_metrics[true_class_name]['Accuracy'] = round(accuracy_score(y_true_class, y_pred_class), 3)
        class_metrics[true_class_name]['Precision'] = round(precision_score(y_true, y_pred, labels=[class_index], average='weighted', zero_division=1), 3)
        class_metrics[true_class_name]['Recall'] = round(recall_score(y_true, y_pred, labels=[class_index], average='weighted', zero_division=1), 3)
        class_metrics[true_class_name]['F1 Score'] = round(f1_score(y_true, y_pred, labels=[class_index], average='weighted', zero_division=1), 3)
        class_metrics[true_class_name]['Fbeta Score'] = round(fbeta_score(y_true, y_pred, labels=[class_index], beta=1.0, average='weighted', zero_division=1), 3)  # beta=1.0 makes this identical to F1
        class_metrics[true_class_name]['Matthews Correlation Coefficient'] = round(matthews_corrcoef(y_true_class, y_pred_class), 3)
        class_metrics[true_class_name]['Jaccard Score'] = round(jaccard_score(y_true, y_pred, labels=[class_index], average='weighted', zero_division=1), 3)
        class_metrics[true_class_name]['Cohen Kappa Score'] = round(cohen_kappa_score(y_true_class, y_pred_class), 3)
        class_metrics[true_class_name]['Hamming Loss'] = round(hamming_loss(y_true_class, y_pred_class), 3)
        class_metrics[true_class_name]['Zero One Loss'] = round(zero_one_loss(y_true_class, y_pred_class), 3)
        class_metrics[true_class_name]['Mean Absolute Error'] = round(mean_absolute_error(y_true_class, y_pred_class), 3)
        class_metrics[true_class_name]['Mean Squared Error'] = round(mean_squared_error(y_true_class, y_pred_class), 3)
        class_metrics[true_class_name]['Root Mean Squared Error'] = round(np.sqrt(class_metrics[true_class_name]['Mean Squared Error']), 3)
        class_metrics[true_class_name]['Balanced Accuracy'] = round(balanced_accuracy_score(y_true_class, y_pred_class), 3)
        class_metrics[true_class_name]['R2 Score'] = round(r2_score(y_true_class, y_pred_class), 3)

    return class_metrics

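# Illustrative (commented) note on the return shape: compute_classwise_metrics
# yields a dict keyed by the original class name, e.g. once the pipeline below runs,
# {'Benign': {'Accuracy': 0.99, ...}, 'DDoS': {...}, ...} (values here are made up).
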
# Overall (multiclass) metric calculation
def compute_metrics(y_true, y_pred):
    metrics = {}
    metrics['Accuracy'] = accuracy_score(y_true, y_pred)
    metrics['Precision'] = precision_score(y_true, y_pred, average='weighted', zero_division=1)
    metrics['Recall'] = recall_score(y_true, y_pred, average='weighted', zero_division=1)
    metrics['F1 Score'] = f1_score(y_true, y_pred, average='weighted', zero_division=1)
    metrics['Fbeta Score'] = fbeta_score(y_true, y_pred, beta=1.0, average='weighted', zero_division=1)
    metrics['Matthews Correlation Coefficient'] = matthews_corrcoef(y_true, y_pred)
    metrics['Jaccard Score'] = jaccard_score(y_true, y_pred, average='weighted', zero_division=1)
    metrics['Cohen Kappa Score'] = cohen_kappa_score(y_true, y_pred)
    metrics['Hamming Loss'] = hamming_loss(y_true, y_pred)
    metrics['Zero One Loss'] = zero_one_loss(y_true, y_pred)
    metrics['Mean Absolute Error'] = mean_absolute_error(y_true, y_pred)
    metrics['Mean Squared Error'] = mean_squared_error(y_true, y_pred)
    metrics['Root Mean Squared Error'] = np.sqrt(metrics['Mean Squared Error'])
    metrics['Balanced Accuracy'] = balanced_accuracy_score(y_true, y_pred)
    metrics['R2 Score'] = r2_score(y_true, y_pred)

    return metrics
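
# Illustrative (commented) usage of compute_metrics on toy labels -- not part of
# the pipeline, just a sketch of the returned dictionary:
# y_true_demo = np.array([0, 1, 2, 2, 1])
# y_pred_demo = np.array([0, 2, 2, 2, 1])
# m = compute_metrics(y_true_demo, y_pred_demo)
# print(m['Accuracy'])   # 0.8 (4 of 5 correct)
# print(m['F1 Score'])   # weighted-average F1 across the three classes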

# Function to handle each algorithm execution
def run_algorithm(algo_name, model, X_train, y_train, X_test, y_test, fold):
    """Run a single algorithm and log its results."""
    try:
        start_train = time.time()
        model.fit(X_train, y_train)
        train_time = time.time() - start_train

        start_test = time.time()
        y_pred = model.predict(X_test)
        test_time = time.time() - start_test

        # Compute metrics
        if algo_name == 'ElasticNet':  # ElasticNet is a regressor, so only regression metrics apply
            metrics = {}
            metrics['Mean Absolute Error'] = mean_absolute_error(y_test, y_pred)
            metrics['Mean Squared Error'] = mean_squared_error(y_test, y_pred)
            metrics['Root Mean Squared Error'] = np.sqrt(metrics['Mean Squared Error'])
            metrics['R2 Score'] = r2_score(y_test, y_pred)
            class_metrics = {}  # Continuous predictions have no classes to score
        else:
            # Compute classification metrics
            metrics = compute_metrics(y_test, y_pred)
            # Compute class-wise metrics
            class_metrics = compute_classwise_metrics(y_test, y_pred)

        metrics.update({'Train Time (s)': train_time, 'Test Time (s)': test_time})

        # Log overall results to CSV (metrics that were not computed are written as -1)
        with open(output_file, 'a', newline='') as f:
            writer = csv.writer(f)
            writer.writerow([algo_name, fold] + [metrics.get(m, -1) for m in csv_columns[2:]])

        # Write per-class metrics to the per-class results CSV
        with open(class_metrics_file, 'a', newline='') as f:
            writer = csv.writer(f)
            for class_label, cm in class_metrics.items():
                writer.writerow([algo_name, fold, class_label] + [cm.get(m, -1) for m in class_metrics_columns[3:]])

        print(f"{algo_name} | Fold: {fold} | Train Time: {train_time:.2f}s | Test Time: {test_time:.2f}s")

    except Exception:
        print(f"Error in {algo_name}: {traceback.format_exc()}")
        # Log the failure as a row of -1s so the fold is still accounted for
        with open(output_file, 'a', newline='') as f:
            writer = csv.writer(f)
            writer.writerow([algo_name, fold] + [-1 for _ in csv_columns[2:]])

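# A minimal (commented) sketch of calling run_algorithm outside the k-fold loop,
# assuming X and y have already been encoded and scaled as below:
# X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
# run_algorithm('Decision Tree', DecisionTreeClassifier(), X_tr, y_tr, X_te, y_te, fold=0)
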
# Load dataset
df = pd.read_csv(input_file)
X = df.iloc[:, :-1]  # All columns except the last are features
y = df.iloc[:, -1]   # Last column is the label

# Encode categorical features and labels
X = pd.get_dummies(X)
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)
class_names = label_encoder.classes_

# Min-Max scaling (note: fitting on the full dataset leaks test-fold statistics;
# for stricter evaluation, fit the scaler inside each fold on the training split only)
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

# Take a stratified subset of the dataset if dataset_percent is less than 100
if dataset_percent < 100:
    _, X, _, y = train_test_split(X, y, test_size=dataset_percent/100, stratify=y)

# Write CSV headers if the files do not exist yet
if not os.path.exists(output_file):
    with open(output_file, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(csv_columns)

if not os.path.exists(class_metrics_file):
    with open(class_metrics_file, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(class_metrics_columns)

# K-Fold Cross Validation
kf = KFold(n_splits=k_fold, shuffle=True, random_state=42)

# Deep learning model builders (CNN, RNN, LSTM, GRU, autoencoder).
# Note: the shapes below assume 28x28 image-style input and 10 classes (MNIST-like);
# they are not wired into the tabular pipeline by default (see the commented
# entries in the algorithms dict and the adaptation sketch below).
def create_cnn(input_shape):
    model = Sequential()
    model.add(Conv2D(32, (3, 3), activation='relu', input_shape=input_shape))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Flatten())
    model.add(Dense(128, activation='relu'))
    model.add(Dense(10, activation='softmax'))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

def create_rnn(input_shape):
    model = Sequential()
    model.add(SimpleRNN(128, input_shape=input_shape, return_sequences=True))
    model.add(Flatten())
    model.add(Dense(10, activation='softmax'))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

def create_lstm(input_shape):
    model = Sequential()
    model.add(LSTM(128, input_shape=input_shape, return_sequences=True))
    model.add(Flatten())
    model.add(Dense(10, activation='softmax'))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

def create_gru(input_shape):
    model = Sequential()
    model.add(GRU(128, input_shape=input_shape, return_sequences=True))
    model.add(Flatten())
    model.add(Dense(10, activation='softmax'))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

def create_autoencoder(input_shape):
    model = Sequential()
    model.add(Dense(128, input_shape=input_shape, activation='relu'))
    model.add(Dense(input_shape[0], activation='sigmoid'))
    model.compile(optimizer='adam', loss='mse')
    return model
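
# A commented sketch of adapting the sequence models to this tabular data, assuming
# n_features columns: treat each feature as one timestep of a length-n_features
# sequence and one-hot encode the labels. Keras models also need their own fit
# arguments (epochs, batch_size), so run_algorithm would need a small extension.
# n_features = X.shape[1]
# n_classes = len(class_names)
# X_seq = X.reshape(-1, n_features, 1)              # (samples, timesteps, channels)
# y_cat = to_categorical(y, num_classes=n_classes)  # one-hot labels
# lstm = create_lstm((n_features, 1))               # adjust Dense(10) to Dense(n_classes) first
# lstm.fit(X_seq, y_cat, epochs=5, batch_size=256, verbose=0)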

# List of algorithms
algorithms = {
    'Naive Bayes': GaussianNB(),
    'LDA': LinearDiscriminantAnalysis(),
    'QDA': QuadraticDiscriminantAnalysis(),
    'SVM': SVC(kernel='linear', max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(),
    'SGD Classifier': SGDClassifier(),
    'KNN': KNeighborsClassifier(),
    'ElasticNet': ElasticNet(),  # Regression model; handled separately in run_algorithm
    'Perceptron': Perceptron(),
    'Logistic Regression': LogisticRegression(),
    'Bagging': BaggingClassifier(),
    'K-Means': KMeans(n_clusters=3),  # Unsupervised; cluster ids will not match the label encoding
    'Nearest Centroid Classifier': NearestCentroid(),
    'XGBoost': XGBClassifier(),
    'AdaBoost': AdaBoostClassifier(),
    'RBM + Logistic Regression': Pipeline(steps=[('rbm', BernoulliRBM(n_components=100, learning_rate=0.06, n_iter=10, random_state=42)), ('logistic', LogisticRegression())]),
    'Voting Classifier': VotingClassifier(estimators=[('lr', LogisticRegression()), ('rf', RandomForestClassifier()), ('gnb', GaussianNB())], voting='hard'),
    'Random Forest': RandomForestClassifier(n_estimators=10),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=10),
    'Stacking Classifier': StackingClassifier(estimators=[('log_reg', LogisticRegression()), ('knn', KNeighborsClassifier(n_neighbors=3))], final_estimator=LogisticRegression(), n_jobs=-1),
    'LightGBM': LGBMClassifier(),
    'CatBoost': CatBoostClassifier(),
    'Self-Training': SelfTrainingClassifier(LogisticRegression()),
    'Isolation Forest': IsolationForest(),  # Anomaly detector; predicts -1/1 (see sketch below)
    # 'MLP Classifier': MLPClassifier(),
    # 'RNN': create_rnn((28, 28)),
    # 'GRU': create_gru((28, 28)),
    # 'LSTM': create_lstm((28, 28)),
    # 'CNN': create_cnn((28, 28, 1)),
    # 'Autoencoder': create_autoencoder((28,)),
    # 'One-Class SVM': OneClassSVM(kernel='linear', max_iter=1000),
    # 'Deep Belief Network': "Implement DBN",  # Placeholder for DBN
    # 'Restricted Boltzmann Machine': "Implement RBM",  # Placeholder for RBM
    # 'Genetic Algorithm': ga.GeneticAlgorithm(),  # Placeholder for a GA-based classifier
    # 'Bayesian Network': BayesianNetwork([('A', 'B'), ('B', 'C')]),  # Example Bayesian Network
    # 'Fuzzy Logic': "Implement Fuzzy Logic",  # Placeholder for fuzzy logic systems
    # 'Conditional Random Field (CRF)': "Implement CRF",  # Placeholder for CRF
}
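
# IsolationForest.predict returns -1 (anomaly) / 1 (normal), which will not match the
# label-encoded classes above, so its logged metrics are not directly meaningful.
# A commented sketch of mapping its output onto a binary ground truth, assuming a
# hypothetical `benign_class_index` for the dataset's normal-traffic class:
# iso = IsolationForest().fit(X_train)
# y_pred_binary = (iso.predict(X_test) == -1).astype(int)    # 1 = flagged as anomaly
# y_true_binary = (y_test != benign_class_index).astype(int)  # 1 = actual attack
# print(f1_score(y_true_binary, y_pred_binary))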

# Running algorithms in k-fold cross validation
for fold, (train_idx, test_idx) in enumerate(kf.split(X), 1):
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    for algo_name, model in algorithms.items():
        # if algo_name in ('CNN', 'RNN', 'LSTM', 'GRU', 'Autoencoder'):
        #     # Special handling for deep learning models (assumes 28x28 image input and 10 classes)
        #     X_train_dl = X_train.reshape(-1, 28, 28, 1)
        #     X_test_dl = X_test.reshape(-1, 28, 28, 1)
        #     y_train_dl = to_categorical(y_train, num_classes=10)
        #     y_test_dl = to_categorical(y_test, num_classes=10)
        #     run_algorithm(algo_name, model, X_train_dl, y_train_dl, X_test_dl, y_test_dl, fold)
        # else:
        run_algorithm(algo_name, model, X_train, y_train, X_test, y_test, fold)
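
# Optional (commented) post-processing sketch: average each algorithm's metrics
# across folds from the results CSV, assuming the run above completed:
# results = pd.read_csv(output_file)
# summary = results.groupby('Algorithm')[csv_columns[2:]].mean().round(3)
# summary.to_csv(input_file.replace('.csv', '_summary.csv'))
# print(summary[['Accuracy', 'F1 Score', 'Train Time (s)']])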