mayankjoin3

Security_datasets_Code_3.py

Nov 15th, 2024 (edited)
import logging

# Configure logging
logging.basicConfig(
    filename='algorithm_execution.log',  # Log file name
    filemode='a',  # Append mode
    format='%(asctime)s - %(levelname)s - %(message)s',  # Log format
    level=logging.DEBUG  # Set log level to DEBUG to capture all levels of logs
)

logging.info("Logging setup complete. Execution started.")


import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"  # Disable GPU

import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, fbeta_score, matthews_corrcoef
from sklearn.metrics import jaccard_score, cohen_kappa_score, hamming_loss, zero_one_loss, mean_absolute_error
from sklearn.metrics import mean_squared_error, r2_score, balanced_accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier, StackingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, OneClassSVM
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression, SGDClassifier, Perceptron, ElasticNet
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.mixture import GaussianMixture
from sklearn.ensemble import IsolationForest
from sklearn.pipeline import Pipeline
from sklearn.neural_network import BernoulliRBM
from keras.models import Sequential
from keras.layers import Dense, Conv2D, MaxPooling2D, Flatten, LSTM, GRU, SimpleRNN
from keras.utils import to_categorical
import traceback
import csv
import warnings
from collections import defaultdict
from sklearn.semi_supervised import SelfTrainingClassifier

input_file = 'FlowerPollination_StandardGAN.csv'  # Input dataset (read the CSV files from the input_gan folder one at a time)

# Initialize dataset parameters
k_fold = 5  # Change as needed
dataset_percent = 20  # Change as needed


# Initialize CSV file and columns
# output_file = 'results.csv'
output_file = input_file.replace('.csv', '_results.csv')
class_metrics_file = input_file.replace('.csv', '_class_results.csv')



# Custom imports
# from pgmpy.models import BayesianNetwork  # For Bayesian Networks
# import geneticalgorithm as ga  # For Genetic Algorithm-based Classifier (hypothetical)
# Add fuzzy logic and CRF imports if you have specific packages (e.g., `python-crfsuite` for CRF)

warnings.filterwarnings("ignore")  # Suppress warnings for cleaner output

csv_columns = ['Algorithm', 'Fold', 'Train Time (s)', 'Test Time (s)', 'Accuracy', 'Precision', 'Recall', 'F1 Score',
               'Fbeta Score', 'Matthews Correlation Coefficient', 'Jaccard Score', 'Cohen Kappa Score',
               'Hamming Loss', 'Zero One Loss', 'Mean Absolute Error', 'Mean Squared Error', 'Root Mean Squared Error',
               'Balanced Accuracy', 'R2 Score']

# Initialize per-class metrics CSV
class_metrics_columns = ['Algorithm', 'Fold', 'Class', 'Accuracy', 'Precision', 'Recall', 'F1 Score',
               'Fbeta Score', 'Matthews Correlation Coefficient', 'Jaccard Score', 'Cohen Kappa Score',
               'Hamming Loss', 'Zero One Loss', 'Mean Absolute Error', 'Mean Squared Error', 'Root Mean Squared Error',
               'Balanced Accuracy', 'R2 Score']

def compute_classwise_metrics(y_true, y_pred):
    """Compute per-class metrics by binarizing each class one-vs-rest.

    Relies on the module-level `class_names` (set after label encoding
    below) to map encoded class indices back to their original names.
    """
    class_metrics = defaultdict(dict)
    classes = np.unique(y_true)

    for class_index in classes:
        true_class_name = class_names[class_index]
        # One-vs-rest binarization: 1 for the current class, 0 for the rest
        y_true_class = (y_true == class_index).astype(int)
        y_pred_class = (y_pred == class_index).astype(int)

        # Calculate metrics for each true class name, rounded to 3 decimal places
        class_metrics[true_class_name]['Accuracy'] = round(accuracy_score(y_true_class, y_pred_class), 3)
        class_metrics[true_class_name]['Precision'] = round(precision_score(y_true, y_pred, labels=[class_index], average='weighted', zero_division=1), 3)
        class_metrics[true_class_name]['Recall'] = round(recall_score(y_true, y_pred, labels=[class_index], average='weighted', zero_division=1), 3)
        class_metrics[true_class_name]['F1 Score'] = round(f1_score(y_true, y_pred, labels=[class_index], average='weighted', zero_division=1), 3)
        class_metrics[true_class_name]['Fbeta Score'] = round(fbeta_score(y_true, y_pred, labels=[class_index], beta=1.0, average='weighted', zero_division=1), 3)
        class_metrics[true_class_name]['Matthews Correlation Coefficient'] = round(matthews_corrcoef(y_true_class, y_pred_class), 3)
        class_metrics[true_class_name]['Jaccard Score'] = round(jaccard_score(y_true, y_pred, labels=[class_index], average='weighted', zero_division=1), 3)
        class_metrics[true_class_name]['Cohen Kappa Score'] = round(cohen_kappa_score(y_true_class, y_pred_class), 3)
        class_metrics[true_class_name]['Hamming Loss'] = round(hamming_loss(y_true_class, y_pred_class), 3)
        class_metrics[true_class_name]['Zero One Loss'] = round(zero_one_loss(y_true_class, y_pred_class), 3)
        class_metrics[true_class_name]['Mean Absolute Error'] = round(mean_absolute_error(y_true_class, y_pred_class), 3)
        class_metrics[true_class_name]['Mean Squared Error'] = round(mean_squared_error(y_true_class, y_pred_class), 3)
        class_metrics[true_class_name]['Root Mean Squared Error'] = round(np.sqrt(class_metrics[true_class_name]['Mean Squared Error']), 3)
        class_metrics[true_class_name]['Balanced Accuracy'] = round(balanced_accuracy_score(y_true_class, y_pred_class), 3)
        class_metrics[true_class_name]['R2 Score'] = round(r2_score(y_true_class, y_pred_class), 3)

    return class_metrics

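# Illustration of the binarization above (made-up values, not dataset output):
# suppose class_names[0] == 'benign' and the encoded labels are
# y_true = np.array([0, 0, 1]). For class_index 0 the binarized vector is
# y_true_class = [1, 1, 0], and all metrics computed from it are stored
# under class_metrics['benign'].
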
# Function to handle metric calculation
def compute_metrics(y_true, y_pred):
    metrics = {}
    metrics['Accuracy'] = accuracy_score(y_true, y_pred)
    metrics['Precision'] = precision_score(y_true, y_pred, average='weighted', zero_division=1)
    metrics['Recall'] = recall_score(y_true, y_pred, average='weighted', zero_division=1)
    metrics['F1 Score'] = f1_score(y_true, y_pred, average='weighted', zero_division=1)
    metrics['Fbeta Score'] = fbeta_score(y_true, y_pred, beta=1.0, average='weighted', zero_division=1)
    metrics['Matthews Correlation Coefficient'] = matthews_corrcoef(y_true, y_pred)
    metrics['Jaccard Score'] = jaccard_score(y_true, y_pred, average='weighted', zero_division=1)
    metrics['Cohen Kappa Score'] = cohen_kappa_score(y_true, y_pred)
    metrics['Hamming Loss'] = hamming_loss(y_true, y_pred)
    metrics['Zero One Loss'] = zero_one_loss(y_true, y_pred)
    metrics['Mean Absolute Error'] = mean_absolute_error(y_true, y_pred)
    metrics['Mean Squared Error'] = mean_squared_error(y_true, y_pred)
    metrics['Root Mean Squared Error'] = np.sqrt(metrics['Mean Squared Error'])
    metrics['Balanced Accuracy'] = balanced_accuracy_score(y_true, y_pred)
    metrics['R2 Score'] = r2_score(y_true, y_pred)

    return metrics

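# A minimal sanity check for compute_metrics (illustrative values only, not
# taken from the dataset): four samples with one wrong prediction should
# give an accuracy of 0.75.
#
# >>> m = compute_metrics(np.array([0, 1, 1, 0]), np.array([0, 1, 0, 0]))
# >>> round(m['Accuracy'], 2)
# 0.75
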
# Function to handle each algorithm execution
def run_algorithm(algo_name, model, X_train, y_train, X_test, y_test, fold):
    """Run a single algorithm and log its results."""
    try:
        logging.info(f"Starting algorithm: {algo_name} | Fold: {fold}")

        start_train = time.time()
        model.fit(X_train, y_train)
        train_time = time.time() - start_train
        logging.debug(f"{algo_name} | Fold: {fold} | Training completed in {train_time:.2f} seconds.")

        start_test = time.time()
        y_pred = model.predict(X_test)
        test_time = time.time() - start_test
        logging.debug(f"{algo_name} | Fold: {fold} | Testing completed in {test_time:.2f} seconds.")

        # Compute metrics
        if algo_name == 'ElasticNet':  # Handle ElasticNet as a regression model
            metrics = {}
            metrics['Mean Absolute Error'] = mean_absolute_error(y_test, y_pred)
            metrics['Mean Squared Error'] = mean_squared_error(y_test, y_pred)
            metrics['Root Mean Squared Error'] = np.sqrt(metrics['Mean Squared Error'])
            metrics['R2 Score'] = r2_score(y_test, y_pred)
        else:
            # Compute classification metrics
            metrics = compute_metrics(y_test, y_pred)

        metrics.update({'Train Time (s)': train_time, 'Test Time (s)': test_time})
        logging.info(f"{algo_name} | Fold: {fold} | Metrics: {metrics}")

        # Compute class-wise metrics
        class_metrics = compute_classwise_metrics(y_test, y_pred)

        # Log results to CSV
        with open(output_file, 'a', newline='') as f:
            writer = csv.writer(f)
            writer.writerow([algo_name, fold] + [metrics.get(m, -1) for m in csv_columns[2:]])

        # Write per-class metrics to the class-metrics CSV
        with open(class_metrics_file, 'a', newline='') as f:
            writer = csv.writer(f)
            for class_label, cm in class_metrics.items():
                writer.writerow([algo_name, fold, class_label] + [cm.get(m, -1) for m in class_metrics_columns[3:]])

        print(f"{algo_name} | Fold: {fold} | Train Time: {train_time:.2f}s | Test Time: {test_time:.2f}s")
        logging.info(f"{algo_name} | Fold: {fold} | Results successfully logged.")

    except Exception:
        error_msg = f"Error in {algo_name} | Fold: {fold}: {traceback.format_exc()}"
        print(error_msg)
        logging.error(error_msg)

        # Log error case: fill the row with -1 sentinels
        with open(output_file, 'a', newline='') as f:
            writer = csv.writer(f)
            writer.writerow([algo_name, fold] + [-1 for _ in csv_columns[2:]])

# Load dataset
df = pd.read_csv(input_file)
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Encode categorical features
X = pd.get_dummies(X)
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)
class_names = label_encoder.classes_

# Min-Max scaling
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

# Take a stratified subset of the dataset if dataset_percent is less than 100
if dataset_percent < 100:
    _, X, _, y = train_test_split(X, y, test_size=dataset_percent/100, stratify=y, random_state=42)

# Prepare CSV headers if not present
if not os.path.exists(output_file):
    with open(output_file, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(csv_columns)

if not os.path.exists(class_metrics_file):
    with open(class_metrics_file, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(class_metrics_columns)

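# Note: the MinMaxScaler above is fit on the full dataset before the CV
# split, so each test fold's value range influences the training transform
# (a mild form of data leakage). A stricter variant -- a sketch, not wired
# in here -- fits the scaler on the training fold only:
#
# for fold, (train_idx, test_idx) in enumerate(kf.split(X), 1):
#     fold_scaler = MinMaxScaler().fit(X[train_idx])
#     X_train = fold_scaler.transform(X[train_idx])
#     X_test = fold_scaler.transform(X[test_idx])
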
# K-Fold Cross Validation
kf = KFold(n_splits=k_fold, shuffle=True, random_state=42)

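# KFold does not preserve class proportions within each fold. For the
# imbalanced labels common in security datasets, StratifiedKFold may be
# preferable -- a sketch under that assumption, not wired in:
#
# from sklearn.model_selection import StratifiedKFold
# skf = StratifiedKFold(n_splits=k_fold, shuffle=True, random_state=42)
# for fold, (train_idx, test_idx) in enumerate(skf.split(X, y), 1):
#     ...
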
# Deep Learning (CNN, RNN, LSTM, GRU, Autoencoders)
# Note: these builders hard-code 10 output classes and expect image-shaped
# input; they are only used by the commented-out deep learning branch below.
def create_cnn(input_shape):
    model = Sequential()
    model.add(Conv2D(32, (3, 3), activation='relu', input_shape=input_shape))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Flatten())
    model.add(Dense(128, activation='relu'))
    model.add(Dense(10, activation='softmax'))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

def create_rnn(input_shape):
    model = Sequential()
    model.add(SimpleRNN(128, input_shape=input_shape, return_sequences=True))
    model.add(Flatten())
    model.add(Dense(10, activation='softmax'))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

def create_lstm(input_shape):
    model = Sequential()
    model.add(LSTM(128, input_shape=input_shape, return_sequences=True))
    model.add(Flatten())
    model.add(Dense(10, activation='softmax'))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

def create_gru(input_shape):
    model = Sequential()
    model.add(GRU(128, input_shape=input_shape, return_sequences=True))
    model.add(Flatten())
    model.add(Dense(10, activation='softmax'))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

def create_autoencoder(input_shape):
    model = Sequential()
    model.add(Dense(128, input_shape=input_shape, activation='relu'))
    model.add(Dense(input_shape[0], activation='sigmoid'))
    model.compile(optimizer='adam', loss='mse')
    return model

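# The Keras builders above expose Keras' fit/predict API, whose predict()
# returns per-class probabilities rather than labels, so they cannot be
# dropped into run_algorithm as-is. One bridge -- a sketch assuming the
# optional scikeras package is installed, with shapes mirroring the
# commented-out MNIST-style branch below -- wraps a builder in a
# scikit-learn compatible estimator:
#
# from scikeras.wrappers import KerasClassifier
# algorithms['CNN'] = KerasClassifier(model=lambda: create_cnn((28, 28, 1)),
#                                     epochs=10, verbose=0)
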
# List of algorithms
algorithms = {
    'Naive Bayes': GaussianNB(),
    'LDA': LinearDiscriminantAnalysis(),
    'QDA': QuadraticDiscriminantAnalysis(),
    'SVM': SVC(kernel='linear', max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(),
    'SGD Classifier': SGDClassifier(),
    'KNN': KNeighborsClassifier(),
    # 'ElasticNet': ElasticNet(),  # regression model; handled separately in run_algorithm
    'Perceptron': Perceptron(),
    'Logistic Regression': LogisticRegression(),
    'Bagging': BaggingClassifier(),
    'K-Means': KMeans(n_clusters=3),
    'Nearest Centroid Classifier': NearestCentroid(),
    'XGBoost': XGBClassifier(),
    'AdaBoost': AdaBoostClassifier(),
    # 'RNN': create_rnn((28, 28)),
    'RBM + Logistic Regression': Pipeline(steps=[('rbm', BernoulliRBM(n_components=100, learning_rate=0.06, n_iter=10, random_state=42)), ('logistic', LogisticRegression())]),
    'Voting Classifier': VotingClassifier(estimators=[('lr', LogisticRegression()), ('rf', RandomForestClassifier()), ('gnb', GaussianNB())], voting='hard'),
    'Random Forest': RandomForestClassifier(n_estimators=10),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=10),
    # 'MLP Classifier': MLPClassifier(),
    # 'GRU': create_gru((28, 28)),
    # 'LSTM': create_lstm((28, 28)),
    # 'CNN': create_cnn((28, 28, 1)),
    # 'Autoencoder': create_autoencoder((28,)),
    'LightGBM': LGBMClassifier(),
    'CatBoost': CatBoostClassifier(verbose=0),  # verbose=0 suppresses per-iteration output
    'Self-Training': SelfTrainingClassifier(LogisticRegression()),
    'Isolation Forest': IsolationForest(),
    # 'Stacking Classifier': StackingClassifier(estimators=[('log_reg', LogisticRegression()), ('knn', KNeighborsClassifier(n_neighbors=3))], final_estimator=LogisticRegression(), n_jobs=2),
    # 'One-Class SVM': OneClassSVM(kernel='linear', max_iter=1000),
    # 'Deep Belief Network': "Implement DBN",  # Placeholder for DBN
    # 'Restricted Boltzmann Machine': "Implement RBM",  # Placeholder for RBM
    # 'Genetic Algorithm': ga.GeneticAlgorithm(),  # Placeholder for Genetic Algorithm-based classifier
    # 'Bayesian Network': BayesianNetwork([('A', 'B'), ('B', 'C')]),  # Example Bayesian Network
    # 'Fuzzy Logic': "Implement Fuzzy Logic",  # Placeholder for Fuzzy Logic systems
    # 'Conditional Random Field (CRF)': "Implement CRF",  # Placeholder for CRF
}

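# Caveat: K-Means and Isolation Forest are not supervised classifiers.
# KMeans.predict returns cluster ids that need not align with the encoded
# labels, and IsolationForest.predict returns -1 (outlier) / 1 (inlier),
# so the classification metrics for those two entries are not directly
# comparable to the rest. A minimal remapping sketch for the binary case
# (assumption: encoded label 1 marks the attack/anomalous class):
#
# def remap_isolation_forest(y_pred):
#     # inlier (1) -> 0, outlier (-1) -> 1
#     return np.where(y_pred == -1, 1, 0)
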
# Run all algorithms under k-fold cross validation
logging.info(f"Starting K-Fold Cross Validation with {k_fold} folds.")

for fold, (train_idx, test_idx) in enumerate(kf.split(X), 1):
    logging.info(f"Starting Fold {fold}.")
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    for algo_name, model in algorithms.items():
        # if 'CNN' in algo_name or 'RNN' in algo_name or 'LSTM' in algo_name or 'GRU' in algo_name or 'Autoencoder' in algo_name:
        #     # Special handling for deep learning models
        #     X_train_dl = X_train.reshape(-1, 28, 28, 1)
        #     X_test_dl = X_test.reshape(-1, 28, 28, 1)
        #     y_train_dl = to_categorical(y_train, num_classes=10)
        #     y_test_dl = to_categorical(y_test, num_classes=10)
        #     run_algorithm(algo_name, model, X_train_dl, y_train_dl, X_test_dl, y_test_dl, fold)
        # else:
        run_algorithm(algo_name, model, X_train, y_train, X_test, y_test, fold)

    logging.info(f"Completed Fold {fold}.")

logging.info("K-Fold Cross Validation completed.")
