mayankjoin3

Security_datasets_Code_3.py

Nov 15th, 2024 (edited)
import logging

# Configure logging
logging.basicConfig(
    filename='algorithm_execution.log',  # Log file name
    filemode='a',  # Append mode
    format='%(asctime)s - %(levelname)s - %(message)s',  # Log format
    level=logging.DEBUG  # Set log level to DEBUG to capture all levels of logs
)

logging.info("Logging setup complete. Execution started.")


import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"  # Disable GPU

import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, fbeta_score, matthews_corrcoef
from sklearn.metrics import jaccard_score, cohen_kappa_score, hamming_loss, zero_one_loss, mean_absolute_error
from sklearn.metrics import mean_squared_error, r2_score, balanced_accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier, StackingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, OneClassSVM
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression, SGDClassifier, Perceptron, ElasticNet
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.mixture import GaussianMixture
from sklearn.ensemble import IsolationForest
from sklearn.pipeline import Pipeline
from sklearn.neural_network import BernoulliRBM
from keras.models import Sequential
from keras.layers import Dense, Conv2D, MaxPooling2D, Flatten, LSTM, GRU, SimpleRNN
from keras.utils import to_categorical
import traceback
import csv
import warnings
from collections import defaultdict
from sklearn.semi_supervised import SelfTrainingClassifier

input_file = 'FlowerPollination_StandardGAN.csv'  # Input dataset (read the CSV files from the input_gan folder one at a time)

# Initialize dataset parameters
k_fold = 5  # Change as needed
dataset_percent = 20  # Change as needed


# Initialize CSV file and columns
# output_file = 'results.csv'
output_file = input_file.replace('.csv', '_results.csv')
class_metrics_file = input_file.replace('.csv', '_class_results.csv')



# Custom imports
# from pgmpy.models import BayesianNetwork  # For Bayesian Networks
# import geneticalgorithm as ga  # For Genetic Algorithm-based Classifier (hypothetical)
# Add fuzzy logic and CRF imports if you have specific packages (e.g., `python-crfsuite` for CRF)

warnings.filterwarnings("ignore")  # Suppress warnings for cleaner output

csv_columns = ['Algorithm', 'Fold', 'Train Time (s)', 'Test Time (s)', 'Accuracy', 'Precision', 'Recall', 'F1 Score',
               'Fbeta Score', 'Matthews Correlation Coefficient', 'Jaccard Score', 'Cohen Kappa Score',
               'Hamming Loss', 'Zero One Loss', 'Mean Absolute Error', 'Mean Squared Error', 'Root Mean Squared Error',
               'Balanced Accuracy', 'R2 Score']

# Initialize per-class metrics CSV
class_metrics_columns = ['Algorithm', 'Fold', 'Class', 'Accuracy', 'Precision', 'Recall', 'F1 Score',
               'Fbeta Score', 'Matthews Correlation Coefficient', 'Jaccard Score', 'Cohen Kappa Score',
               'Hamming Loss', 'Zero One Loss', 'Mean Absolute Error', 'Mean Squared Error', 'Root Mean Squared Error',
               'Balanced Accuracy', 'R2 Score']

def compute_classwise_metrics(y_true, y_pred):
    """Compute per-class metrics by binarizing each class one-vs-rest.

    Relies on the module-level `class_names` (set after label encoding
    below) to map encoded class indices back to their original names.
    """
    class_metrics = defaultdict(dict)
    classes = np.unique(y_true)

    for class_index in classes:
        true_class_name = class_names[class_index]
        # One-vs-rest binarization: 1 for the current class, 0 for the rest
        y_true_class = (y_true == class_index).astype(int)
        y_pred_class = (y_pred == class_index).astype(int)

        # Calculate metrics for each true class name, rounded to 3 decimal places
        class_metrics[true_class_name]['Accuracy'] = round(accuracy_score(y_true_class, y_pred_class), 3)
        class_metrics[true_class_name]['Precision'] = round(precision_score(y_true, y_pred, labels=[class_index], average='weighted', zero_division=1), 3)
        class_metrics[true_class_name]['Recall'] = round(recall_score(y_true, y_pred, labels=[class_index], average='weighted', zero_division=1), 3)
        class_metrics[true_class_name]['F1 Score'] = round(f1_score(y_true, y_pred, labels=[class_index], average='weighted', zero_division=1), 3)
        class_metrics[true_class_name]['Fbeta Score'] = round(fbeta_score(y_true, y_pred, labels=[class_index], beta=1.0, average='weighted', zero_division=1), 3)
        class_metrics[true_class_name]['Matthews Correlation Coefficient'] = round(matthews_corrcoef(y_true_class, y_pred_class), 3)
        class_metrics[true_class_name]['Jaccard Score'] = round(jaccard_score(y_true, y_pred, labels=[class_index], average='weighted', zero_division=1), 3)
        class_metrics[true_class_name]['Cohen Kappa Score'] = round(cohen_kappa_score(y_true_class, y_pred_class), 3)
        class_metrics[true_class_name]['Hamming Loss'] = round(hamming_loss(y_true_class, y_pred_class), 3)
        class_metrics[true_class_name]['Zero One Loss'] = round(zero_one_loss(y_true_class, y_pred_class), 3)
        class_metrics[true_class_name]['Mean Absolute Error'] = round(mean_absolute_error(y_true_class, y_pred_class), 3)
        class_metrics[true_class_name]['Mean Squared Error'] = round(mean_squared_error(y_true_class, y_pred_class), 3)
        class_metrics[true_class_name]['Root Mean Squared Error'] = round(np.sqrt(class_metrics[true_class_name]['Mean Squared Error']), 3)
        class_metrics[true_class_name]['Balanced Accuracy'] = round(balanced_accuracy_score(y_true_class, y_pred_class), 3)
        class_metrics[true_class_name]['R2 Score'] = round(r2_score(y_true_class, y_pred_class), 3)

    return class_metrics

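# Illustration of the binarization above (made-up values, not dataset output):
# suppose class_names[0] == 'benign' and the encoded labels are
# y_true = np.array([0, 0, 1]). For class_index 0 the binarized vector is
# y_true_class = [1, 1, 0], and all metrics computed from it are stored
# under class_metrics['benign'].
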
# Function to handle metric calculation
def compute_metrics(y_true, y_pred):
    metrics = {}
    metrics['Accuracy'] = accuracy_score(y_true, y_pred)
    metrics['Precision'] = precision_score(y_true, y_pred, average='weighted', zero_division=1)
    metrics['Recall'] = recall_score(y_true, y_pred, average='weighted', zero_division=1)
    metrics['F1 Score'] = f1_score(y_true, y_pred, average='weighted', zero_division=1)
    metrics['Fbeta Score'] = fbeta_score(y_true, y_pred, beta=1.0, average='weighted', zero_division=1)
    metrics['Matthews Correlation Coefficient'] = matthews_corrcoef(y_true, y_pred)
    metrics['Jaccard Score'] = jaccard_score(y_true, y_pred, average='weighted', zero_division=1)
    metrics['Cohen Kappa Score'] = cohen_kappa_score(y_true, y_pred)
    metrics['Hamming Loss'] = hamming_loss(y_true, y_pred)
    metrics['Zero One Loss'] = zero_one_loss(y_true, y_pred)
    metrics['Mean Absolute Error'] = mean_absolute_error(y_true, y_pred)
    metrics['Mean Squared Error'] = mean_squared_error(y_true, y_pred)
    metrics['Root Mean Squared Error'] = np.sqrt(metrics['Mean Squared Error'])
    metrics['Balanced Accuracy'] = balanced_accuracy_score(y_true, y_pred)
    metrics['R2 Score'] = r2_score(y_true, y_pred)

    return metrics

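# A minimal sanity check for compute_metrics (illustrative values only, not
# taken from the dataset): four samples with one wrong prediction should
# give an accuracy of 0.75.
#
# >>> m = compute_metrics(np.array([0, 1, 1, 0]), np.array([0, 1, 0, 0]))
# >>> round(m['Accuracy'], 2)
# 0.75
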
# Function to handle each algorithm execution
def run_algorithm(algo_name, model, X_train, y_train, X_test, y_test, fold):
    """Run a single algorithm and log its results."""
    try:
        logging.info(f"Starting algorithm: {algo_name} | Fold: {fold}")

        start_train = time.time()
        model.fit(X_train, y_train)
        train_time = time.time() - start_train
        logging.debug(f"{algo_name} | Fold: {fold} | Training completed in {train_time:.2f} seconds.")

        start_test = time.time()
        y_pred = model.predict(X_test)
        test_time = time.time() - start_test
        logging.debug(f"{algo_name} | Fold: {fold} | Testing completed in {test_time:.2f} seconds.")

        # Compute metrics
        if algo_name == 'ElasticNet':  # Handle ElasticNet as a regression model
            metrics = {}
            metrics['Mean Absolute Error'] = mean_absolute_error(y_test, y_pred)
            metrics['Mean Squared Error'] = mean_squared_error(y_test, y_pred)
            metrics['Root Mean Squared Error'] = np.sqrt(metrics['Mean Squared Error'])
            metrics['R2 Score'] = r2_score(y_test, y_pred)
        else:
            # Compute classification metrics
            metrics = compute_metrics(y_test, y_pred)

        metrics.update({'Train Time (s)': train_time, 'Test Time (s)': test_time})
        logging.info(f"{algo_name} | Fold: {fold} | Metrics: {metrics}")

        # Compute class-wise metrics
        class_metrics = compute_classwise_metrics(y_test, y_pred)

        # Log results to CSV
        with open(output_file, 'a', newline='') as f:
            writer = csv.writer(f)
            writer.writerow([algo_name, fold] + [metrics.get(m, -1) for m in csv_columns[2:]])

        # Write per-class metrics to the class-metrics CSV
        with open(class_metrics_file, 'a', newline='') as f:
            writer = csv.writer(f)
            for class_label, cm in class_metrics.items():
                writer.writerow([algo_name, fold, class_label] + [cm.get(m, -1) for m in class_metrics_columns[3:]])

        print(f"{algo_name} | Fold: {fold} | Train Time: {train_time:.2f}s | Test Time: {test_time:.2f}s")
        logging.info(f"{algo_name} | Fold: {fold} | Results successfully logged.")

    except Exception:
        error_msg = f"Error in {algo_name} | Fold: {fold}: {traceback.format_exc()}"
        print(error_msg)
        logging.error(error_msg)

        # Log error case: fill the row with -1 sentinels
        with open(output_file, 'a', newline='') as f:
            writer = csv.writer(f)
            writer.writerow([algo_name, fold] + [-1 for _ in csv_columns[2:]])

# Load dataset
df = pd.read_csv(input_file)
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Encode categorical features
X = pd.get_dummies(X)
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)
class_names = label_encoder.classes_

# Min-Max scaling
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

# Take a stratified subset of the dataset if dataset_percent is less than 100
if dataset_percent < 100:
    _, X, _, y = train_test_split(X, y, test_size=dataset_percent/100, stratify=y, random_state=42)

# Prepare CSV headers if not present
if not os.path.exists(output_file):
    with open(output_file, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(csv_columns)

if not os.path.exists(class_metrics_file):
    with open(class_metrics_file, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(class_metrics_columns)

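# Note: the MinMaxScaler above is fit on the full dataset before the CV
# split, so each test fold's value range influences the training transform
# (a mild form of data leakage). A stricter variant -- a sketch, not wired
# in here -- fits the scaler on the training fold only:
#
# for fold, (train_idx, test_idx) in enumerate(kf.split(X), 1):
#     fold_scaler = MinMaxScaler().fit(X[train_idx])
#     X_train = fold_scaler.transform(X[train_idx])
#     X_test = fold_scaler.transform(X[test_idx])
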
# K-Fold Cross Validation
kf = KFold(n_splits=k_fold, shuffle=True, random_state=42)

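# KFold does not preserve class proportions within each fold. For the
# imbalanced labels common in security datasets, StratifiedKFold may be
# preferable -- a sketch under that assumption, not wired in:
#
# from sklearn.model_selection import StratifiedKFold
# skf = StratifiedKFold(n_splits=k_fold, shuffle=True, random_state=42)
# for fold, (train_idx, test_idx) in enumerate(skf.split(X, y), 1):
#     ...
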
# Deep Learning (CNN, RNN, LSTM, GRU, Autoencoders)
# Note: these builders hard-code 10 output classes and expect image-shaped
# input; they are only used by the commented-out deep learning branch below.
def create_cnn(input_shape):
    model = Sequential()
    model.add(Conv2D(32, (3, 3), activation='relu', input_shape=input_shape))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Flatten())
    model.add(Dense(128, activation='relu'))
    model.add(Dense(10, activation='softmax'))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

def create_rnn(input_shape):
    model = Sequential()
    model.add(SimpleRNN(128, input_shape=input_shape, return_sequences=True))
    model.add(Flatten())
    model.add(Dense(10, activation='softmax'))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

def create_lstm(input_shape):
    model = Sequential()
    model.add(LSTM(128, input_shape=input_shape, return_sequences=True))
    model.add(Flatten())
    model.add(Dense(10, activation='softmax'))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

def create_gru(input_shape):
    model = Sequential()
    model.add(GRU(128, input_shape=input_shape, return_sequences=True))
    model.add(Flatten())
    model.add(Dense(10, activation='softmax'))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

def create_autoencoder(input_shape):
    model = Sequential()
    model.add(Dense(128, input_shape=input_shape, activation='relu'))
    model.add(Dense(input_shape[0], activation='sigmoid'))
    model.compile(optimizer='adam', loss='mse')
    return model

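# The Keras builders above expose Keras' fit/predict API, whose predict()
# returns per-class probabilities rather than labels, so they cannot be
# dropped into run_algorithm as-is. One bridge -- a sketch assuming the
# optional scikeras package is installed, with shapes mirroring the
# commented-out MNIST-style branch below -- wraps a builder in a
# scikit-learn compatible estimator:
#
# from scikeras.wrappers import KerasClassifier
# algorithms['CNN'] = KerasClassifier(model=lambda: create_cnn((28, 28, 1)),
#                                     epochs=10, verbose=0)
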
# List of algorithms
algorithms = {
    'Naive Bayes': GaussianNB(),
    'LDA': LinearDiscriminantAnalysis(),
    'QDA': QuadraticDiscriminantAnalysis(),
    'SVM': SVC(kernel='linear', max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(),
    'SGD Classifier': SGDClassifier(),
    'KNN': KNeighborsClassifier(),
    # 'ElasticNet': ElasticNet(),  # regression model; handled separately in run_algorithm
    'Perceptron': Perceptron(),
    'Logistic Regression': LogisticRegression(),
    'Bagging': BaggingClassifier(),
    'K-Means': KMeans(n_clusters=3),
    'Nearest Centroid Classifier': NearestCentroid(),
    'XGBoost': XGBClassifier(),
    'AdaBoost': AdaBoostClassifier(),
    # 'RNN': create_rnn((28, 28)),
    'RBM + Logistic Regression': Pipeline(steps=[('rbm', BernoulliRBM(n_components=100, learning_rate=0.06, n_iter=10, random_state=42)), ('logistic', LogisticRegression())]),
    'Voting Classifier': VotingClassifier(estimators=[('lr', LogisticRegression()), ('rf', RandomForestClassifier()), ('gnb', GaussianNB())], voting='hard'),
    'Random Forest': RandomForestClassifier(n_estimators=10),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=10),
    # 'MLP Classifier': MLPClassifier(),
    # 'GRU': create_gru((28, 28)),
    # 'LSTM': create_lstm((28, 28)),
    # 'CNN': create_cnn((28, 28, 1)),
    # 'Autoencoder': create_autoencoder((28,)),
    'LightGBM': LGBMClassifier(),
    'CatBoost': CatBoostClassifier(verbose=0),  # verbose=0 suppresses per-iteration output
    'Self-Training': SelfTrainingClassifier(LogisticRegression()),
    'Isolation Forest': IsolationForest(),
    # 'Stacking Classifier': StackingClassifier(estimators=[('log_reg', LogisticRegression()), ('knn', KNeighborsClassifier(n_neighbors=3))], final_estimator=LogisticRegression(), n_jobs=2),
    # 'One-Class SVM': OneClassSVM(kernel='linear', max_iter=1000),
    # 'Deep Belief Network': "Implement DBN",  # Placeholder for DBN
    # 'Restricted Boltzmann Machine': "Implement RBM",  # Placeholder for RBM
    # 'Genetic Algorithm': ga.GeneticAlgorithm(),  # Placeholder for Genetic Algorithm-based classifier
    # 'Bayesian Network': BayesianNetwork([('A', 'B'), ('B', 'C')]),  # Example Bayesian Network
    # 'Fuzzy Logic': "Implement Fuzzy Logic",  # Placeholder for Fuzzy Logic systems
    # 'Conditional Random Field (CRF)': "Implement CRF",  # Placeholder for CRF
}

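# Caveat: K-Means and Isolation Forest are not supervised classifiers.
# KMeans.predict returns cluster ids that need not align with the encoded
# labels, and IsolationForest.predict returns -1 (outlier) / 1 (inlier),
# so the classification metrics for those two entries are not directly
# comparable to the rest. A minimal remapping sketch for the binary case
# (assumption: encoded label 1 marks the attack/anomalous class):
#
# def remap_isolation_forest(y_pred):
#     # inlier (1) -> 0, outlier (-1) -> 1
#     return np.where(y_pred == -1, 1, 0)
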
# Run all algorithms under k-fold cross validation
logging.info(f"Starting K-Fold Cross Validation with {k_fold} folds.")

for fold, (train_idx, test_idx) in enumerate(kf.split(X), 1):
    logging.info(f"Starting Fold {fold}.")
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    for algo_name, model in algorithms.items():
        # if 'CNN' in algo_name or 'RNN' in algo_name or 'LSTM' in algo_name or 'GRU' in algo_name or 'Autoencoder' in algo_name:
        #     # Special handling for deep learning models
        #     X_train_dl = X_train.reshape(-1, 28, 28, 1)
        #     X_test_dl = X_test.reshape(-1, 28, 28, 1)
        #     y_train_dl = to_categorical(y_train, num_classes=10)
        #     y_test_dl = to_categorical(y_test, num_classes=10)
        #     run_algorithm(algo_name, model, X_train_dl, y_train_dl, X_test_dl, y_test_dl, fold)
        # else:
        run_algorithm(algo_name, model, X_train, y_train, X_test, y_test, fold)

    logging.info(f"Completed Fold {fold}.")

logging.info("K-Fold Cross Validation completed.")
