Security_datasets_Code_2 - NO FS NO Smote
mayankjoin3 | Nov 9th, 2024

input_file = 'CIC_IOT_2023_combined.csv'  # Input dataset

import os
import csv
import time
import traceback
import warnings
from collections import defaultdict

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, fbeta_score, matthews_corrcoef
from sklearn.metrics import jaccard_score, cohen_kappa_score, hamming_loss, zero_one_loss, mean_absolute_error
from sklearn.metrics import mean_squared_error, r2_score, balanced_accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier, StackingClassifier, IsolationForest
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.svm import SVC, OneClassSVM
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression, SGDClassifier, Perceptron, ElasticNet
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier, BernoulliRBM
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.mixture import GaussianMixture
from sklearn.pipeline import Pipeline
from sklearn.semi_supervised import SelfTrainingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from keras.models import Sequential
from keras.layers import Dense, Conv2D, MaxPooling2D, Flatten, LSTM, GRU, SimpleRNN
from keras.utils import to_categorical

# Custom imports
# from pgmpy.models import BayesianNetwork  # For Bayesian Networks
# import geneticalgorithm as ga  # For Genetic Algorithm-based Classifier (hypothetical)
# Add fuzzy logic and CRF imports if you have specific packages (e.g., `python-crfsuite` for CRF)

warnings.filterwarnings("ignore")  # Suppress warnings for cleaner output

# Initialize dataset parameters
k_fold = 5  # Number of cross-validation folds; change as needed
dataset_percent = 100  # Percentage of the dataset to use; change as needed

# Initialize results CSV files and columns
output_file = input_file.replace('.csv', '_results.csv')
class_metrics_file = input_file.replace('.csv', '_class_results.csv')
csv_columns = ['Algorithm', 'Fold', 'Train Time (s)', 'Test Time (s)', 'Accuracy', 'Precision', 'Recall', 'F1 Score',
               'Fbeta Score', 'Matthews Correlation Coefficient', 'Jaccard Score', 'Cohen Kappa Score',
               'Hamming Loss', 'Zero One Loss', 'Mean Absolute Error', 'Mean Squared Error', 'Root Mean Squared Error',
               'Balanced Accuracy', 'R2 Score']

# Per-class metrics CSV columns
class_metrics_columns = ['Algorithm', 'Fold', 'Class', 'Accuracy', 'Precision', 'Recall', 'F1 Score',
               'Fbeta Score', 'Matthews Correlation Coefficient', 'Jaccard Score', 'Cohen Kappa Score',
               'Hamming Loss', 'Zero One Loss', 'Mean Absolute Error', 'Mean Squared Error', 'Root Mean Squared Error',
               'Balanced Accuracy', 'R2 Score']

def compute_classwise_metrics(y_true, y_pred):
    """Compute one-vs-rest metrics for each class.

    Relies on the global `class_names` produced by the LabelEncoder further below.
    """
    class_metrics = defaultdict(dict)
    classes = np.unique(y_true)

    for class_index in classes:
        true_class_name = class_names[class_index]
        # Binarize: 1 for the current class, 0 for everything else
        y_true_class = (y_true == class_index).astype(int)
        y_pred_class = (y_pred == class_index).astype(int)

        # Metrics for this class, rounded to 3 decimal places
        class_metrics[true_class_name]['Accuracy'] = round(accuracy_score(y_true_class, y_pred_class), 3)
        class_metrics[true_class_name]['Precision'] = round(precision_score(y_true, y_pred, labels=[class_index], average='weighted', zero_division=1), 3)
        class_metrics[true_class_name]['Recall'] = round(recall_score(y_true, y_pred, labels=[class_index], average='weighted', zero_division=1), 3)
        class_metrics[true_class_name]['F1 Score'] = round(f1_score(y_true, y_pred, labels=[class_index], average='weighted', zero_division=1), 3)
        class_metrics[true_class_name]['Fbeta Score'] = round(fbeta_score(y_true, y_pred, labels=[class_index], beta=1.0, average='weighted', zero_division=1), 3)  # beta=1.0 makes this identical to F1
        class_metrics[true_class_name]['Matthews Correlation Coefficient'] = round(matthews_corrcoef(y_true_class, y_pred_class), 3)
        class_metrics[true_class_name]['Jaccard Score'] = round(jaccard_score(y_true, y_pred, labels=[class_index], average='weighted', zero_division=1), 3)
        class_metrics[true_class_name]['Cohen Kappa Score'] = round(cohen_kappa_score(y_true_class, y_pred_class), 3)
        class_metrics[true_class_name]['Hamming Loss'] = round(hamming_loss(y_true_class, y_pred_class), 3)
        class_metrics[true_class_name]['Zero One Loss'] = round(zero_one_loss(y_true_class, y_pred_class), 3)
        class_metrics[true_class_name]['Mean Absolute Error'] = round(mean_absolute_error(y_true_class, y_pred_class), 3)
        class_metrics[true_class_name]['Mean Squared Error'] = round(mean_squared_error(y_true_class, y_pred_class), 3)
        class_metrics[true_class_name]['Root Mean Squared Error'] = round(np.sqrt(class_metrics[true_class_name]['Mean Squared Error']), 3)
        class_metrics[true_class_name]['Balanced Accuracy'] = round(balanced_accuracy_score(y_true_class, y_pred_class), 3)
        class_metrics[true_class_name]['R2 Score'] = round(r2_score(y_true_class, y_pred_class), 3)

    return class_metrics

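# Illustrative (commented) note on the return shape: compute_classwise_metrics
# yields a dict keyed by the original class name, e.g. once the pipeline below runs,
# {'Benign': {'Accuracy': 0.99, ...}, 'DDoS': {...}, ...} (values here are made up).
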
# Overall (multiclass) metric calculation
def compute_metrics(y_true, y_pred):
    metrics = {}
    metrics['Accuracy'] = accuracy_score(y_true, y_pred)
    metrics['Precision'] = precision_score(y_true, y_pred, average='weighted', zero_division=1)
    metrics['Recall'] = recall_score(y_true, y_pred, average='weighted', zero_division=1)
    metrics['F1 Score'] = f1_score(y_true, y_pred, average='weighted', zero_division=1)
    metrics['Fbeta Score'] = fbeta_score(y_true, y_pred, beta=1.0, average='weighted', zero_division=1)
    metrics['Matthews Correlation Coefficient'] = matthews_corrcoef(y_true, y_pred)
    metrics['Jaccard Score'] = jaccard_score(y_true, y_pred, average='weighted', zero_division=1)
    metrics['Cohen Kappa Score'] = cohen_kappa_score(y_true, y_pred)
    metrics['Hamming Loss'] = hamming_loss(y_true, y_pred)
    metrics['Zero One Loss'] = zero_one_loss(y_true, y_pred)
    metrics['Mean Absolute Error'] = mean_absolute_error(y_true, y_pred)
    metrics['Mean Squared Error'] = mean_squared_error(y_true, y_pred)
    metrics['Root Mean Squared Error'] = np.sqrt(metrics['Mean Squared Error'])
    metrics['Balanced Accuracy'] = balanced_accuracy_score(y_true, y_pred)
    metrics['R2 Score'] = r2_score(y_true, y_pred)

    return metrics
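
# Illustrative (commented) usage of compute_metrics on toy labels -- not part of
# the pipeline, just a sketch of the returned dictionary:
# y_true_demo = np.array([0, 1, 2, 2, 1])
# y_pred_demo = np.array([0, 2, 2, 2, 1])
# m = compute_metrics(y_true_demo, y_pred_demo)
# print(m['Accuracy'])   # 0.8 (4 of 5 correct)
# print(m['F1 Score'])   # weighted-average F1 across the three classes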

# Function to handle each algorithm execution
def run_algorithm(algo_name, model, X_train, y_train, X_test, y_test, fold):
    """Run a single algorithm and log its results."""
    try:
        start_train = time.time()
        model.fit(X_train, y_train)
        train_time = time.time() - start_train

        start_test = time.time()
        y_pred = model.predict(X_test)
        test_time = time.time() - start_test

        # Compute metrics
        if algo_name == 'ElasticNet':  # ElasticNet is a regressor, so only regression metrics apply
            metrics = {}
            metrics['Mean Absolute Error'] = mean_absolute_error(y_test, y_pred)
            metrics['Mean Squared Error'] = mean_squared_error(y_test, y_pred)
            metrics['Root Mean Squared Error'] = np.sqrt(metrics['Mean Squared Error'])
            metrics['R2 Score'] = r2_score(y_test, y_pred)
            class_metrics = {}  # Continuous predictions have no classes to score
        else:
            # Compute classification metrics
            metrics = compute_metrics(y_test, y_pred)
            # Compute class-wise metrics
            class_metrics = compute_classwise_metrics(y_test, y_pred)

        metrics.update({'Train Time (s)': train_time, 'Test Time (s)': test_time})

        # Log overall results to CSV (metrics that were not computed are written as -1)
        with open(output_file, 'a', newline='') as f:
            writer = csv.writer(f)
            writer.writerow([algo_name, fold] + [metrics.get(m, -1) for m in csv_columns[2:]])

        # Write per-class metrics to the per-class results CSV
        with open(class_metrics_file, 'a', newline='') as f:
            writer = csv.writer(f)
            for class_label, cm in class_metrics.items():
                writer.writerow([algo_name, fold, class_label] + [cm.get(m, -1) for m in class_metrics_columns[3:]])

        print(f"{algo_name} | Fold: {fold} | Train Time: {train_time:.2f}s | Test Time: {test_time:.2f}s")

    except Exception:
        print(f"Error in {algo_name}: {traceback.format_exc()}")
        # Log the failure as a row of -1s so the fold is still accounted for
        with open(output_file, 'a', newline='') as f:
            writer = csv.writer(f)
            writer.writerow([algo_name, fold] + [-1 for _ in csv_columns[2:]])

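# A minimal (commented) sketch of calling run_algorithm outside the k-fold loop,
# assuming X and y have already been encoded and scaled as below:
# X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
# run_algorithm('Decision Tree', DecisionTreeClassifier(), X_tr, y_tr, X_te, y_te, fold=0)
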
# Load dataset
df = pd.read_csv(input_file)
X = df.iloc[:, :-1]  # All columns except the last are features
y = df.iloc[:, -1]   # Last column is the label

# Encode categorical features and labels
X = pd.get_dummies(X)
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)
class_names = label_encoder.classes_

# Min-Max scaling (note: fitting on the full dataset leaks test-fold statistics;
# for stricter evaluation, fit the scaler inside each fold on the training split only)
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

# Take a stratified subset of the dataset if dataset_percent is less than 100
if dataset_percent < 100:
    _, X, _, y = train_test_split(X, y, test_size=dataset_percent/100, stratify=y)

# Write CSV headers if the files do not exist yet
if not os.path.exists(output_file):
    with open(output_file, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(csv_columns)

if not os.path.exists(class_metrics_file):
    with open(class_metrics_file, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(class_metrics_columns)

# K-Fold Cross Validation
kf = KFold(n_splits=k_fold, shuffle=True, random_state=42)

# Deep learning model builders (CNN, RNN, LSTM, GRU, autoencoder).
# Note: the shapes below assume 28x28 image-style input and 10 classes (MNIST-like);
# they are not wired into the tabular pipeline by default (see the commented
# entries in the algorithms dict and the adaptation sketch below).
def create_cnn(input_shape):
    model = Sequential()
    model.add(Conv2D(32, (3, 3), activation='relu', input_shape=input_shape))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Flatten())
    model.add(Dense(128, activation='relu'))
    model.add(Dense(10, activation='softmax'))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

def create_rnn(input_shape):
    model = Sequential()
    model.add(SimpleRNN(128, input_shape=input_shape, return_sequences=True))
    model.add(Flatten())
    model.add(Dense(10, activation='softmax'))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

def create_lstm(input_shape):
    model = Sequential()
    model.add(LSTM(128, input_shape=input_shape, return_sequences=True))
    model.add(Flatten())
    model.add(Dense(10, activation='softmax'))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

def create_gru(input_shape):
    model = Sequential()
    model.add(GRU(128, input_shape=input_shape, return_sequences=True))
    model.add(Flatten())
    model.add(Dense(10, activation='softmax'))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

def create_autoencoder(input_shape):
    model = Sequential()
    model.add(Dense(128, input_shape=input_shape, activation='relu'))
    model.add(Dense(input_shape[0], activation='sigmoid'))
    model.compile(optimizer='adam', loss='mse')
    return model
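
# A commented sketch of adapting the sequence models to this tabular data, assuming
# n_features columns: treat each feature as one timestep of a length-n_features
# sequence and one-hot encode the labels. Keras models also need their own fit
# arguments (epochs, batch_size), so run_algorithm would need a small extension.
# n_features = X.shape[1]
# n_classes = len(class_names)
# X_seq = X.reshape(-1, n_features, 1)              # (samples, timesteps, channels)
# y_cat = to_categorical(y, num_classes=n_classes)  # one-hot labels
# lstm = create_lstm((n_features, 1))               # adjust Dense(10) to Dense(n_classes) first
# lstm.fit(X_seq, y_cat, epochs=5, batch_size=256, verbose=0)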

# List of algorithms
algorithms = {
    'Naive Bayes': GaussianNB(),
    'LDA': LinearDiscriminantAnalysis(),
    'QDA': QuadraticDiscriminantAnalysis(),
    'SVM': SVC(kernel='linear', max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(),
    'SGD Classifier': SGDClassifier(),
    'KNN': KNeighborsClassifier(),
    'ElasticNet': ElasticNet(),  # Regression model; handled separately in run_algorithm
    'Perceptron': Perceptron(),
    'Logistic Regression': LogisticRegression(),
    'Bagging': BaggingClassifier(),
    'K-Means': KMeans(n_clusters=3),  # Unsupervised; cluster ids will not match the label encoding
    'Nearest Centroid Classifier': NearestCentroid(),
    'XGBoost': XGBClassifier(),
    'AdaBoost': AdaBoostClassifier(),
    'RBM + Logistic Regression': Pipeline(steps=[('rbm', BernoulliRBM(n_components=100, learning_rate=0.06, n_iter=10, random_state=42)), ('logistic', LogisticRegression())]),
    'Voting Classifier': VotingClassifier(estimators=[('lr', LogisticRegression()), ('rf', RandomForestClassifier()), ('gnb', GaussianNB())], voting='hard'),
    'Random Forest': RandomForestClassifier(n_estimators=10),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=10),
    'Stacking Classifier': StackingClassifier(estimators=[('log_reg', LogisticRegression()), ('knn', KNeighborsClassifier(n_neighbors=3))], final_estimator=LogisticRegression(), n_jobs=-1),
    'LightGBM': LGBMClassifier(),
    'CatBoost': CatBoostClassifier(),
    'Self-Training': SelfTrainingClassifier(LogisticRegression()),
    'Isolation Forest': IsolationForest(),  # Anomaly detector; predicts -1/1 (see sketch below)
    # 'MLP Classifier': MLPClassifier(),
    # 'RNN': create_rnn((28, 28)),
    # 'GRU': create_gru((28, 28)),
    # 'LSTM': create_lstm((28, 28)),
    # 'CNN': create_cnn((28, 28, 1)),
    # 'Autoencoder': create_autoencoder((28,)),
    # 'One-Class SVM': OneClassSVM(kernel='linear', max_iter=1000),
    # 'Deep Belief Network': "Implement DBN",  # Placeholder for DBN
    # 'Restricted Boltzmann Machine': "Implement RBM",  # Placeholder for RBM
    # 'Genetic Algorithm': ga.GeneticAlgorithm(),  # Placeholder for a GA-based classifier
    # 'Bayesian Network': BayesianNetwork([('A', 'B'), ('B', 'C')]),  # Example Bayesian Network
    # 'Fuzzy Logic': "Implement Fuzzy Logic",  # Placeholder for fuzzy logic systems
    # 'Conditional Random Field (CRF)': "Implement CRF",  # Placeholder for CRF
}
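
# IsolationForest.predict returns -1 (anomaly) / 1 (normal), which will not match the
# label-encoded classes above, so its logged metrics are not directly meaningful.
# A commented sketch of mapping its output onto a binary ground truth, assuming a
# hypothetical `benign_class_index` for the dataset's normal-traffic class:
# iso = IsolationForest().fit(X_train)
# y_pred_binary = (iso.predict(X_test) == -1).astype(int)    # 1 = flagged as anomaly
# y_true_binary = (y_test != benign_class_index).astype(int)  # 1 = actual attack
# print(f1_score(y_true_binary, y_pred_binary))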

# Running algorithms in k-fold cross validation
for fold, (train_idx, test_idx) in enumerate(kf.split(X), 1):
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    for algo_name, model in algorithms.items():
        # if algo_name in ('CNN', 'RNN', 'LSTM', 'GRU', 'Autoencoder'):
        #     # Special handling for deep learning models (assumes 28x28 image input and 10 classes)
        #     X_train_dl = X_train.reshape(-1, 28, 28, 1)
        #     X_test_dl = X_test.reshape(-1, 28, 28, 1)
        #     y_train_dl = to_categorical(y_train, num_classes=10)
        #     y_test_dl = to_categorical(y_test, num_classes=10)
        #     run_algorithm(algo_name, model, X_train_dl, y_train_dl, X_test_dl, y_test_dl, fold)
        # else:
        run_algorithm(algo_name, model, X_train, y_train, X_test, y_test, fold)
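
# Optional (commented) post-processing sketch: average each algorithm's metrics
# across folds from the results CSV, assuming the run above completed:
# results = pd.read_csv(output_file)
# summary = results.groupby('Algorithm')[csv_columns[2:]].mean().round(3)
# summary.to_csv(input_file.replace('.csv', '_summary.csv'))
# print(summary[['Accuracy', 'F1 Score', 'Train Time (s)']])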