Advertisement
mayankjoin3

ml 25 algo classification

Oct 20th, 2024
177
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 8.87 KB | None | 0 0
  1. # Initialize dataset parameters
  2. #Change the filename.
  3.  
  4. input_file = 'CIC_IOT_2023_combined.csv'
  5.  
  6.  
  7. import pandas as pd
  8. import numpy as np
  9. import time
  10. from sklearn.ensemble import ExtraTreesClassifier, HistGradientBoostingClassifier
  11. from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
  12. from sklearn.preprocessing import LabelEncoder, MinMaxScaler
  13. from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, fbeta_score, matthews_corrcoef
  14. from sklearn.metrics import jaccard_score, cohen_kappa_score, hamming_loss, zero_one_loss, mean_absolute_error
  15. from sklearn.metrics import mean_squared_error, r2_score, balanced_accuracy_score
  16. from sklearn.tree import DecisionTreeClassifier
  17. from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier, StackingClassifier
  18. from sklearn.neighbors import KNeighborsClassifier
  19. from sklearn.svm import SVC, OneClassSVM
  20. from sklearn.naive_bayes import GaussianNB
  21. from sklearn.linear_model import LogisticRegression, SGDClassifier, Perceptron, ElasticNet
  22. from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
  23. from sklearn.neural_network import MLPClassifier
  24. from xgboost import XGBClassifier
  25. from lightgbm import LGBMClassifier
  26. from catboost import CatBoostClassifier
  27. from sklearn.cluster import KMeans, AgglomerativeClustering
  28. from sklearn.gaussian_process import GaussianProcessClassifier
  29. from sklearn.neighbors import NearestCentroid
  30. from sklearn.mixture import GaussianMixture
  31. from sklearn.ensemble import IsolationForest
  32. from sklearn.pipeline import Pipeline
  33. from sklearn.neural_network import BernoulliRBM
  34. from sklearn.experimental import enable_iterative_imputer
  35. from sklearn.impute import IterativeImputer
  36. from keras.models import Sequential
  37. from keras.layers import Dense, Conv2D, MaxPooling2D, Flatten, LSTM, GRU, SimpleRNN
  38. from keras.utils import to_categorical
  39. import traceback
  40. import csv
  41. import warnings
  42. from sklearn.semi_supervised import SelfTrainingClassifier
  43.  
  44. warnings.filterwarnings("ignore")
  45.  
  46. k_fold = 5
  47. dataset_percent = 100
  48.  
  49.  
  50. # Initialize CSV file and columns
  51. output_file = input_file.replace('.csv', '_results.csv')
  52. csv_columns = ['Algorithm', 'Fold', 'Train Time (s)', 'Test Time (s)', 'Accuracy', 'Precision', 'Recall', 'F1 Score',
  53.                'Fbeta Score', 'Matthews Correlation Coefficient', 'Jaccard Score', 'Cohen Kappa Score',
  54.                'Hamming Loss', 'Zero One Loss', 'Mean Absolute Error', 'Mean Squared Error', 'Root Mean Squared Error',
  55.                'Balanced Accuracy', 'R2 Score']
  56.  
  57. # Function to handle metric calculation
  58. def compute_metrics(y_true, y_pred):
  59.     metrics = {}
  60.     metrics['Accuracy'] = accuracy_score(y_true, y_pred)
  61.     metrics['Precision'] = precision_score(y_true, y_pred, average='weighted', zero_division=1)
  62.     metrics['Recall'] = recall_score(y_true, y_pred, average='weighted', zero_division=1)
  63.     metrics['F1 Score'] = f1_score(y_true, y_pred, average='weighted', zero_division=1)
  64.     metrics['Fbeta Score'] = fbeta_score(y_true, y_pred, beta=1.0, average='weighted', zero_division=1)
  65.     metrics['Matthews Correlation Coefficient'] = matthews_corrcoef(y_true, y_pred)
  66.     metrics['Jaccard Score'] = jaccard_score(y_true, y_pred, average='weighted', zero_division=1)
  67.     metrics['Cohen Kappa Score'] = cohen_kappa_score(y_true, y_pred)
  68.     metrics['Hamming Loss'] = hamming_loss(y_true, y_pred)
  69.     metrics['Zero One Loss'] = zero_one_loss(y_true, y_pred)
  70.     metrics['Mean Absolute Error'] = mean_absolute_error(y_true, y_pred)
  71.     metrics['Mean Squared Error'] = mean_squared_error(y_true, y_pred)
  72.     metrics['Root Mean Squared Error'] = np.sqrt(metrics['Mean Squared Error'])
  73.     metrics['Balanced Accuracy'] = balanced_accuracy_score(y_true, y_pred)
  74.     metrics['R2 Score'] = r2_score(y_true, y_pred)
  75.    
  76.     return metrics
  77.  
  78. # Function to handle each algorithm execution
  79. def run_algorithm(algo_name, model, X_train, y_train, X_test, y_test, fold):
  80.     try:
  81.         start_train = time.time()
  82.         model.fit(X_train, y_train)
  83.         train_time = time.time() - start_train
  84.  
  85.         start_test = time.time()
  86.         y_pred = model.predict(X_test)
  87.         test_time = time.time() - start_test
  88.  
  89.         # Compute metrics
  90.         if algo_name == 'ElasticNet':  # Handle ElasticNet as a regression model
  91.             metrics = {}
  92.             metrics['Mean Absolute Error'] = mean_absolute_error(y_test, y_pred)
  93.             metrics['Mean Squared Error'] = mean_squared_error(y_test, y_pred)
  94.             metrics['Root Mean Squared Error'] = np.sqrt(metrics['Mean Squared Error'])
  95.             metrics['R2 Score'] = r2_score(y_test, y_pred)
  96.         else:
  97.             # Compute classification metrics
  98.             metrics = compute_metrics(y_test, y_pred)
  99.         metrics.update({'Train Time (s)': train_time, 'Test Time (s)': test_time})
  100.        
  101.         # Log results to CSV
  102.         with open(output_file, 'a', newline='') as f:
  103.             writer = csv.writer(f)
  104.             writer.writerow([algo_name, fold] + [metrics.get(m, -1) for m in csv_columns[2:]])
  105.  
  106.         print(f"{algo_name} | Fold: {fold} | Train Time: {train_time:.2f}s | Test Time: {test_time:.2f}s")
  107.        
  108.     except Exception as e:
  109.         # Log error case
  110.         with open(output_file, 'a', newline='') as f:
  111.             writer = csv.writer(f)
  112.             writer.writerow([algo_name, fold] + [-1 for _ in csv_columns[2:]])
  113.         print(f"Error in {algo_name}: {traceback.format_exc()}")
  114.  
  115. # Load dataset
  116. df = pd.read_csv(input_file)
  117. X = df.iloc[:, :-1]
  118. y = df.iloc[:, -1]
  119.  
  120. # Encode categorical features
  121. label_encoder = LabelEncoder()
  122. for column in X.columns:
  123.     if X[column].dtype == 'object' or X[column].dtype.name == 'category':
  124.         X[column] = label_encoder.fit_transform(X[column])
  125. y = LabelEncoder().fit_transform(y)
  126.  
  127. # Apply iterative imputation to handle missing data
  128. imputer = IterativeImputer()
  129. X = imputer.fit_transform(X)
  130.  
  131. # Min-Max scaling
  132. scaler = MinMaxScaler()
  133. X = scaler.fit_transform(X)
  134.  
  135. # Take a subset of the dataset if dataset_percent is less than 100
  136. if dataset_percent < 100:
  137.     X, _, y, _ = train_test_split(X, y, train_size=dataset_percent/100, stratify=y, random_state=42)
  138.  
  139. # Prepare CSV header if not present
  140. if not pd.io.common.file_exists(output_file):
  141.     with open(output_file, 'w', newline='') as f:
  142.         writer = csv.writer(f)
  143.         writer.writerow(csv_columns)
  144.  
  145. # K-Fold Cross Validation
  146. kf = StratifiedKFold(n_splits=k_fold, shuffle=True, random_state=42)
  147.  
  148. # List of algorithms
  149. algorithms = {
  150.     'Naive Bayes': GaussianNB(),
  151.     'LDA': LinearDiscriminantAnalysis(),
  152.     'QDA': QuadraticDiscriminantAnalysis(),
  153.     'Decision Tree': DecisionTreeClassifier(),
  154.     'SGD Classifier': SGDClassifier(),
  155.     'KNN': KNeighborsClassifier(),
  156.     'ElasticNet': ElasticNet(),
  157.     'Perceptron': Perceptron(),
  158.     'Logistic Regression': LogisticRegression(),
  159.     'Bagging': BaggingClassifier(),
  160.     'K-Means': KMeans(n_clusters=3),
  161.     'Nearest Centroid Classifier': NearestCentroid(),
  162.     'XGBoost': XGBClassifier(),
  163.     'AdaBoost': AdaBoostClassifier(),
  164.     'RBM + Logistic Regression': Pipeline(steps=[('rbm', BernoulliRBM(n_components=
  165.     0, learning_rate=0.06, n_iter=10, random_state=42)),('logistic', LogisticRegression())]),
  166.     'Voting Classifier': VotingClassifier(estimators=[('lr', LogisticRegression()),('rf', RandomForestClassifier()),('gnb', GaussianNB())], voting='hard'),
  167.     'Random Forest': RandomForestClassifier(n_estimators=10),
  168.     'Gradient Boosting': GradientBoostingClassifier(n_estimators=10),
  169.     'Stacking Classifier': StackingClassifier(estimators=[('log_reg', LogisticRegression()),('knn', KNeighborsClassifier(n_neighbors=3))],final_estimator=LogisticRegression(),n_jobs=-1),
  170.     'LightGBM': LGBMClassifier(),
  171.     'CatBoost': CatBoostClassifier(),
  172.     'Self-Training': SelfTrainingClassifier(LogisticRegression()),
  173.     'Isolation Forest': IsolationForest(),
  174.     'Extra Trees Classifier': ExtraTreesClassifier(n_estimators=100),
  175.     'HistGradientBoostingClassifier': HistGradientBoostingClassifier(max_iter=100, validation_fraction=None),
  176. }
  177.  
  178. # Running algorithms in k-fold
  179. for fold, (train_idx, test_idx) in enumerate(kf.split(X, y), 1):
  180.     X_train, X_test = X[train_idx], X[test_idx]
  181.     y_train, y_test = y[train_idx], y[test_idx]
  182.  
  183.     for algo_name, model in algorithms.items():
  184.         run_algorithm(algo_name, model, X_train, y_train, X_test, y_test, fold)
  185.  
  186. print("All algorithms have been executed. Results are saved in", output_file)
  187.  
  188.  
  189. # Load the CSV file
  190.  
  191. df = pd.read_csv(output_file)
  192.  
  193. # Sort the dataframe by the 'F1 Score' column in descending order
  194. df_sorted = df.sort_values(by='F1 Score', ascending=False)
  195.  
  196. # Save the sorted dataframe to a new CSV file
  197. df_sorted.to_csv(output_file, index=False)
  198.  
  199. # Print the first few rows to verify
  200. print(df_sorted.head())
  201.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement