import numpy as np
import pandas as pd
import shap
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Grey Wolf Optimizer for binary feature selection
class GreyWolfOptimizer:
    def __init__(self, fitness_function, n_agents, n_iterations, n_features):
        self.fitness_function = fitness_function
        self.n_agents = n_agents
        self.n_iterations = n_iterations
        self.n_features = n_features
        self.alpha = None  # Best agent found so far
        self.beta = None   # Second best
        self.delta = None  # Third best

    def initialize_population(self):
        # Keep positions continuous in [0, 1]; they are binarized (> 0.5)
        # whenever a feature mask is needed. Writing the continuous updates
        # into an integer array would silently truncate them to 0.
        return np.random.random((self.n_agents, self.n_features))

    def _rank_leaders(self, binary, fitness):
        order = np.argsort(fitness)
        self.alpha = binary[order[0]]
        self.beta = binary[order[1]]
        self.delta = binary[order[2]]

    def optimize(self):
        population = self.initialize_population()
        binary = (population > 0.5).astype(int)
        fitness = np.array([self.fitness_function(ind) for ind in binary])
        self._rank_leaders(binary, fitness)
        for iteration in range(self.n_iterations):
            a = 2 - iteration * (2 / self.n_iterations)  # Decreases linearly from 2 to 0
            for i in range(self.n_agents):
                for j in range(self.n_features):
                    # Pull towards the alpha wolf
                    r1, r2 = np.random.random(), np.random.random()
                    A1, C1 = 2 * a * r1 - a, 2 * r2
                    D_alpha = abs(C1 * self.alpha[j] - population[i, j])
                    X1 = self.alpha[j] - A1 * D_alpha
                    # Pull towards the beta wolf
                    r1, r2 = np.random.random(), np.random.random()
                    A2, C2 = 2 * a * r1 - a, 2 * r2
                    D_beta = abs(C2 * self.beta[j] - population[i, j])
                    X2 = self.beta[j] - A2 * D_beta
                    # Pull towards the delta wolf
                    r1, r2 = np.random.random(), np.random.random()
                    A3, C3 = 2 * a * r1 - a, 2 * r2
                    D_delta = abs(C3 * self.delta[j] - population[i, j])
                    X3 = self.delta[j] - A3 * D_delta
                    # Final position: average of the three pulls, kept in [0, 1]
                    population[i, j] = np.clip((X1 + X2 + X3) / 3, 0, 1)
            # Binarize for evaluation and re-rank alpha, beta, delta
            binary = (population > 0.5).astype(int)
            fitness = np.array([self.fitness_function(ind) for ind in binary])
            self._rank_leaders(binary, fitness)
        return self.alpha
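
# Illustrative smoke test (not in the original paste): with a toy fitness that
# just counts selected bits, the alpha wolf should drift towards all zeros.
# gwo_demo = GreyWolfOptimizer(lambda ind: ind.sum(), n_agents=5,
#                              n_iterations=10, n_features=8)
# print(gwo_demo.optimize())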

# Fitness function for GWO: 1 - accuracy of the base model on the candidate subset
def fitness_function(features):
    selected_features = np.where(features == 1)[0]
    if len(selected_features) == 0:  # Penalize empty feature subsets
        return 1e10
    X_train_sel = X_train.iloc[:, selected_features]
    X_test_sel = X_test.iloc[:, selected_features]
    model = clone(clf)  # Fresh, unfitted copy of the base model
    model.fit(X_train_sel, y_train)
    y_pred = model.predict(X_test_sel)
    # Caveat: scoring on the test set during selection is optimistic; a stricter
    # setup would score on a validation split or via cross-validation on X_train.
    return 1 - accuracy_score(y_test, y_pred)  # Minimize (1 - accuracy)
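
# Illustrative only (not in the original paste): once X_train, y_train and clf
# exist further down, a single random mask can sanity-check the fitness:
# print(fitness_function(np.random.randint(0, 2, X_train.shape[1])))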

# Load the UNSW-NB15 dataset
file_path = "UNSW-NB15.csv"  # Replace with your actual file path
data = pd.read_csv(file_path)

# Check the dataset structure
print("Dataset Preview:")
print(data.head())

# Preprocessing: drop the target and ID columns. If the CSV also carries an
# 'attack_cat' column (as the common Kaggle split of UNSW-NB15 does), drop it
# too, since it encodes the attack label and would leak it into the features.
drop_cols = [c for c in ('label', 'id', 'attack_cat') if c in data.columns]
X = data.drop(columns=drop_cols)
y = data['label']
X = pd.get_dummies(X, drop_first=True)  # One-hot encode categorical features
scaler = MinMaxScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# Use a stratified 10% subset of the data to keep the GWO search tractable
X, _, y, _ = train_test_split(X, y, test_size=0.9, random_state=42, stratify=y)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Initialize the base classifier
clf = RandomForestClassifier(n_estimators=50, random_state=42)

# Apply GWO for feature selection
gwo = GreyWolfOptimizer(fitness_function, n_agents=10, n_iterations=20, n_features=X_train.shape[1])
best_features = gwo.optimize()

# Select the surviving feature columns
selected_features = np.where(best_features == 1)[0]
X_train_selected = X_train.iloc[:, selected_features]
X_test_selected = X_test.iloc[:, selected_features]
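
# Not in the original paste: report which columns survived the GWO search.
print(f"\nGWO kept {len(selected_features)} of {X_train.shape[1]} features:")
print(list(X_train_selected.columns))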

# Train the classifier with the selected features
clf.fit(X_train_selected, y_train)

# Evaluate the model
y_pred = clf.predict(X_test_selected)
print("\nClassification Report with Selected Features:")
print(classification_report(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy with Selected Features: {accuracy:.2f}")

# SHAP analysis on a 100-row sample of the test set
explainer = shap.TreeExplainer(clf)
shap_sample = X_test_selected.sample(100, random_state=42)
shap_values = explainer.shap_values(shap_sample)
# Older shap versions return a list of per-class arrays; newer ones return a
# single (samples, features, classes) array. Take the positive class either way.
shap_values_pos = shap_values[1] if isinstance(shap_values, list) else shap_values[..., 1]

# SHAP summary plot
print("\nSHAP Summary Plot for Selected Features:")
shap.summary_plot(shap_values_pos, shap_sample, feature_names=X_train_selected.columns)
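
# Optional (not in the original paste): a bar-style summary of mean |SHAP|
# values, which is often easier to read in written reports.
shap.summary_plot(shap_values_pos, shap_sample, plot_type="bar",
                  feature_names=X_train_selected.columns)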