import numpy as np
import pandas as pd
import shap
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Grey Wolf Optimizer for binary feature selection
class GreyWolfOptimizer:
    def __init__(self, fitness_function, n_agents, n_iterations, n_features):
        self.fitness_function = fitness_function
        self.n_agents = n_agents
        self.n_iterations = n_iterations
        self.n_features = n_features
        self.alpha = None  # Best agent found so far
        self.beta = None   # Second best
        self.delta = None  # Third best

    def initialize_population(self):
        # Keep positions continuous in [0, 1]; they are binarized (> 0.5)
        # whenever a feature mask is needed. Writing the continuous updates
        # into an integer array would silently truncate them to 0.
        return np.random.random((self.n_agents, self.n_features))

    def _rank_leaders(self, binary, fitness):
        order = np.argsort(fitness)
        self.alpha = binary[order[0]]
        self.beta = binary[order[1]]
        self.delta = binary[order[2]]

    def optimize(self):
        population = self.initialize_population()
        binary = (population > 0.5).astype(int)
        fitness = np.array([self.fitness_function(ind) for ind in binary])
        self._rank_leaders(binary, fitness)
        for iteration in range(self.n_iterations):
            a = 2 - iteration * (2 / self.n_iterations)  # Decreases linearly from 2 to 0
            for i in range(self.n_agents):
                for j in range(self.n_features):
                    # Pull towards the alpha wolf
                    r1, r2 = np.random.random(), np.random.random()
                    A1, C1 = 2 * a * r1 - a, 2 * r2
                    D_alpha = abs(C1 * self.alpha[j] - population[i, j])
                    X1 = self.alpha[j] - A1 * D_alpha
                    # Pull towards the beta wolf
                    r1, r2 = np.random.random(), np.random.random()
                    A2, C2 = 2 * a * r1 - a, 2 * r2
                    D_beta = abs(C2 * self.beta[j] - population[i, j])
                    X2 = self.beta[j] - A2 * D_beta
                    # Pull towards the delta wolf
                    r1, r2 = np.random.random(), np.random.random()
                    A3, C3 = 2 * a * r1 - a, 2 * r2
                    D_delta = abs(C3 * self.delta[j] - population[i, j])
                    X3 = self.delta[j] - A3 * D_delta
                    # Final position: average of the three pulls, kept in [0, 1]
                    population[i, j] = np.clip((X1 + X2 + X3) / 3, 0, 1)
            # Binarize for evaluation and re-rank alpha, beta, delta
            binary = (population > 0.5).astype(int)
            fitness = np.array([self.fitness_function(ind) for ind in binary])
            self._rank_leaders(binary, fitness)
        return self.alpha
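
# Illustrative smoke test (not in the original paste): with a toy fitness that
# just counts selected bits, the alpha wolf should drift towards all zeros.
# gwo_demo = GreyWolfOptimizer(lambda ind: ind.sum(), n_agents=5,
#                              n_iterations=10, n_features=8)
# print(gwo_demo.optimize())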

# Fitness function for GWO: 1 - accuracy of the base model on the candidate subset
def fitness_function(features):
    selected_features = np.where(features == 1)[0]
    if len(selected_features) == 0:  # Penalize empty feature subsets
        return 1e10
    X_train_sel = X_train.iloc[:, selected_features]
    X_test_sel = X_test.iloc[:, selected_features]
    model = clone(clf)  # Fresh, unfitted copy of the base model
    model.fit(X_train_sel, y_train)
    y_pred = model.predict(X_test_sel)
    # Caveat: scoring on the test set during selection is optimistic; a stricter
    # setup would score on a validation split or via cross-validation on X_train.
    return 1 - accuracy_score(y_test, y_pred)  # Minimize (1 - accuracy)
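
# Illustrative only (not in the original paste): once X_train, y_train and clf
# exist further down, a single random mask can sanity-check the fitness:
# print(fitness_function(np.random.randint(0, 2, X_train.shape[1])))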

# Load the UNSW-NB15 dataset
file_path = "UNSW-NB15.csv"  # Replace with your actual file path
data = pd.read_csv(file_path)

# Check the dataset structure
print("Dataset Preview:")
print(data.head())

# Preprocessing: drop the target and ID columns. If the CSV also carries an
# 'attack_cat' column (as the common Kaggle split of UNSW-NB15 does), drop it
# too, since it encodes the attack label and would leak it into the features.
drop_cols = [c for c in ('label', 'id', 'attack_cat') if c in data.columns]
X = data.drop(columns=drop_cols)
y = data['label']
X = pd.get_dummies(X, drop_first=True)  # One-hot encode categorical features
scaler = MinMaxScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

# Use a stratified 10% subset of the data to keep the GWO search tractable
X, _, y, _ = train_test_split(X, y, test_size=0.9, random_state=42, stratify=y)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Initialize the base classifier
clf = RandomForestClassifier(n_estimators=50, random_state=42)

# Apply GWO for feature selection
gwo = GreyWolfOptimizer(fitness_function, n_agents=10, n_iterations=20, n_features=X_train.shape[1])
best_features = gwo.optimize()

# Select the surviving feature columns
selected_features = np.where(best_features == 1)[0]
X_train_selected = X_train.iloc[:, selected_features]
X_test_selected = X_test.iloc[:, selected_features]
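
# Not in the original paste: report which columns survived the GWO search.
print(f"\nGWO kept {len(selected_features)} of {X_train.shape[1]} features:")
print(list(X_train_selected.columns))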

# Train the classifier with the selected features
clf.fit(X_train_selected, y_train)

# Evaluate the model
y_pred = clf.predict(X_test_selected)
print("\nClassification Report with Selected Features:")
print(classification_report(y_test, y_pred))
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy with Selected Features: {accuracy:.2f}")

# SHAP analysis on a 100-row sample of the test set
explainer = shap.TreeExplainer(clf)
shap_sample = X_test_selected.sample(100, random_state=42)
shap_values = explainer.shap_values(shap_sample)
# Older shap versions return a list of per-class arrays; newer ones return a
# single (samples, features, classes) array. Take the positive class either way.
shap_values_pos = shap_values[1] if isinstance(shap_values, list) else shap_values[..., 1]

# SHAP summary plot
print("\nSHAP Summary Plot for Selected Features:")
shap.summary_plot(shap_values_pos, shap_sample, feature_names=X_train_selected.columns)
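
# Optional (not in the original paste): a bar-style summary of mean |SHAP|
# values, which is often easier to read in written reports.
shap.summary_plot(shap_values_pos, shap_sample, plot_type="bar",
                  feature_names=X_train_selected.columns)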