
CW_task4_Gradient_Boosting_with_RandomizedSearchCV

Dec 20th, 2023 (edited)
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.metrics import make_scorer, f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import numpy as np

# Load the training dataset from the saved NumPy file.
data = np.load('C:/Users/print15207/MATLAB Drive/Print HVDC/Smartgrid CW/train_dataset.npy', allow_pickle=True)

# Extract the features (x) and labels (y) from the loaded data:
x = data.item()['feature']
y = data.item()['label']

# Split the data into training and testing sets:
x1 = x[:4800]  # Only classify between class 0 (normal measurement) and class 1 (FDI attack measurement)
y1 = y[:4800]
x_train, x_test, y_train, y_test = train_test_split(x1, y1, test_size=0.313, random_state=42)

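# Note (added sketch): with test_size=0.313 on the 4800 retained samples,
# the split yields 3297 training and 1503 test rows, which matches the
# confusion matrix totals reported at the end of this script.
# print(x_train.shape, x_test.shape)
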
gb_model = GradientBoostingClassifier()

# Define the hyperparameter distributions to sample from
param_dist = {
    'n_estimators': np.arange(50, 201, 10),
    'learning_rate': [0.01, 0.1, 0.2, 0.5],
    'max_depth': [3, 4, 5, 6, 7, 8, 9, 10],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 4, 8]
}

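# Optional sanity check (an added sketch, not part of the original run):
# the grid above spans 16 * 4 * 8 * 4 * 4 = 8192 combinations, of which
# the randomized search below samples only n_iter=50 (~0.6%).
import math
print("Search space size:", math.prod(len(v) for v in param_dist.values()))  # 8192
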
# Define F1 score as the evaluation metric for hyperparameter tuning
scorer = make_scorer(f1_score)

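# Equivalent shorthand (an assumption based on scikit-learn's built-in
# scorer names): passing scoring='f1' to RandomizedSearchCV selects the
# same binary F1 metric without constructing a scorer by hand.
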
# Perform randomized search with 5-fold cross-validation
random_search = RandomizedSearchCV(estimator=gb_model, param_distributions=param_dist, scoring=scorer, cv=5, n_iter=50, random_state=42)
random_search.fit(x_train, y_train)

# Print the best hyperparameters
print("Best Hyperparameters:", random_search.best_params_)

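# Added sketch: the mean cross-validated F1 of the winning candidate is
# also available on the fitted search object.
print("Best CV F1:", random_search.best_score_)
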
# Evaluate the model with the best hyperparameters on the test set
best_model = random_search.best_estimator_
test_predictions = best_model.predict(x_test)
test_f1_score = f1_score(y_test, test_predictions)
print("F1 Score on Test Set with Best Hyperparameters:", test_f1_score)

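# Added sketch: accuracy_score is imported above but never used in the
# original paste; it can be reported alongside F1 for completeness.
print("Accuracy on Test Set:", accuracy_score(y_test, test_predictions))
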
# Evaluate the model using TPR and FPR
conf_matrix = confusion_matrix(y_test, test_predictions)
print("Confusion Matrix:")
print(conf_matrix)
TN, FP, FN, TP = conf_matrix.ravel()

# Calculate TPR and FPR
TPR = TP / (TP + FN)
FPR = FP / (FP + TN)

# Print the metrics
print("True Positive Rate (TPR):", TPR)
print("False Positive Rate (FPR):", FPR)

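# Added cross-check (sketch): TPR is identical to recall of the positive
# class, so the value above can be verified with scikit-learn's recall_score.
from sklearn.metrics import recall_score
print("Recall (should equal TPR):", recall_score(y_test, test_predictions))
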
# Result with elapsed time: 1742 seconds:
# Best Hyperparameters: {'n_estimators': 160, 'min_samples_split': 5, 'min_samples_leaf': 8, 'max_depth': 8, 'learning_rate': 0.5}
# F1 Score on Test Set with Best Hyperparameters: 0.9972640218878249
# Confusion Matrix:
# [[770   0]
#  [  4 729]]
# True Positive Rate (TPR): 0.9945429740791268
# False Positive Rate (FPR): 0.0

# Now you can use the tuned model for predictions on new data. Note that
# gb_model itself is never fitted (RandomizedSearchCV fits clones of it),
# so predictions must come from best_model.
# For example, if 'new_data' is your new dataset:
# new_predictions = best_model.predict(new_data)
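# Added sketch (joblib is installed alongside scikit-learn; the filename
# below is hypothetical): persist the tuned model so the ~1742-second
# search does not have to be re-run.
# import joblib
# joblib.dump(best_model, 'gb_fdi_model.joblib')
# best_model = joblib.load('gb_fdi_model.joblib')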