YaBoiSwayZ

Advanced Iris Machine Learning Pipeline

May 26th, 2024 (edited)
import os
import numpy as np
import matplotlib.pyplot as plt
import logging
import asyncio
from joblib import Parallel, delayed, dump, load
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, cross_validate, KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_selection import SelectKBest, f_classif
import shap  # needed for shap.summary_plot below
from shap import TreeExplainer
import json
# psutil is imported lazily inside monitor_resources so its ImportError
# fallback can actually fire when the package is missing.

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def load_config(config_path="config.json"):
    # Fall back to an empty config if the file does not exist yet.
    try:
        with open(config_path, "r") as config_file:
            return json.load(config_file)
    except FileNotFoundError:
        return {}

def save_config(config, config_path="config.json"):
    with open(config_path, "w") as config_file:
        json.dump(config, config_file, indent=4)

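# A minimal sketch of the config.json this script reads; storage_location is
# the only key it uses, and the value below is just a hypothetical example:
# {
#     "storage_location": "default_storage_directory"
# }
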
config = load_config()
storage_location = config.get("storage_location", "default_storage_directory")

def set_storage_location(new_location):
    global storage_location
    storage_location = new_location
    config["storage_location"] = new_location
    save_config(config)

set_storage_location("new_storage_directory")

os.makedirs(storage_location, exist_ok=True)

def load_and_preprocess_data():
    try:
        iris_data = load_iris()
        X, y = iris_data.data, iris_data.target
        X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)
        return X_train, X_test, y_train, y_test
    except Exception as e:
        logger.error(f"Error loading and preprocessing data: {e}")
        raise

def create_pipeline(clf):
    try:
        pipeline = Pipeline([
            ('scaler', StandardScaler()),
            ('feature_selection', SelectKBest(score_func=f_classif, k=2)),
            ('classifier', clf)
        ])
        return pipeline
    except Exception as e:
        logger.error(f"Error creating pipeline: {e}")
        raise

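# Usage sketch (hypothetical, outside the main run): any pipeline built by
# create_pipeline can be fit and scored on its own, e.g.
#   pipe = create_pipeline(LogisticRegression(max_iter=1000))
#   pipe.fit(X_train, y_train)
#   print(pipe.score(X_test, y_test))
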
def get_classifiers():
    return {
        "Logistic Regression": LogisticRegression(max_iter=1000),
        "SVC": SVC(),
        "Decision Tree": DecisionTreeClassifier(),
        "Random Forest": RandomForestClassifier(),
        "K-Neighbors": KNeighborsClassifier(),
        "Naive Bayes": GaussianNB(),
        "Gradient Boosting": GradientBoostingClassifier()
    }

def hyperparameter_tuning(clf, param_grid, X_train, y_train):
    try:
        grid_search = GridSearchCV(clf, param_grid, cv=5, n_jobs=-1, scoring='accuracy')
        grid_search.fit(X_train, y_train)
        return grid_search.best_estimator_
    except Exception as e:
        logger.error(f"Error during hyperparameter tuning: {e}")
        raise

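# Example call (hypothetical grid, mirroring the one built in async_main):
#   best_svc = hyperparameter_tuning(create_pipeline(SVC()),
#                                    {'classifier__C': [0.1, 1, 10]},
#                                    X_train, y_train)
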
def evaluate_classifier(name, clf, X_train, y_train, X_test, y_test, kfold):
    try:
        train_scores = cross_validate(clf, X_train, y_train, cv=kfold, scoring=['accuracy', 'precision_macro', 'recall_macro', 'f1_macro'], n_jobs=-1)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        test_score = {
            "accuracy": accuracy_score(y_test, y_pred),
            "precision": precision_score(y_test, y_pred, average='macro'),
            "recall": recall_score(y_test, y_pred, average='macro'),
            "f1": f1_score(y_test, y_pred, average='macro')
        }
        return name, train_scores, test_score
    except Exception as e:
        logger.error(f"Error evaluating classifier {name}: {e}")
        raise

def plot_scores(results):
    # Each result is the (name, train_scores, test_score) tuple returned by
    # evaluate_classifier, so classifier names stay paired with their scores.
    metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
    x = np.arange(len(metrics))
    for name, train_scores, test_score in results:
        plt.figure(figsize=(10, 6))
        train_values = [np.mean(train_scores['test_' + metric]) for metric in metrics]
        test_values = [test_score[metric.split('_')[0]] for metric in metrics]
        plt.plot(x, train_values, label='Train')
        plt.plot(x, test_values, label='Test')
        plt.xticks(x, metrics)
        plt.ylabel('Score')
        plt.title(f'{name} Scores')
        plt.legend()
        plt.show()

def save_model_locally(model, name):
    try:
        model_path = os.path.join(storage_location, f'{name}_model.joblib')
        dump(model, model_path)
    except Exception as e:
        logger.error(f"Error saving model locally: {e}")
        raise

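# Saved models can be reloaded later with joblib's load (hypothetical name;
# filenames follow the f'{name}_model.joblib' scheme above, spaces included):
#   model = load(os.path.join(storage_location, 'Random Forest_model.joblib'))
#   print(model.predict(X_test[:5]))
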
def monitor_resources():
    try:
        # Imported here (rather than at the top) so the ImportError fallback
        # below can actually fire when psutil is missing.
        import psutil
        print(f"CPU usage: {psutil.cpu_percent()}%")
        print(f"Memory usage: {psutil.virtual_memory().percent}%")
    except ImportError:
        logger.warning("psutil not installed. Resource monitoring not available.")

async def async_main():
    try:
        X_train, X_test, y_train, y_test = load_and_preprocess_data()
        classifiers = get_classifiers()
        kfold = KFold(n_splits=5, shuffle=True, random_state=42)

        tuned_classifiers = {}
        for name, clf in classifiers.items():
            logger.info(f"Tuning hyperparameters for {name}")
            pipeline = create_pipeline(clf)
            # Only include parameters the current classifier actually accepts;
            # an empty grid just makes GridSearchCV fit the default pipeline.
            param_grid = {}
            if name in ('Logistic Regression', 'SVC'):
                param_grid['classifier__C'] = [0.1, 1, 10]
            if name in ('Decision Tree', 'Random Forest'):
                param_grid['classifier__max_depth'] = [3, 5, 7]
            tuned_classifiers[name] = hyperparameter_tuning(pipeline, param_grid, X_train, y_train)

        results = Parallel(n_jobs=-1)(delayed(evaluate_classifier)(name, clf, X_train, y_train, X_test, y_test, kfold) for name, clf in tuned_classifiers.items())

        for name, _, test_score in results:
            save_model_locally(tuned_classifiers[name], name)

        # TreeExplainer only handles tree-based models (multiclass sklearn
        # GradientBoosting is not supported), and the classifier step was
        # trained on scaled, feature-selected data, so transform X_test
        # through the preprocessing steps before explaining.
        feature_names = np.array(load_iris().feature_names)
        for name, clf in tuned_classifiers.items():
            if name not in ('Decision Tree', 'Random Forest'):
                continue
            X_test_selected = clf[:-1].transform(X_test)
            selected_names = feature_names[clf.named_steps['feature_selection'].get_support()]
            explainer = TreeExplainer(clf.named_steps['classifier'])
            shap_values = explainer.shap_values(X_test_selected)
            shap.summary_plot(shap_values, X_test_selected, feature_names=selected_names)

        plot_scores(results)

        monitor_resources()

    except Exception as e:
        logger.error(f"Error in main execution: {e}")
        raise

if __name__ == "__main__":
    asyncio.run(async_main())