YaBoiSwayZ

Advanced Iris Machine Learning Pipeline

May 26th, 2024 (edited)
import os
import numpy as np
import matplotlib.pyplot as plt
import logging
import asyncio
from joblib import Parallel, delayed, dump, load
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, cross_validate, KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_selection import SelectKBest, f_classif
import shap  # needed for shap.summary_plot below
from shap import TreeExplainer
import json
# psutil is imported lazily inside monitor_resources so its ImportError
# fallback can actually fire when the package is missing.

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def load_config(config_path="config.json"):
    # Fall back to an empty config if the file does not exist yet.
    try:
        with open(config_path, "r") as config_file:
            return json.load(config_file)
    except FileNotFoundError:
        return {}

def save_config(config, config_path="config.json"):
    with open(config_path, "w") as config_file:
        json.dump(config, config_file, indent=4)

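# A minimal sketch of the config.json this script reads; storage_location is
# the only key it uses, and the value below is just a hypothetical example:
# {
#     "storage_location": "default_storage_directory"
# }
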
config = load_config()
storage_location = config.get("storage_location", "default_storage_directory")

def set_storage_location(new_location):
    global storage_location
    storage_location = new_location
    config["storage_location"] = new_location
    save_config(config)

set_storage_location("new_storage_directory")

os.makedirs(storage_location, exist_ok=True)

def load_and_preprocess_data():
    try:
        iris_data = load_iris()
        X, y = iris_data.data, iris_data.target
        X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)
        return X_train, X_test, y_train, y_test
    except Exception as e:
        logger.error(f"Error loading and preprocessing data: {e}")
        raise

def create_pipeline(clf):
    try:
        pipeline = Pipeline([
            ('scaler', StandardScaler()),
            ('feature_selection', SelectKBest(score_func=f_classif, k=2)),
            ('classifier', clf)
        ])
        return pipeline
    except Exception as e:
        logger.error(f"Error creating pipeline: {e}")
        raise

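# Usage sketch (hypothetical, outside the main run): any pipeline built by
# create_pipeline can be fit and scored on its own, e.g.
#   pipe = create_pipeline(LogisticRegression(max_iter=1000))
#   pipe.fit(X_train, y_train)
#   print(pipe.score(X_test, y_test))
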
def get_classifiers():
    return {
        "Logistic Regression": LogisticRegression(max_iter=1000),
        "SVC": SVC(),
        "Decision Tree": DecisionTreeClassifier(),
        "Random Forest": RandomForestClassifier(),
        "K-Neighbors": KNeighborsClassifier(),
        "Naive Bayes": GaussianNB(),
        "Gradient Boosting": GradientBoostingClassifier()
    }

def hyperparameter_tuning(clf, param_grid, X_train, y_train):
    try:
        grid_search = GridSearchCV(clf, param_grid, cv=5, n_jobs=-1, scoring='accuracy')
        grid_search.fit(X_train, y_train)
        return grid_search.best_estimator_
    except Exception as e:
        logger.error(f"Error during hyperparameter tuning: {e}")
        raise

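# Example call (hypothetical grid, mirroring the one built in async_main):
#   best_svc = hyperparameter_tuning(create_pipeline(SVC()),
#                                    {'classifier__C': [0.1, 1, 10]},
#                                    X_train, y_train)
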
def evaluate_classifier(name, clf, X_train, y_train, X_test, y_test, kfold):
    try:
        train_scores = cross_validate(clf, X_train, y_train, cv=kfold, scoring=['accuracy', 'precision_macro', 'recall_macro', 'f1_macro'], n_jobs=-1)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        test_score = {
            "accuracy": accuracy_score(y_test, y_pred),
            "precision": precision_score(y_test, y_pred, average='macro'),
            "recall": recall_score(y_test, y_pred, average='macro'),
            "f1": f1_score(y_test, y_pred, average='macro')
        }
        return name, train_scores, test_score
    except Exception as e:
        logger.error(f"Error evaluating classifier {name}: {e}")
        raise

def plot_scores(results):
    # Each result is the (name, train_scores, test_score) tuple returned by
    # evaluate_classifier, so classifier names stay paired with their scores.
    metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
    x = np.arange(len(metrics))
    for name, train_scores, test_score in results:
        plt.figure(figsize=(10, 6))
        train_values = [np.mean(train_scores['test_' + metric]) for metric in metrics]
        test_values = [test_score[metric.split('_')[0]] for metric in metrics]
        plt.plot(x, train_values, label='Train')
        plt.plot(x, test_values, label='Test')
        plt.xticks(x, metrics)
        plt.ylabel('Score')
        plt.title(f'{name} Scores')
        plt.legend()
        plt.show()

def save_model_locally(model, name):
    try:
        model_path = os.path.join(storage_location, f'{name}_model.joblib')
        dump(model, model_path)
    except Exception as e:
        logger.error(f"Error saving model locally: {e}")
        raise

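# Saved models can be reloaded later with joblib's load (hypothetical name;
# filenames follow the f'{name}_model.joblib' scheme above, spaces included):
#   model = load(os.path.join(storage_location, 'Random Forest_model.joblib'))
#   print(model.predict(X_test[:5]))
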
def monitor_resources():
    try:
        # Imported here (rather than at the top) so the ImportError fallback
        # below can actually fire when psutil is missing.
        import psutil
        print(f"CPU usage: {psutil.cpu_percent()}%")
        print(f"Memory usage: {psutil.virtual_memory().percent}%")
    except ImportError:
        logger.warning("psutil not installed. Resource monitoring not available.")

async def async_main():
    try:
        X_train, X_test, y_train, y_test = load_and_preprocess_data()
        classifiers = get_classifiers()
        kfold = KFold(n_splits=5, shuffle=True, random_state=42)

        tuned_classifiers = {}
        for name, clf in classifiers.items():
            logger.info(f"Tuning hyperparameters for {name}")
            pipeline = create_pipeline(clf)
            # Only include parameters the current classifier actually accepts;
            # an empty grid just makes GridSearchCV fit the default pipeline.
            param_grid = {}
            if name in ('Logistic Regression', 'SVC'):
                param_grid['classifier__C'] = [0.1, 1, 10]
            if name in ('Decision Tree', 'Random Forest'):
                param_grid['classifier__max_depth'] = [3, 5, 7]
            tuned_classifiers[name] = hyperparameter_tuning(pipeline, param_grid, X_train, y_train)

        results = Parallel(n_jobs=-1)(delayed(evaluate_classifier)(name, clf, X_train, y_train, X_test, y_test, kfold) for name, clf in tuned_classifiers.items())

        for name, _, test_score in results:
            save_model_locally(tuned_classifiers[name], name)

        # TreeExplainer only handles tree-based models (multiclass sklearn
        # GradientBoosting is not supported), and the classifier step was
        # trained on scaled, feature-selected data, so transform X_test
        # through the preprocessing steps before explaining.
        feature_names = np.array(load_iris().feature_names)
        for name, clf in tuned_classifiers.items():
            if name not in ('Decision Tree', 'Random Forest'):
                continue
            X_test_selected = clf[:-1].transform(X_test)
            selected_names = feature_names[clf.named_steps['feature_selection'].get_support()]
            explainer = TreeExplainer(clf.named_steps['classifier'])
            shap_values = explainer.shap_values(X_test_selected)
            shap.summary_plot(shap_values, X_test_selected, feature_names=selected_names)

        plot_scores(results)

        monitor_resources()

    except Exception as e:
        logger.error(f"Error in main execution: {e}")
        raise

if __name__ == "__main__":
    asyncio.run(async_main())