mayankjoin3

ml generic

Sep 30th, 2024
15
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 5.16 KB | None | 0 0
  1. import pandas as pd
  2. import numpy as np
  3. import time
  4. from datetime import datetime
  5. from sklearn.tree import DecisionTreeClassifier
  6. from sklearn.ensemble import RandomForestClassifier
  7. from sklearn.model_selection import StratifiedShuffleSplit, cross_val_score
  8. from sklearn.preprocessing import LabelEncoder, MinMaxScaler
  9. from sklearn.metrics import (
  10.     accuracy_score, precision_score, recall_score, f1_score, fbeta_score, matthews_corrcoef,
  11.     jaccard_score, cohen_kappa_score, hamming_loss, zero_one_loss, mean_absolute_error,
  12.     mean_squared_error, balanced_accuracy_score, r2_score
  13. )
  14. import csv
  15.  
  16. # Define the dataset percentage and k-fold
  17. dataset_percent = 10
  18. k_fold = 10
  19.  
  20. # Load the dataset
  21. df = pd.read_csv('input.csv')
  22.  
  23. # Fill missing values
  24. df.fillna(df.mean(numeric_only=True), inplace=True)
  25. df.fillna(df.mode().iloc[0], inplace=True)
  26.  
  27. # Encode categorical columns
  28. for col in df.select_dtypes(include=['object']).columns:
  29.     df[col] = LabelEncoder().fit_transform(df[col])
  30.  
  31. # Separate features (X) and labels (y)
  32. X = df.drop('target', axis=1)  # Assuming 'target' is the label column
  33. y = df['target']
  34.  
  35. # Apply MinMaxScaler
  36. scaler = MinMaxScaler()
  37. X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
  38.  
  39. # Stratified sampling based on dataset_percent
  40. def get_stratified_sample(X, y, dataset_percent):
  41.     sss = StratifiedShuffleSplit(n_splits=1, test_size=(dataset_percent / 100), random_state=42)
  42.     for _, sample_index in sss.split(X, y):
  43.         X_sampled = X.iloc[sample_index]
  44.         y_sampled = y.iloc[sample_index]
  45.     return X_sampled, y_sampled
  46.  
  47. X_sampled, y_sampled = get_stratified_sample(X, y, dataset_percent)
  48.  
  49. # Function to compute all required metrics
  50. def compute_metrics(y_true, y_pred, y_proba=None):
  51.     metrics = {}
  52.     metrics['accuracy'] = accuracy_score(y_true, y_pred)
  53.     metrics['precision'] = precision_score(y_true, y_pred, average='weighted')
  54.     metrics['recall'] = recall_score(y_true, y_pred, average='weighted')
  55.     metrics['f1'] = f1_score(y_true, y_pred, average='weighted')
  56.     metrics['fbeta'] = fbeta_score(y_true, y_pred, beta=1.0, average='weighted')
  57.     metrics['mcc'] = matthews_corrcoef(y_true, y_pred)
  58.     metrics['jaccard'] = jaccard_score(y_true, y_pred, average='weighted')
  59.     metrics['cohen_kappa'] = cohen_kappa_score(y_true, y_pred)
  60.     metrics['hamming_loss'] = hamming_loss(y_true, y_pred)
  61.     metrics['zero_one_loss'] = zero_one_loss(y_true, y_pred)
  62.     metrics['mae'] = mean_absolute_error(y_true, y_pred)
  63.     metrics['mse'] = mean_squared_error(y_true, y_pred)
  64.     metrics['rmse'] = np.sqrt(metrics['mse'])
  65.     metrics['balanced_accuracy'] = balanced_accuracy_score(y_true, y_pred)
  66.     metrics['r2_score'] = r2_score(y_true, y_pred)
  67.     return metrics
  68.  
  69. # Function to run and evaluate each classifier
  70. def run_classifier(classifier, classifier_name, X, y, k_fold):
  71.     metrics_list = []
  72.     try:
  73.         # Time tracking
  74.         start_time = time.time()
  75.         clf = classifier()
  76.  
  77.         # Train and predict
  78.         clf.fit(X, y)
  79.         train_time = time.time() - start_time
  80.         y_pred = clf.predict(X)
  81.         predict_time = time.time() - start_time - train_time
  82.         total_time = time.time() - start_time
  83.  
  84.         # Compute metrics
  85.         metrics = compute_metrics(y, y_pred)
  86.         metrics['train_time'] = train_time
  87.         metrics['predict_time'] = predict_time
  88.         metrics['total_time'] = total_time
  89.         metrics['classifier_name'] = classifier_name
  90.         metrics['timestamp'] = datetime.now()
  91.  
  92.     except Exception as e:
  93.         metrics = {metric: -1 for metric in ['accuracy', 'precision', 'recall', 'f1', 'fbeta', 'mcc', 'jaccard',
  94.                                              'cohen_kappa', 'hamming_loss', 'zero_one_loss', 'mae', 'mse',
  95.                                              'rmse', 'balanced_accuracy', 'r2_score']}
  96.         metrics['train_time'] = -1
  97.         metrics['predict_time'] = -1
  98.         metrics['total_time'] = -1
  99.         metrics['classifier_name'] = classifier_name
  100.         metrics['timestamp'] = datetime.now()
  101.  
  102.     metrics_list.append(metrics)
  103.     return metrics_list
  104.  
  105. # Run classifiers
  106. classifiers = {
  107.     'Decision Trees': DecisionTreeClassifier,
  108.     'Random Forest': RandomForestClassifier
  109. }
  110.  
  111. # Append results to the CSV file
  112. csv_file = 'classification_results.csv'
  113.  
  114. # Check if CSV exists, create if not
  115. csv_columns = ['classifier_name', 'timestamp', 'accuracy', 'precision', 'recall', 'f1', 'fbeta', 'mcc', 'jaccard',
  116.                'cohen_kappa', 'hamming_loss', 'zero_one_loss', 'mae', 'mse', 'rmse', 'balanced_accuracy', 'r2_score',
  117.                'train_time', 'predict_time', 'total_time']
  118.  
  119. try:
  120.     with open(csv_file, 'a', newline='') as csvfile:
  121.         writer = csv.DictWriter(csvfile, fieldnames=csv_columns)
  122.         # If file is empty, write the header
  123.         if csvfile.tell() == 0:
  124.             writer.writeheader()
  125.  
  126.         for classifier_name, classifier in classifiers.items():
  127.             metrics = run_classifier(classifier, classifier_name, X_sampled, y_sampled, k_fold)
  128.             for metric in metrics:
  129.                 writer.writerow(metric)
  130.  
  131. except IOError:
  132.     print("I/O error while writing CSV")
  133.  
Add Comment
Please, Sign In to add comment