# Advertisement
# mayankjoin3
#
# Untitled
#
# Sep 30th, 2024
# 10
# 0
# Never
# Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
# text 5.16 KB | None | 0 0
import csv
import time
from datetime import datetime

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, fbeta_score, matthews_corrcoef,
    jaccard_score, cohen_kappa_score, hamming_loss, zero_one_loss, mean_absolute_error,
    mean_squared_error, balanced_accuracy_score, r2_score
)
from sklearn.model_selection import StratifiedShuffleSplit, cross_val_score
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.tree import DecisionTreeClassifier

  16. # Define the dataset percentage and k-fold
  17. dataset_percent = 10
  18. k_fold = 10
  19.  
  20. # Load the dataset
  21. df = pd.read_csv('input.csv')
  22.  
  23. # Fill missing values
  24. df.fillna(df.mean(numeric_only=True), inplace=True)
  25. df.fillna(df.mode().iloc[0], inplace=True)
  26.  
  27. # Encode categorical columns
  28. for col in df.select_dtypes(include=['object']).columns:
  29. df[col] = LabelEncoder().fit_transform(df[col])
  30.  
  31. # Separate features (X) and labels (y)
  32. X = df.drop('target', axis=1) # Assuming 'target' is the label column
  33. y = df['target']
  34.  
  35. # Apply MinMaxScaler
  36. scaler = MinMaxScaler()
  37. X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
  38.  
  39. # Stratified sampling based on dataset_percent
  40. def get_stratified_sample(X, y, dataset_percent):
  41. sss = StratifiedShuffleSplit(n_splits=1, test_size=(dataset_percent / 100), random_state=42)
  42. for _, sample_index in sss.split(X, y):
  43. X_sampled = X.iloc[sample_index]
  44. y_sampled = y.iloc[sample_index]
  45. return X_sampled, y_sampled
  46.  
  47. X_sampled, y_sampled = get_stratified_sample(X, y, dataset_percent)
  48.  
  49. # Function to compute all required metrics
  50. def compute_metrics(y_true, y_pred, y_proba=None):
  51. metrics = {}
  52. metrics['accuracy'] = accuracy_score(y_true, y_pred)
  53. metrics['precision'] = precision_score(y_true, y_pred, average='weighted')
  54. metrics['recall'] = recall_score(y_true, y_pred, average='weighted')
  55. metrics['f1'] = f1_score(y_true, y_pred, average='weighted')
  56. metrics['fbeta'] = fbeta_score(y_true, y_pred, beta=1.0, average='weighted')
  57. metrics['mcc'] = matthews_corrcoef(y_true, y_pred)
  58. metrics['jaccard'] = jaccard_score(y_true, y_pred, average='weighted')
  59. metrics['cohen_kappa'] = cohen_kappa_score(y_true, y_pred)
  60. metrics['hamming_loss'] = hamming_loss(y_true, y_pred)
  61. metrics['zero_one_loss'] = zero_one_loss(y_true, y_pred)
  62. metrics['mae'] = mean_absolute_error(y_true, y_pred)
  63. metrics['mse'] = mean_squared_error(y_true, y_pred)
  64. metrics['rmse'] = np.sqrt(metrics['mse'])
  65. metrics['balanced_accuracy'] = balanced_accuracy_score(y_true, y_pred)
  66. metrics['r2_score'] = r2_score(y_true, y_pred)
  67. return metrics
  68.  
  69. # Function to run and evaluate each classifier
  70. def run_classifier(classifier, classifier_name, X, y, k_fold):
  71. metrics_list = []
  72. try:
  73. # Time tracking
  74. start_time = time.time()
  75. clf = classifier()
  76.  
  77. # Train and predict
  78. clf.fit(X, y)
  79. train_time = time.time() - start_time
  80. y_pred = clf.predict(X)
  81. predict_time = time.time() - start_time - train_time
  82. total_time = time.time() - start_time
  83.  
  84. # Compute metrics
  85. metrics = compute_metrics(y, y_pred)
  86. metrics['train_time'] = train_time
  87. metrics['predict_time'] = predict_time
  88. metrics['total_time'] = total_time
  89. metrics['classifier_name'] = classifier_name
  90. metrics['timestamp'] = datetime.now()
  91.  
  92. except Exception as e:
  93. metrics = {metric: -1 for metric in ['accuracy', 'precision', 'recall', 'f1', 'fbeta', 'mcc', 'jaccard',
  94. 'cohen_kappa', 'hamming_loss', 'zero_one_loss', 'mae', 'mse',
  95. 'rmse', 'balanced_accuracy', 'r2_score']}
  96. metrics['train_time'] = -1
  97. metrics['predict_time'] = -1
  98. metrics['total_time'] = -1
  99. metrics['classifier_name'] = classifier_name
  100. metrics['timestamp'] = datetime.now()
  101.  
  102. metrics_list.append(metrics)
  103. return metrics_list
  104.  
  105. # Run classifiers
  106. classifiers = {
  107. 'Decision Trees': DecisionTreeClassifier,
  108. 'Random Forest': RandomForestClassifier
  109. }
  110.  
  111. # Append results to the CSV file
  112. csv_file = 'classification_results.csv'
  113.  
  114. # Check if CSV exists, create if not
  115. csv_columns = ['classifier_name', 'timestamp', 'accuracy', 'precision', 'recall', 'f1', 'fbeta', 'mcc', 'jaccard',
  116. 'cohen_kappa', 'hamming_loss', 'zero_one_loss', 'mae', 'mse', 'rmse', 'balanced_accuracy', 'r2_score',
  117. 'train_time', 'predict_time', 'total_time']
  118.  
  119. try:
  120. with open(csv_file, 'a', newline='') as csvfile:
  121. writer = csv.DictWriter(csvfile, fieldnames=csv_columns)
  122. # If file is empty, write the header
  123. if csvfile.tell() == 0:
  124. writer.writeheader()
  125.  
  126. for classifier_name, classifier in classifiers.items():
  127. metrics = run_classifier(classifier, classifier_name, X_sampled, y_sampled, k_fold)
  128. for metric in metrics:
  129. writer.writerow(metric)
  130.  
  131. except IOError:
  132. print("I/O error while writing CSV")
  133.  
# Advertisement
# Add Comment
# Please, Sign In to add comment
# Advertisement