Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import pandas as pd
- import numpy as np
- import time
- from datetime import datetime
- from sklearn.tree import DecisionTreeClassifier
- from sklearn.ensemble import RandomForestClassifier
- from sklearn.model_selection import StratifiedShuffleSplit, cross_val_score
- from sklearn.preprocessing import LabelEncoder, MinMaxScaler
- from sklearn.metrics import (
- accuracy_score, precision_score, recall_score, f1_score, fbeta_score, matthews_corrcoef,
- jaccard_score, cohen_kappa_score, hamming_loss, zero_one_loss, mean_absolute_error,
- mean_squared_error, balanced_accuracy_score, r2_score
- )
- import csv
# Run configuration: percentage of rows to sample and the fold count that is
# handed to run_classifier.
dataset_percent = 10
k_fold = 10

# Read the raw table from disk.
df = pd.read_csv('input.csv')

# Impute missing values in two passes: numeric columns get their column mean
# first, then whatever is still missing gets the first mode of its column.
numeric_means = df.mean(numeric_only=True)
df.fillna(numeric_means, inplace=True)
most_frequent = df.mode().iloc[0]
df.fillna(most_frequent, inplace=True)

# Replace every object-typed column with integer codes, using a fresh
# LabelEncoder per column.
categorical_columns = df.select_dtypes(include=['object']).columns
for col in categorical_columns:
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col])

# Split off the label column (assuming 'target' is the label column).
y = df['target']
X = df.drop('target', axis=1)

# Rescale the features with MinMaxScaler, keeping the original column names.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(X)
X = pd.DataFrame(scaled_values, columns=X.columns)
# Stratified sampling based on dataset_percent
def get_stratified_sample(X, y, dataset_percent):
    """Return a class-stratified random subset of the data.

    Args:
        X: Feature table; rows are selected positionally with ``.iloc``.
        y: Labels aligned row-for-row with ``X``; used as the
            stratification key.
        dataset_percent: Share of rows to keep, as a percentage in the
            range (0, 100].

    Returns:
        A ``(X_sampled, y_sampled)`` tuple. When ``dataset_percent`` is
        exactly 100 the inputs are returned unchanged.

    Raises:
        ValueError: If ``dataset_percent`` is outside (0, 100].
    """
    if not 0 < dataset_percent <= 100:
        raise ValueError(
            "dataset_percent must be in the range (0, 100], got "
            f"{dataset_percent!r}"
        )

    if dataset_percent == 100:
        # StratifiedShuffleSplit rejects a fractional test_size of 1.0, so
        # "use everything" is handled here instead of being passed through.
        return X, y

    sss = StratifiedShuffleSplit(
        n_splits=1,
        test_size=dataset_percent / 100,
        random_state=42,  # fixed seed keeps the sample reproducible
    )
    # Only one split is generated; its "test" side is the requested sample.
    _, sample_index = next(sss.split(X, y))
    return X.iloc[sample_index], y.iloc[sample_index]
# Draw the stratified subset of the scaled features and labels.
X_sampled, y_sampled = get_stratified_sample(
    X=X,
    y=y,
    dataset_percent=dataset_percent,
)
# Function to compute all required metrics
def compute_metrics(y_true, y_pred, y_proba=None):
    """Score predictions against the true labels with a fixed set of metrics.

    Args:
        y_true: True labels.
        y_pred: Predicted labels.
        y_proba: Accepted but not used by this function.

    Returns:
        A dict mapping metric name to value. Precision, recall, F1, F-beta
        (beta=1.0) and Jaccard use weighted averaging; ``rmse`` is the
        square root of the ``mse`` entry.
    """
    weighted = {'average': 'weighted'}

    # The literal is evaluated top to bottom, so the metric functions are
    # called in the same order as the keys appear.
    scores = {
        'accuracy': accuracy_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred, **weighted),
        'recall': recall_score(y_true, y_pred, **weighted),
        'f1': f1_score(y_true, y_pred, **weighted),
        'fbeta': fbeta_score(y_true, y_pred, beta=1.0, **weighted),
        'mcc': matthews_corrcoef(y_true, y_pred),
        'jaccard': jaccard_score(y_true, y_pred, **weighted),
        'cohen_kappa': cohen_kappa_score(y_true, y_pred),
        'hamming_loss': hamming_loss(y_true, y_pred),
        'zero_one_loss': zero_one_loss(y_true, y_pred),
        'mae': mean_absolute_error(y_true, y_pred),
        'mse': mean_squared_error(y_true, y_pred),
    }
    # rmse is derived from the stored mse rather than recomputed.
    scores['rmse'] = np.sqrt(scores['mse'])
    scores['balanced_accuracy'] = balanced_accuracy_score(y_true, y_pred)
    scores['r2_score'] = r2_score(y_true, y_pred)
    return scores
# Metric columns reported for every run; also used to build the placeholder
# row when a classifier fails.
_METRIC_NAMES = (
    'accuracy', 'precision', 'recall', 'f1', 'fbeta', 'mcc', 'jaccard',
    'cohen_kappa', 'hamming_loss', 'zero_one_loss', 'mae', 'mse',
    'rmse', 'balanced_accuracy', 'r2_score',
)


# Function to run and evaluate each classifier
def run_classifier(classifier, classifier_name, X, y, k_fold):
    """Fit one classifier, score it, and return its result row.

    The model is fitted on ``X``/``y`` and then predicts the same ``X``, so
    the reported scores describe the fit to the data it was trained on.

    Args:
        classifier: Zero-argument callable returning an object with
            ``fit(X, y)`` and ``predict(X)`` methods.
        classifier_name: Label stored in the row's ``classifier_name`` field.
        X: Features used for both fitting and prediction.
        y: Labels used for fitting and as the scoring reference.
        k_fold: Accepted for interface compatibility; not used at present.

    Returns:
        A one-element list holding a dict with the metric values, the
        ``train_time``/``predict_time``/``total_time`` durations in seconds,
        ``classifier_name`` and ``timestamp``. If any step raises, the
        failure is printed and every metric and timing field is -1.
    """
    try:
        # Take explicit checkpoints so total_time is exactly
        # train_time + predict_time.
        started = time.perf_counter()
        clf = classifier()
        clf.fit(X, y)
        fitted = time.perf_counter()
        y_pred = clf.predict(X)
        finished = time.perf_counter()

        metrics = compute_metrics(y, y_pred)
        metrics['train_time'] = fitted - started
        metrics['predict_time'] = finished - fitted
        metrics['total_time'] = finished - started
    except Exception as e:
        # Best-effort: one failing classifier must not abort the whole run,
        # but the cause is reported instead of being dropped silently.
        print(f"Classifier '{classifier_name}' failed: {type(e).__name__}: {e}")
        metrics = dict.fromkeys(_METRIC_NAMES, -1)
        metrics['train_time'] = -1
        metrics['predict_time'] = -1
        metrics['total_time'] = -1

    metrics['classifier_name'] = classifier_name
    metrics['timestamp'] = datetime.now()
    return [metrics]
# Classifier classes to evaluate, keyed by the name written to the results.
classifiers = {
    'Decision Trees': DecisionTreeClassifier,
    'Random Forest': RandomForestClassifier,
}

# Result rows are appended to this file.
csv_file = 'classification_results.csv'

# Column order of the results file.
csv_columns = [
    'classifier_name', 'timestamp',
    'accuracy', 'precision', 'recall', 'f1', 'fbeta', 'mcc', 'jaccard',
    'cohen_kappa', 'hamming_loss', 'zero_one_loss', 'mae', 'mse', 'rmse',
    'balanced_accuracy', 'r2_score',
    'train_time', 'predict_time', 'total_time',
]

try:
    with open(csv_file, 'a', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=csv_columns)

        # A position of 0 right after opening for append is treated as
        # "file is empty", so the header row is emitted only then.
        if csvfile.tell() == 0:
            writer.writeheader()

        # Evaluate each registered classifier on the sampled data and append
        # the rows it returns.
        for classifier_name, classifier in classifiers.items():
            metrics = run_classifier(
                classifier, classifier_name, X_sampled, y_sampled, k_fold
            )
            writer.writerows(metrics)
except IOError:
    print("I/O error while writing CSV")
Add Comment
Please, Sign In to add comment