# ML generic: generic classification benchmarking script.

import pandas as pd
import numpy as np
import time
from datetime import datetime
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedShuffleSplit, cross_val_score
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, fbeta_score, matthews_corrcoef,
    jaccard_score, cohen_kappa_score, hamming_loss, zero_one_loss, mean_absolute_error,
    mean_squared_error, balanced_accuracy_score, r2_score
)
import csv

# Run configuration: percentage of rows to sample and the fold count handed
# to run_classifier.
dataset_percent, k_fold = 10, 10

# Read the raw data from disk.
df = pd.read_csv('input.csv')

# Impute gaps in two passes: numeric columns get their column mean first,
# then anything still missing is filled from the first row of df.mode().
numeric_means = df.mean(numeric_only=True)
df = df.fillna(numeric_means)
column_modes = df.mode().iloc[0]
df = df.fillna(column_modes)

# Replace every object-dtype column with integer codes, using a fresh
# LabelEncoder per column.
object_columns = df.select_dtypes(include=['object']).columns
for col in object_columns:
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col])

# Split into labels (y) and features (X); the label column is assumed to be
# named 'target'.
y = df['target']
X = df.drop(columns='target')

# Rescale the features with MinMaxScaler and wrap the resulting array back
# into a DataFrame so the original column names are kept.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(X)
X = pd.DataFrame(scaled_values, columns=X.columns)

# Stratified sampling based on dataset_percent
def get_stratified_sample(X, y, dataset_percent):
    """Return a class-stratified random subset of the data.

    Args:
        X: Feature table; rows are selected positionally via ``.iloc``.
        y: Labels aligned row-for-row with ``X``; used as the stratification
            key.
        dataset_percent: Share of rows to keep, as a percentage in
            ``(0, 100]``.

    Returns:
        A ``(X_sampled, y_sampled)`` tuple. For ``dataset_percent == 100``
        these are unshuffled copies of the full inputs.

    Raises:
        ValueError: If ``dataset_percent`` is outside ``(0, 100]``.
    """
    # Written as a chained comparison so that NaN is rejected as well.
    if not 0 < dataset_percent <= 100:
        raise ValueError(
            f"dataset_percent must be in (0, 100], got {dataset_percent!r}"
        )

    if dataset_percent == 100:
        # StratifiedShuffleSplit refuses a float test_size of 1.0, so asking
        # for the whole dataset has to bypass the splitter entirely.
        return X.copy(), y.copy()

    sss = StratifiedShuffleSplit(
        n_splits=1,
        test_size=dataset_percent / 100,
        random_state=42,  # fixed seed keeps the sample reproducible
    )
    # Only one split is requested; its "test" side is the sample we keep.
    _, sample_index = next(sss.split(X, y))
    return X.iloc[sample_index], y.iloc[sample_index]

# Draw the working subset once at module level; the classifier runs further
# down are given X_sampled / y_sampled rather than the full X / y.
X_sampled, y_sampled = get_stratified_sample(X, y, dataset_percent)

# Function to compute all required metrics
def compute_metrics(y_true, y_pred, y_proba=None):
    """Score predictions against the true labels with a fixed set of metrics.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        y_proba: Accepted but not used by this function.

    Returns:
        A dict mapping metric name to value. Precision, recall, F1, F-beta
        (beta=1.0) and Jaccard use ``average='weighted'``; ``rmse`` is the
        square root of ``mse``.
    """
    pair = (y_true, y_pred)
    weighted = {'average': 'weighted'}

    scores = {
        'accuracy': accuracy_score(*pair),
        'precision': precision_score(*pair, **weighted),
        'recall': recall_score(*pair, **weighted),
        'f1': f1_score(*pair, **weighted),
        'fbeta': fbeta_score(*pair, beta=1.0, **weighted),
        'mcc': matthews_corrcoef(*pair),
        'jaccard': jaccard_score(*pair, **weighted),
        'cohen_kappa': cohen_kappa_score(*pair),
        'hamming_loss': hamming_loss(*pair),
        'zero_one_loss': zero_one_loss(*pair),
        'mae': mean_absolute_error(*pair),
        'mse': mean_squared_error(*pair),
    }
    # rmse is derived from the mse entry, so it is added after the literal.
    scores['rmse'] = np.sqrt(scores['mse'])
    scores['balanced_accuracy'] = balanced_accuracy_score(*pair)
    scores['r2_score'] = r2_score(*pair)
    return scores

# Function to run and evaluate each classifier
def run_classifier(classifier, classifier_name, X, y, k_fold):
    """Fit one classifier, score it, and return its result row.

    The model is created by calling ``classifier()`` with no arguments,
    fitted on ``(X, y)`` and then asked to predict the same ``X``; the
    reported scores are therefore computed on the training data.

    Args:
        classifier: Zero-argument callable returning an object that exposes
            ``fit(X, y)`` and ``predict(X)``.
        classifier_name: Label stored under ``'classifier_name'``.
        X: Feature data passed to ``fit`` and ``predict``.
        y: Labels passed to ``fit`` and used as ground truth for scoring.
        k_fold: Accepted for interface compatibility; not used by this
            function.

    Returns:
        A one-element list containing a dict with every metric from
        ``compute_metrics`` plus ``train_time``, ``predict_time``,
        ``total_time`` (seconds), ``classifier_name`` and ``timestamp``.
        If anything fails, the failure is reported on stdout and every
        metric and timing value in the row is ``-1``.
    """
    # Keys written with the -1 sentinel when the run fails; metric names first,
    # then the three timing fields.
    failure_keys = (
        'accuracy', 'precision', 'recall', 'f1', 'fbeta', 'mcc', 'jaccard',
        'cohen_kappa', 'hamming_loss', 'zero_one_loss', 'mae', 'mse',
        'rmse', 'balanced_accuracy', 'r2_score',
        'train_time', 'predict_time', 'total_time',
    )

    try:
        # perf_counter is monotonic, so durations cannot be skewed by
        # wall-clock adjustments. Three marks make total == train + predict.
        started = time.perf_counter()
        clf = classifier()
        clf.fit(X, y)
        fitted = time.perf_counter()
        y_pred = clf.predict(X)
        finished = time.perf_counter()

        metrics = compute_metrics(y, y_pred)
        metrics['train_time'] = fitted - started  # includes construction
        metrics['predict_time'] = finished - fitted
        metrics['total_time'] = finished - started
    except Exception as exc:
        # Best-effort boundary: one broken classifier must not abort the
        # whole benchmark, but the reason has to be visible to the operator.
        print(f"Classifier '{classifier_name}' failed: {exc!r}")
        metrics = dict.fromkeys(failure_keys, -1)

    metrics['classifier_name'] = classifier_name
    metrics['timestamp'] = datetime.now()
    return [metrics]

# Classifiers to benchmark, keyed by the name written to the results file.
classifiers = {
    'Decision Trees': DecisionTreeClassifier,
    'Random Forest': RandomForestClassifier,
}

# Results are appended to this file.
csv_file = 'classification_results.csv'

# Column order of the results file: identification, metrics, then timings.
csv_columns = (
    ['classifier_name', 'timestamp']
    + ['accuracy', 'precision', 'recall', 'f1', 'fbeta', 'mcc', 'jaccard',
       'cohen_kappa', 'hamming_loss', 'zero_one_loss', 'mae', 'mse', 'rmse',
       'balanced_accuracy', 'r2_score']
    + ['train_time', 'predict_time', 'total_time']
)

try:
    with open(csv_file, mode='a', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=csv_columns)
        # A zero offset right after opening in append mode is treated as
        # "nothing written yet", so the header row is emitted first.
        if not csvfile.tell():
            writer.writeheader()

        # Evaluate each classifier on the sampled data and append its rows.
        for classifier_name, classifier in classifiers.items():
            metrics = run_classifier(classifier, classifier_name, X_sampled, y_sampled, k_fold)
            writer.writerows(metrics)

except IOError:
    print("I/O error while writing CSV")