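# =========================================================================
# Classifier benchmark: trains a battery of scikit-learn / XGBoost /
# LightGBM / CatBoost models under k-fold cross-validation and appends
# per-fold timing and metric rows to a results CSV.
# =========================================================================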
import pandas as pd
import numpy as np
import time
import os
import csv
import traceback
import warnings
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (ExtraTreesClassifier, HistGradientBoostingClassifier,
                              RandomForestClassifier, BaggingClassifier, GradientBoostingClassifier,
                              AdaBoostClassifier, VotingClassifier, StackingClassifier,
                              IsolationForest)
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             fbeta_score, matthews_corrcoef, jaccard_score, cohen_kappa_score,
                             hamming_loss, zero_one_loss, mean_absolute_error,
                             mean_squared_error, r2_score, balanced_accuracy_score)
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.svm import SVC, OneClassSVM
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression, SGDClassifier, Perceptron, ElasticNet
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier, BernoulliRBM
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.mixture import GaussianMixture
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_iterative_imputer  # noqa: F401 -- enables IterativeImputer
from sklearn.impute import IterativeImputer
from sklearn.semi_supervised import SelfTrainingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from keras.models import Sequential
from keras.layers import Dense, Conv2D, MaxPooling2D, Flatten, LSTM, GRU, SimpleRNN
from keras.utils import to_categorical

# Custom imports
# from pgmpy.models import BayesianNetwork  # for Bayesian networks
# import geneticalgorithm as ga  # for a genetic-algorithm-based classifier (hypothetical)
# Add fuzzy-logic and CRF imports if you have specific packages (e.g. `python-crfsuite` for CRF)

warnings.filterwarnings("ignore")  # Suppress warnings for cleaner output
# Dataset / run parameters
k_fold = 5  # number of cross-validation folds; change as needed
dataset_percent = 10  # percentage of the dataset to use; change as needed
input_file = 'input.csv'  # input dataset

# Results CSV (derived from the input filename) and its columns
output_file = input_file.replace('.csv', '_results.csv')
csv_columns = ['Algorithm', 'Fold', 'Train Time (s)', 'Test Time (s)', 'Accuracy', 'Precision',
               'Recall', 'F1 Score', 'Fbeta Score', 'Matthews Correlation Coefficient',
               'Jaccard Score', 'Cohen Kappa Score', 'Hamming Loss', 'Zero One Loss',
               'Mean Absolute Error', 'Mean Squared Error', 'Root Mean Squared Error',
               'Balanced Accuracy', 'R2 Score']
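
# The metric names in csv_columns[2:] must match the keys produced by
# compute_metrics() / run_algorithm() below: each row is built by looking the
# column name up in that metrics dict, with -1 filled in for anything missing.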
# Compute the full set of classification metrics for one fold
def compute_metrics(y_true, y_pred):
    metrics = {}
    metrics['Accuracy'] = accuracy_score(y_true, y_pred)
    metrics['Precision'] = precision_score(y_true, y_pred, average='weighted', zero_division=1)
    metrics['Recall'] = recall_score(y_true, y_pred, average='weighted', zero_division=1)
    metrics['F1 Score'] = f1_score(y_true, y_pred, average='weighted', zero_division=1)
    metrics['Fbeta Score'] = fbeta_score(y_true, y_pred, beta=1.0, average='weighted', zero_division=1)
    metrics['Matthews Correlation Coefficient'] = matthews_corrcoef(y_true, y_pred)
    metrics['Jaccard Score'] = jaccard_score(y_true, y_pred, average='weighted', zero_division=1)
    metrics['Cohen Kappa Score'] = cohen_kappa_score(y_true, y_pred)
    metrics['Hamming Loss'] = hamming_loss(y_true, y_pred)
    metrics['Zero One Loss'] = zero_one_loss(y_true, y_pred)
    metrics['Mean Absolute Error'] = mean_absolute_error(y_true, y_pred)
    metrics['Mean Squared Error'] = mean_squared_error(y_true, y_pred)
    metrics['Root Mean Squared Error'] = np.sqrt(metrics['Mean Squared Error'])
    metrics['Balanced Accuracy'] = balanced_accuracy_score(y_true, y_pred)
    metrics['R2 Score'] = r2_score(y_true, y_pred)
    return metrics
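
# Quick sanity check on compute_metrics (hypothetical labels, shown as a
# sketch rather than part of the benchmark):
#   compute_metrics([0, 1, 1], [0, 1, 0])['Accuracy']  # -> 0.666...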
# Train/evaluate one model on one fold and append a row to the results CSV
def run_algorithm(algo_name, model, X_train, y_train, X_test, y_test, fold):
    try:
        start_train = time.time()
        model.fit(X_train, y_train)
        train_time = time.time() - start_train

        start_test = time.time()
        y_pred = model.predict(X_test)
        test_time = time.time() - start_test

        if algo_name == 'ElasticNet':  # ElasticNet is a regressor: only regression metrics apply
            metrics = {}
            metrics['Mean Absolute Error'] = mean_absolute_error(y_test, y_pred)
            metrics['Mean Squared Error'] = mean_squared_error(y_test, y_pred)
            metrics['Root Mean Squared Error'] = np.sqrt(metrics['Mean Squared Error'])
            metrics['R2 Score'] = r2_score(y_test, y_pred)
        else:
            metrics = compute_metrics(y_test, y_pred)
        metrics.update({'Train Time (s)': train_time, 'Test Time (s)': test_time})

        # Log results to CSV; columns missing from `metrics` are filled with -1
        with open(output_file, 'a', newline='') as f:
            writer = csv.writer(f)
            writer.writerow([algo_name, fold] + [metrics.get(m, -1) for m in csv_columns[2:]])
        print(f"{algo_name} | Fold: {fold} | Train Time: {train_time:.2f}s | Test Time: {test_time:.2f}s")
    except Exception:
        # On failure, log a row of -1s so the CSV stays rectangular
        with open(output_file, 'a', newline='') as f:
            writer = csv.writer(f)
            writer.writerow([algo_name, fold] + [-1 for _ in csv_columns[2:]])
        print(f"Error in {algo_name}: {traceback.format_exc()}")
# Load dataset: all columns except the last are features, the last is the label
df = pd.read_csv(input_file)
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Label-encode categorical feature columns
# (alternative: X = pd.get_dummies(X) for one-hot encoding)
label_encoder = LabelEncoder()
for column in X.columns:
    if X[column].dtype == 'object' or X[column].dtype.name == 'category':
        X[column] = label_encoder.fit_transform(X[column])
y = LabelEncoder().fit_transform(y)

# Impute missing values iteratively, then scale features to [0, 1]
imputer = IterativeImputer()
X = imputer.fit_transform(X)
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

# Optionally work on a stratified subset: train_test_split's "test" share
# (dataset_percent %) becomes the working data
if dataset_percent < 100:
    _, X, _, y = train_test_split(X, y, test_size=dataset_percent / 100, stratify=y)

# Write the CSV header once, if the results file does not exist yet
if not os.path.exists(output_file):
    with open(output_file, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(csv_columns)

# K-fold cross-validation splitter
kf = KFold(n_splits=k_fold, shuffle=True, random_state=42)
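
# Note: results are appended (mode 'a') and the header is only written when
# the file is absent, so rows accumulate across runs; delete or rename the
# results CSV to start a fresh sheet.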
# Deep-learning model builders (CNN, RNN, LSTM, GRU, autoencoder)
def create_cnn(input_shape):
    model = Sequential()
    model.add(Conv2D(32, (3, 3), activation='relu', input_shape=input_shape))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Flatten())
    model.add(Dense(128, activation='relu'))
    model.add(Dense(10, activation='softmax'))  # 10-class output head
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

def create_rnn(input_shape):
    model = Sequential()
    model.add(SimpleRNN(128, input_shape=input_shape, return_sequences=True))
    model.add(Flatten())
    model.add(Dense(10, activation='softmax'))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

def create_lstm(input_shape):
    model = Sequential()
    model.add(LSTM(128, input_shape=input_shape, return_sequences=True))
    model.add(Flatten())
    model.add(Dense(10, activation='softmax'))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

def create_gru(input_shape):
    model = Sequential()
    model.add(GRU(128, input_shape=input_shape, return_sequences=True))
    model.add(Flatten())
    model.add(Dense(10, activation='softmax'))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

def create_autoencoder(input_shape):
    model = Sequential()
    model.add(Dense(128, input_shape=input_shape, activation='relu'))
    model.add(Dense(input_shape[0], activation='sigmoid'))  # reconstruct the input
    model.compile(optimizer='adam', loss='mse')
    return model
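
# None of the Keras builders above are wired into the k-fold loop below; they
# are left as building blocks. A minimal (hypothetical) use on image-shaped
# data could look like:
#   cnn = create_cnn((28, 28, 1))  # assumes 28x28 single-channel images
#   cnn.fit(X_img, to_categorical(y_img), epochs=5, batch_size=32)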
# Algorithms to benchmark
algorithms = {
    'Naive Bayes': GaussianNB(),
    'LDA': LinearDiscriminantAnalysis(),
    'QDA': QuadraticDiscriminantAnalysis(),
    # 'SVM': SVC(kernel='linear', max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(),
    'SGD Classifier': SGDClassifier(),
    'KNN': KNeighborsClassifier(),
    'ElasticNet': ElasticNet(),  # regressor; handled separately in run_algorithm
    'Perceptron': Perceptron(),
    'Logistic Regression': LogisticRegression(),
    'Bagging': BaggingClassifier(),
    'K-Means': KMeans(n_clusters=3),  # unsupervised; see caveat below
    'Nearest Centroid Classifier': NearestCentroid(),
    'XGBoost': XGBClassifier(),
    'AdaBoost': AdaBoostClassifier(),
    'RBM + Logistic Regression': Pipeline(steps=[
        ('rbm', BernoulliRBM(n_components=100, learning_rate=0.06, n_iter=10, random_state=42)),
        ('logistic', LogisticRegression())]),
    'Voting Classifier': VotingClassifier(estimators=[
        ('lr', LogisticRegression()),
        ('rf', RandomForestClassifier()),
        ('gnb', GaussianNB())], voting='hard'),
    'Random Forest': RandomForestClassifier(n_estimators=10),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=10),
    'Stacking Classifier': StackingClassifier(estimators=[
        ('log_reg', LogisticRegression()),
        ('knn', KNeighborsClassifier(n_neighbors=3))],
        final_estimator=LogisticRegression(), n_jobs=-1),
    'LightGBM': LGBMClassifier(),
    'CatBoost': CatBoostClassifier(verbose=0),  # silence per-iteration logging
    'Self-Training': SelfTrainingClassifier(LogisticRegression()),
    'Isolation Forest': IsolationForest(),  # anomaly detector; see caveat below
    'Extra Trees Classifier': ExtraTreesClassifier(n_estimators=100),
    'HistGradientBoostingClassifier': HistGradientBoostingClassifier(),
}
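
# Caveat (noted above): KMeans.predict returns cluster IDs and
# IsolationForest.predict returns -1/1 anomaly flags, so the classification
# metrics computed for those two entries do not line up with the encoded
# class labels and should be read with care.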
# Run every algorithm on every fold
for fold, (train_idx, test_idx) in enumerate(kf.split(X), 1):
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]
    for algo_name, model in algorithms.items():
        run_algorithm(algo_name, model, X_train, y_train, X_test, y_test, fold)