Advertisement
mayankjoin3

ml generic abhinav+2

Oct 19th, 2024
70
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 10.71 KB | None | 0 0
  1. import pandas as pd
  2. import numpy as np
  3. import time
  4. from sklearn.ensemble import ExtraTreesClassifier, HistGradientBoostingClassifier
  5. from sklearn.model_selection import train_test_split, KFold
  6. from sklearn.preprocessing import LabelEncoder, MinMaxScaler
  7. from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, fbeta_score, matthews_corrcoef
  8. from sklearn.metrics import jaccard_score, cohen_kappa_score, hamming_loss, zero_one_loss, mean_absolute_error
  9. from sklearn.metrics import mean_squared_error, r2_score, balanced_accuracy_score
  10. from sklearn.tree import DecisionTreeClassifier
  11. from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier, StackingClassifier
  12. from sklearn.neighbors import KNeighborsClassifier
  13. from sklearn.svm import SVC, OneClassSVM
  14. from sklearn.naive_bayes import GaussianNB
  15. from sklearn.linear_model import LogisticRegression, SGDClassifier, Perceptron, ElasticNet
  16. from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
  17. from sklearn.neural_network import MLPClassifier
  18. from xgboost import XGBClassifier
  19. from lightgbm import LGBMClassifier
  20. from catboost import CatBoostClassifier
  21. from sklearn.cluster import KMeans, AgglomerativeClustering
  22. from sklearn.gaussian_process import GaussianProcessClassifier
  23. from sklearn.neighbors import NearestCentroid
  24. from sklearn.mixture import GaussianMixture
  25. from sklearn.ensemble import IsolationForest
  26. from sklearn.pipeline import Pipeline
  27. from sklearn.neural_network import BernoulliRBM
  28. from sklearn.experimental import enable_iterative_imputer  # Enable the Iterative Imputer
  29. from sklearn.impute import IterativeImputer  # Import Iterative Imputer
  30. from keras.models import Sequential
  31. from keras.layers import Dense, Conv2D, MaxPooling2D, Flatten, LSTM, GRU, SimpleRNN
  32. from keras.utils import to_categorical
  33. import traceback
  34. import csv
  35. import warnings
  36. from sklearn.semi_supervised import SelfTrainingClassifier
  37.  
  38. # Custom imports
  39. # from pgmpy.models import BayesianNetwork
  40.  # For Bayesian Networks
  41. # import geneticalgorithm as ga
  42.  # For Genetic Algorithm-based Classifier (hypothetical)
  43. # Add fuzzy logic and CRF imports if you have specific packages (e.g., `python-crfsuite` for CRF)
  44.  
  45. warnings.filterwarnings("ignore")  # Suppress warnings for cleaner output
  46.  
  47. # Initialize dataset parameters
  48. k_fold = 5  # Change as needed
  49. dataset_percent = 10  # Change as needed
  50. input_file = 'input.csv'  # Input dataset
  51.  
  52. # Initialize CSV file and columns
  53. # output_file = 'results.csv'
  54. output_file = input_file.replace('.csv', '_results.csv')
  55. csv_columns = ['Algorithm', 'Fold', 'Train Time (s)', 'Test Time (s)', 'Accuracy', 'Precision', 'Recall', 'F1 Score',
  56.                'Fbeta Score', 'Matthews Correlation Coefficient', 'Jaccard Score', 'Cohen Kappa Score',
  57.                'Hamming Loss', 'Zero One Loss', 'Mean Absolute Error', 'Mean Squared Error', 'Root Mean Squared Error',
  58.                'Balanced Accuracy', 'R2 Score']
  59.  
  60. # Function to handle metric calculation
  61. def compute_metrics(y_true, y_pred):
  62.     metrics = {}
  63.     metrics['Accuracy'] = accuracy_score(y_true, y_pred)
  64.     metrics['Precision'] = precision_score(y_true, y_pred, average='weighted', zero_division=1)
  65.     metrics['Recall'] = recall_score(y_true, y_pred, average='weighted', zero_division=1)
  66.     metrics['F1 Score'] = f1_score(y_true, y_pred, average='weighted', zero_division=1)
  67.     metrics['Fbeta Score'] = fbeta_score(y_true, y_pred, beta=1.0, average='weighted', zero_division=1)
  68.     metrics['Matthews Correlation Coefficient'] = matthews_corrcoef(y_true, y_pred)
  69.     metrics['Jaccard Score'] = jaccard_score(y_true, y_pred, average='weighted', zero_division=1)
  70.     metrics['Cohen Kappa Score'] = cohen_kappa_score(y_true, y_pred)
  71.     metrics['Hamming Loss'] = hamming_loss(y_true, y_pred)
  72.     metrics['Zero One Loss'] = zero_one_loss(y_true, y_pred)
  73.     metrics['Mean Absolute Error'] = mean_absolute_error(y_true, y_pred)
  74.     metrics['Mean Squared Error'] = mean_squared_error(y_true, y_pred)
  75.     metrics['Root Mean Squared Error'] = np.sqrt(metrics['Mean Squared Error'])
  76.     metrics['Balanced Accuracy'] = balanced_accuracy_score(y_true, y_pred)
  77.     metrics['R2 Score'] = r2_score(y_true, y_pred)
  78.    
  79.     return metrics
  80.  
  81. # Function to handle each algorithm execution
  82. def run_algorithm(algo_name, model, X_train, y_train, X_test, y_test, fold):
  83.     try:
  84.         # print(f"Running {algo_name} on Fold {fold}...")  # Track progress
  85.         start_train = time.time()
  86.         model.fit(X_train, y_train)
  87.         train_time = time.time() - start_train
  88.  
  89.         start_test = time.time()
  90.         y_pred = model.predict(X_test)
  91.         test_time = time.time() - start_test
  92.  
  93.         # Compute metrics
  94.         if algo_name == 'ElasticNet':  # Handle ElasticNet as a regression model
  95.             metrics = {}
  96.             metrics['Mean Absolute Error'] = mean_absolute_error(y_test, y_pred)
  97.             metrics['Mean Squared Error'] = mean_squared_error(y_test, y_pred)
  98.             metrics['Root Mean Squared Error'] = np.sqrt(metrics['Mean Squared Error'])
  99.             metrics['R2 Score'] = r2_score(y_test, y_pred)
  100.         else:
  101.             # Compute classification metrics
  102.             metrics = compute_metrics(y_test, y_pred)
  103.         # metrics = compute_metrics(y_test, y_pred)
  104.         metrics.update({'Train Time (s)': train_time, 'Test Time (s)': test_time})
  105.        
  106.         # Log results to CSV
  107.         with open(output_file, 'a', newline='') as f:
  108.             writer = csv.writer(f)
  109.             writer.writerow([algo_name, fold] + [metrics.get(m, -1) for m in csv_columns[2:]])
  110.  
  111.         print(f"{algo_name} | Fold: {fold} | Train Time: {train_time:.2f}s | Test Time: {test_time:.2f}s")
  112.        
  113.     except Exception as e:
  114.         # Log error case
  115.         with open(output_file, 'a', newline='') as f:
  116.             writer = csv.writer(f)
  117.             writer.writerow([algo_name, fold] + [-1 for _ in csv_columns[2:]])
  118.         print(f"Error in {algo_name}: {traceback.format_exc()}")
  119.  
  120. # Load dataset
  121. df = pd.read_csv(input_file)
  122. X = df.iloc[:, :-1]
  123. y = df.iloc[:, -1]
  124.  
  125. # Encode categorical features
  126. # X = pd.get_dummies(X)
  127. label_encoder = LabelEncoder()
  128. # Apply label encoding only to categorical columns
  129. for column in X.columns:
  130.     if X[column].dtype == 'object' or X[column].dtype.name == 'category':
  131.         X[column] = label_encoder.fit_transform(X[column])
  132. y = LabelEncoder().fit_transform(y)
  133.  
  134. # Apply iterative imputation to handle missing data
  135. imputer = IterativeImputer()
  136. X = imputer.fit_transform(X)
  137.  
  138. # Min-Max scaling
  139. scaler = MinMaxScaler()
  140. X = scaler.fit_transform(X)
  141.  
  142. # Take a subset of the dataset if dataset_percent is less than 100
  143. if dataset_percent < 100:
  144.     _, X, _, y = train_test_split(X, y, test_size=dataset_percent/100, stratify=y)
  145.  
  146. # Prepare CSV header if not present
  147. if not pd.io.common.file_exists(output_file):
  148.     with open(output_file, 'w', newline='') as f:
  149.         writer = csv.writer(f)
  150.         writer.writerow(csv_columns)
  151.  
  152. # K-Fold Cross Validation
  153. kf = KFold(n_splits=k_fold, shuffle=True, random_state=42)
  154.  
  155. # Deep Learning (CNN, RNN, LSTM, GRU, Autoencoders)
  156. def create_cnn(input_shape):
  157.     model = Sequential()
  158.     model.add(Conv2D(32, (3, 3), activation='relu', input_shape=input_shape))
  159.     model.add(MaxPooling2D(pool_size=(2, 2)))
  160.     model.add(Flatten())
  161.     model.add(Dense(128, activation='relu'))
  162.     model.add(Dense(10, activation='softmax'))
  163.     model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
  164.     return model
  165.  
  166. def create_rnn(input_shape):
  167.     model = Sequential()
  168.     model.add(SimpleRNN(128, input_shape=input_shape, return_sequences=True))
  169.     model.add(Flatten())
  170.     model.add(Dense(10, activation='softmax'))
  171.     model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
  172.     return model
  173.  
  174. def create_lstm(input_shape):
  175.     model = Sequential()
  176.     model.add(LSTM(128, input_shape=input_shape, return_sequences=True))
  177.     model.add(Flatten())
  178.     model.add(Dense(10, activation='softmax'))
  179.     model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
  180.     return model
  181.  
  182. def create_gru(input_shape):
  183.     model = Sequential()
  184.     model.add(GRU(128, input_shape=input_shape, return_sequences=True))
  185.     model.add(Flatten())
  186.     model.add(Dense(10, activation='softmax'))
  187.     model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
  188.     return model
  189.  
  190. def create_autoencoder(input_shape):
  191.     model = Sequential()
  192.     model.add(Dense(128, input_shape=input_shape, activation='relu'))
  193.     model.add(Dense(input_shape[0], activation='sigmoid'))
  194.     model.compile(optimizer='adam', loss='mse')
  195.     return model
  196.  
  197. # List of algorithms
  198. algorithms = {
  199.  
  200.  
  201.     'Naive Bayes': GaussianNB(),
  202.     'LDA': LinearDiscriminantAnalysis(),
  203.     'QDA': QuadraticDiscriminantAnalysis(),
  204.     # 'SVM': SVC(kernel='linear', max_iter=1000),
  205.     'Decision Tree': DecisionTreeClassifier(),
  206.     'SGD Classifier': SGDClassifier(),
  207.     'KNN': KNeighborsClassifier(),
  208.     'ElasticNet': ElasticNet(),
  209.     'Perceptron': Perceptron(),
  210.     'Logistic Regression': LogisticRegression(),
  211.     'Bagging': BaggingClassifier(),
  212.     'K-Means': KMeans(n_clusters=3),
  213.     'Nearest Centroid Classifier': NearestCentroid(),
  214.     'XGBoost': XGBClassifier(),
  215.     'AdaBoost': AdaBoostClassifier(),
  216.     'RBM + Logistic Regression': Pipeline(steps=[('rbm', BernoulliRBM(n_components=100, learning_rate=0.06, n_iter=10, random_state=42)),('logistic', LogisticRegression())]),
  217.     'Voting Classifier': VotingClassifier(estimators=[('lr', LogisticRegression()),('rf', RandomForestClassifier()),('gnb', GaussianNB())], voting='hard'),
  218.     'Random Forest': RandomForestClassifier(n_estimators=10),
  219.     'Gradient Boosting': GradientBoostingClassifier(n_estimators=10),
  220.     'Stacking Classifier': StackingClassifier(estimators=[('log_reg', LogisticRegression()),('knn', KNeighborsClassifier(n_neighbors=3))],final_estimator=LogisticRegression(),n_jobs=-1),
  221.     'LightGBM': LGBMClassifier(),
  222.     'CatBoost': CatBoostClassifier(),
  223.     'Self-Training': SelfTrainingClassifier(LogisticRegression()),
  224.     'Isolation Forest': IsolationForest(),
  225.     'Extra Trees Classifier': ExtraTreesClassifier(n_estimators=100),
  226.     'HistGradientBoostingClassifier': HistGradientBoostingClassifier(),
  227.  
  228. }
  229.  
  230. # Running algorithms in k-fold
  231. for fold, (train_idx, test_idx) in enumerate(kf.split(X), 1):
  232.     X_train, X_test = X[train_idx], X[test_idx]
  233.     y_train, y_test = y[train_idx], y[test_idx]
  234.  
  235.     for algo_name, model in algorithms.items():
  236.         run_algorithm(algo_name, model, X_train, y_train, X_test, y_test, fold)
  237.        
  238.        
  239.        
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement