Advertisement
mayankjoin3

Untitled

Jun 17th, 2024
33
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 7.41 KB | None | 0 0
  1.  
  2. from configuration import *
  3. ## **IMPORT LIBRARIES**
  4. # importing required libraries
  5. import numpy as np
  6. import pandas as pd
  7.  
  8. import seaborn as sns
  9. import matplotlib.pyplot as plt
  10. import time
  11. import os
  12. from itertools import combinations
  13.  
  14. import pickle
  15. from os import path
  16.  
  17. import pandas as pd
  18. from sklearn.model_selection import train_test_split
  19. from imblearn.over_sampling import RandomOverSampler
  20. from sklearn.ensemble import RandomForestClassifier
  21. from sklearn.metrics import classification_report
  22. from imblearn.under_sampling import EditedNearestNeighbours, TomekLinks
  23. from imblearn.over_sampling import SMOTE
  24. from imblearn.combine import SMOTEENN, SMOTETomek
  25.  
  26. from sklearn.preprocessing import MinMaxScaler
  27. from sklearn.preprocessing import StandardScaler
  28. from sklearn.preprocessing import LabelEncoder
  29. from sklearn.pipeline import Pipeline
  30. from sklearn.metrics import ConfusionMatrixDisplay
  31. from sklearn.metrics import multilabel_confusion_matrix
  32. from sklearn.metrics import confusion_matrix
  33.  
  34. from sklearn import metrics
  35. from sklearn import preprocessing
  36. from sklearn.metrics import accuracy_score
  37. from sklearn.model_selection import train_test_split
  38. from sklearn.metrics import classification_report
  39.  
  40. from sklearn.tree import DecisionTreeClassifier
  41.  
  42. ## **Importing Datasets**
  43. filename = dataset_name
  44. featurename="ABC"
  45. train_data = pd.read_csv(dataset_path +'/'+ train_dataset_name, sep=',', encoding='utf-8')
  46. test_data = pd.read_csv(dataset_path +'/'+ test_dataset_name, sep=',', encoding='utf-8')
  47. X_train = train_data.drop(columns=['label'],axis=1)
  48. y_train = train_data['label']
  49. X_test = test_data.drop(columns=['label'],axis=1)
  50. y_test = test_data['label']
  51. ## **Feature selection Wrapper Methods**
  52. # FS_TOOL
  53. # No. Abbreviation Name Extra Parameters
  54. # *13 hho Harris Hawk Optimization No
  55. # *12 ssa Salp Swarm Algorithm No
  56. # *11 woa Whale Optimization Algorithm Yes
  57. # *10 sca Sine Cosine Algorithm Yes
  58. # *09 ja Jaya Algorithm No
  59. # *08 gwo Grey Wolf Optimizer No
  60. # *07 fpa Flower Pollination Algorithm Yes
  61. # *06 ba Bat Algorithm Yes
  62. # *05 fa Firefly Algorithm Yes
  63. # *04 cs Cuckoo Search Algorithm Yes
  64. # *03 de Differential Evolution Yes
  65. # *02 pso Particle Swarm Optimization Yes
  66. # *01 ga Genetic Algorithm Yes
  67. # %1 of train data taken as input to optimization
  68. X_t,temp1,y_t,temp2 = train_test_split(X_train,y_train,train_size=opt_percent, random_state=7)
  69. del temp1,temp2
  70. feature_name = output_path+"/"+filename+"_"+featurename+"_feature.csv"
  71. file = open(feature_name, 'w')
  72. file.write("optimization,execution time of optimzier,no of feature selected,selected feature \n")
  73. file.write(featurename+",")
  74. file.close()
  75. from FS.abc import jfs # change this to switch algorithm
  76. # split data into train & validation (70 -- 30)
  77. feat = np.asarray(X_t)
  78. label= np.asarray(y_t)
  79. del X_t,y_t
  80. xtrain, xtest, ytrain, ytest = train_test_split(feat, label, test_size=0.3, stratify=label)
  81. fold = {'xt':xtrain, 'yt':ytrain, 'xv':xtest, 'yv':ytest}
  82.  
  83. # parameter
  84. k = 5 # k-value in KNN
  85. N = 10 # number of chromosomes
  86. T = 5 # maximum number of generations
  87. # Extra parameters of listed methods other than population size / number of solutions and maximum number of iterations
  88.  
  89. # Flower Pollination Algorithm (FPA)
  90. # FPA contains 1 extra parameter
  91.  
  92. P = 0.8 # switch probability
  93. opts = {'k':k, 'fold':fold, 'N':N, 'T':T, 'P':P}
  94.  
  95. # perform feature selection
  96. import time
  97. start_time = time.time()
  98. fmdl = jfs(feat, label, opts)
  99. end_time = time.time()
  100. sf = fmdl['sf']
  101.  
  102. # sf is selected_feature
  103. sf = fmdl['sf']
  104. exe_time = end_time - start_time
  105.  
  106.  
  107. file = open(feature_name, 'a')
  108. file.write(str(exe_time) +",")
  109. file.write(str(len(sf)) +",")
  110. file.write("\"")
  111. column_headers = list(X_train.columns.values)
  112. for i in sf:
  113. file.write(column_headers[i]+",")
  114. file.write("\"\n")
  115. file.close()
  116. ## **Selection of feature**
  117. feature_df = pd.read_csv(feature_name, sep=',', encoding='utf-8')
  118. selected_feature = feature_df.iat[0, 3]
  119. selected_feature = selected_feature[0:-1]
  120. selected_feature
  121. selected_feature = list(selected_feature.split(","))
  122. selected_feature
  123. X_train = X_train[selected_feature]
  124. X_train
  125. X_test = X_test[selected_feature]
  126. X_test
  127.  
  128.  
  129. # Save the filtered test dataset
  130.  
  131. test_data_filtered = pd.concat([X_test, y_test], axis=1)
  132. dataSetName = output_path+"/"+ filename + "_"+ featurename + "_test.csv"
  133. test_data_filtered.to_csv(dataSetName, index=False)
  134.  
  135. train_data_filtered = pd.concat([X_train, y_train], axis=1)
  136. dataSetName = output_path+"/"+ filename + "_"+ featurename + "_No_SMOTE.csv"
  137. train_data_filtered.to_csv(dataSetName, index=False)
  138.  
  139. # Further split the filtered training data and apply SMOTEENN for oversampling
  140. X_train, temp1, y_train, temp2 = train_test_split(X_train, y_train, train_size=smote_percent, random_state=7)
  141. del temp1, temp2
  142.  
  143. # Apply SMOTEENN to balance the dataset
  144. sm = SMOTE(sampling_strategy='auto', k_neighbors=4, n_jobs=2)
  145. enn = EditedNearestNeighbours(sampling_strategy='auto', n_neighbors=3, kind_sel='all', n_jobs=2)
  146. smenn = SMOTEENN(sampling_strategy='auto', smote=sm, enn=enn, n_jobs=4)
  147. X_train_oversampled, y_train_oversampled = smenn.fit_resample(X_train, y_train)
  148.  
  149. # Combine balanced features and labels into a single DataFrame
  150. balanced_df = pd.DataFrame(X_train_oversampled, columns=X_train.columns)
  151. balanced_df['label'] = y_train_oversampled
  152.  
  153. dataSetName = output_path+"/"+ filename + "_"+ featurename + "_SMOTE_ENN.csv"
  154. # Save the entire balanced dataset with SMOTEENN applied
  155. balanced_df.to_csv(dataSetName, index=False)
  156.  
  157. # Optionally, apply SMOTETomek for another resampling technique
  158. smt = SMOTETomek(random_state=42)
  159. X_train_resampled, y_train_resampled = smt.fit_resample(X_train, y_train)
  160.  
  161. # Combine balanced features and labels into a single DataFrame
  162. balanced_df_ipf = pd.DataFrame(X_train_resampled, columns=X_train.columns)
  163. balanced_df_ipf['label'] = y_train_resampled
  164. balanced_df_ipf.reset_index(drop=True, inplace=True)
  165.  
  166. dataSetName = output_path+"/"+ filename + "_"+ featurename + "_SMOTE_IPF.csv"
  167.  
  168. # Save the balanced dataset with SMOTETomek applied
  169. balanced_df_ipf.to_csv(dataSetName, index=False)
  170.  
  171.  
  172. import smote_variants as sv
  173. import pandas as pd
  174.  
  175.  
  176. algorithms = [
  177. "Supervised_SMOTE", "Safe_Level_SMOTE", "RWO_sampling", "ROSE",
  178. "SMOTE_OUT", "SMOTE_Cosine", "Selected_SMOTE", "SN_SMOTE", "CCR"
  179. ]
  180.  
  181. for algorithm in algorithms:
  182. try:
  183. oversampler = sv.MulticlassOversampling(oversampler=algorithm,
  184. oversampler_params={'random_state': 5})
  185.  
  186. # X_samp and y_samp contain the oversampled dataset
  187. X_samp, y_samp = oversampler.sample(X_train, y_train)
  188.  
  189. # Create DataFrame from oversampled data
  190. oversampled_df = pd.DataFrame(data=X_samp, columns=[f'feature_{i}' for i in range(X_samp.shape[1])])
  191. oversampled_df['label'] = y_samp
  192. oversampled_df.reset_index(drop=True, inplace=True) # Reset the index
  193.  
  194. # Define output CSV file name
  195. dataSetName = f"{output_path}/{filename}_{featurename}_{algorithm}.csv"
  196. oversampled_df.to_csv(dataSetName, index=False)
  197.  
  198. print(f'Oversampled dataset saved to {dataSetName}')
  199. except Exception as e:
  200. print(f"Error processing {algorithm}: {str(e)}")
  201. continue
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement