Advertisement
d3l3t3

Untitled

Apr 16th, 2025
242
0
4 days
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 3.93 KB | None | 0 0
  1. import pandas as pd
  2. import numpy as np
  3. import os
  4. from sklearn.svm import SVC
  5. from sklearn.model_selection import StratifiedKFold
  6. from sklearn.preprocessing import StandardScaler
  7. from sklearn.metrics import roc_auc_score
  8. from sklearn.linear_model import LogisticRegression
  9. from sklearn.feature_selection import SelectFromModel
  10. from datetime import datetime
  11. import json
  12.  
  13. DATA_DIR = 'dataset/'
  14. TRAIN_FILE = os.path.join(DATA_DIR, 'kaggle_train.csv')
  15. TEST_FILE = os.path.join(DATA_DIR, 'kaggle_test.csv')
  16. LOG_FILE = 'svc_cv_log.txt'
  17. OUTPUT_FILENAME = 'submission_svc_fixed_cv.csv'
  18. MODEL_NAME = "SVC"
  19. SEED = 42
  20. L1_C_VAL = 0.5
  21. FEATURE_SETUP_NAME = f"L1_C{L1_C_VAL}"
  22. N_SPLITS = 20
  23.  
  24. FIXED_PARAMS = {"kernel": "linear","C": 0.025,"random_state": SEED,"class_weight": "balanced","probability": True}
  25. np.random.seed(SEED)
  26.  
  27. n_features = 318
  28. feature_cols = [f'feature_{i}' for i in range(n_features)]
  29.  
  30. df_train_raw = pd.read_csv(TRAIN_FILE, header=None)
  31. df_train_raw.columns = ['target'] + feature_cols
  32. df_test = pd.read_csv(TEST_FILE, header=None)
  33. df_test.columns = ['ID'] + feature_cols
  34. test_ids = df_test['ID']
  35. df_test_features = df_test[feature_cols]
  36.  
  37. df_train_cleaned = df_train_raw.dropna(subset=['target'])
  38. valid_targets = [0.0, 1.0, 0, 1]
  39. df_train_cleaned = df_train_cleaned[df_train_cleaned['target'].isin(valid_targets)].copy()
  40. df_train_cleaned['target'] = df_train_cleaned['target'].astype(int)
  41. y = df_train_cleaned['target']
  42. X = df_train_cleaned[feature_cols]
  43. X_test = df_test_features
  44.  
  45. X = X.fillna(X.mean())
  46. X_test = X_test.fillna(X.mean())
  47.  
  48. print(f"X={X.shape}, y={y.shape}, X_test={X_test.shape}")
  49.  
  50. scaler_fs = StandardScaler()
  51. X_scaled_fs = scaler_fs.fit_transform(X)
  52.  
  53. l1_model = LogisticRegression(C=L1_C_VAL, penalty='l1', solver='liblinear', random_state=SEED, max_iter=200, class_weight='balanced')
  54. selector = SelectFromModel(l1_model, threshold=1e-5)
  55. selector.fit(X_scaled_fs, y)
  56.  
  57. l1_selected_mask = selector.get_support()
  58. num_l1_features = l1_selected_mask.sum()
  59. print(f"Selected {num_l1_features} features.")
  60.  
  61. X_selected = X.loc[:, l1_selected_mask]
  62. X_test_selected = X_test.loc[:, l1_selected_mask]
  63.  
  64. oof_auc_scores = []
  65. cv = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
  66.  
  67. for fold, (train_idx, val_idx) in enumerate(cv.split(X_selected, y)):
  68.     X_train_fold, X_val_fold = X_selected.iloc[train_idx], X_selected.iloc[val_idx]
  69.     y_train_fold, y_val_fold = y.iloc[train_idx], y.iloc[val_idx]
  70.  
  71.     scaler_cv = StandardScaler()
  72.     X_train_scaled_fold = scaler_cv.fit_transform(X_train_fold)
  73.     X_val_scaled_fold = scaler_cv.transform(X_val_fold)
  74.  
  75.     model_fold = SVC(**FIXED_PARAMS)
  76.     model_fold.fit(X_train_scaled_fold, y_train_fold)
  77.     val_proba = model_fold.predict_proba(X_val_scaled_fold)[:, 1]
  78.     fold_auc = roc_auc_score(y_val_fold, val_proba)
  79.     oof_auc_scores.append(fold_auc)
  80.  
  81. avg_cv_auc = np.mean(oof_auc_scores) if oof_auc_scores else 0.0
  82. std_cv_auc = np.std(oof_auc_scores) if oof_auc_scores else 0.0
  83. print(f"Average CV AUC: {avg_cv_auc:.5f}")
  84. print(f"Std Dev CV AUC: {std_cv_auc:.5f}")
  85.  
  86. final_scaler = StandardScaler()
  87. X_train_scaled_final = final_scaler.fit_transform(X_selected)
  88. X_test_scaled_final = final_scaler.transform(X_test_selected)
  89.  
  90. final_model = SVC(**FIXED_PARAMS)
  91. training_successful = False
  92. train_auc = 0.0
  93. test_preds = np.zeros(len(X_test_selected))
  94.  
  95. final_model.fit(X_train_scaled_final, y)
  96. train_proba = final_model.predict_proba(X_train_scaled_final)[:, 1]
  97. train_auc = roc_auc_score(y, train_proba)
  98. test_preds = final_model.predict_proba(X_test_scaled_final)[:, 1]
  99. print(f"Final model Train AUC: {train_auc:.5f}")
  100. training_successful = True
  101.  
  102. submission_df = pd.DataFrame({'ID': test_ids, 'Labels': test_preds})
  103. submission_df['ID'] = submission_df['ID'].astype(int)
  104. submission_df['Labels'] = submission_df['Labels'].astype(float)
  105. submission_df.to_csv(OUTPUT_FILENAME, index=False, float_format='%.8f')
  106. print(f"Submission saved to: {OUTPUT_FILENAME}")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement