Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import json
import os
from datetime import datetime

import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
# --- Configuration ---------------------------------------------------------

# Input and output locations.
DATA_DIR = 'dataset/'
TRAIN_FILE = os.path.join(DATA_DIR, 'kaggle_train.csv')
TEST_FILE = os.path.join(DATA_DIR, 'kaggle_test.csv')
LOG_FILE = 'svc_cv_log.txt'  # not referenced by the visible part of the script
OUTPUT_FILENAME = 'submission_svc_fixed_cv.csv'

# Run labels and the seed shared by every randomised component below.
MODEL_NAME = "SVC"
SEED = 42

# `C` of the L1-penalised logistic regression that drives feature selection.
L1_C_VAL = 0.5
FEATURE_SETUP_NAME = f"L1_C{L1_C_VAL}"

# Number of stratified folds used for the cross-validated AUC estimate.
N_SPLITS = 20

# Hyper-parameters passed unchanged to every SVC the script builds.
# `probability=True` is required because the script scores with predict_proba.
FIXED_PARAMS = {
    "kernel": "linear",
    "C": 0.025,
    "random_state": SEED,
    "class_weight": "balanced",
    "probability": True,
}

np.random.seed(SEED)

# The CSV files carry no header row; feature columns are named positionally.
n_features = 318
feature_cols = [f'feature_{i}' for i in range(n_features)]
# --- Load data -------------------------------------------------------------
# Both files are read without a header row. The first column is the label
# (train) or the row ID (test); the remaining columns are the features.
# Assigning `.columns` raises ValueError if a file does not have exactly
# 1 + n_features columns.
df_train_raw = pd.read_csv(TRAIN_FILE, header=None)
df_train_raw.columns = ['target'] + feature_cols
df_test = pd.read_csv(TEST_FILE, header=None)
df_test.columns = ['ID'] + feature_cols
test_ids = df_test['ID']
df_test_features = df_test[feature_cols]

# --- Clean labels ----------------------------------------------------------
# Keep only rows whose label is present and is exactly 0 or 1, then store the
# label as an integer.
df_train_cleaned = df_train_raw.dropna(subset=['target'])
valid_targets = [0.0, 1.0, 0, 1]
df_train_cleaned = df_train_cleaned[df_train_cleaned['target'].isin(valid_targets)].copy()
df_train_cleaned['target'] = df_train_cleaned['target'].astype(int)

y = df_train_cleaned['target']
X = df_train_cleaned[feature_cols]
X_test = df_test_features

# --- Impute missing feature values -----------------------------------------
# Column means are computed once, from the training rows only, and reused for
# the test set so both are filled with the same values.
train_means = X.mean()
X = X.fillna(train_means)
X_test = X_test.fillna(train_means)
print(f"X={X.shape}, y={y.shape}, X_test={X_test.shape}")
# --- L1-based feature selection --------------------------------------------
# Features are standardised first so the L1 penalty treats every column on
# the same scale. This scaler is used for selection only.
scaler_fs = StandardScaler()
X_scaled_fs = scaler_fs.fit_transform(X)

l1_model = LogisticRegression(
    C=L1_C_VAL,
    penalty='l1',
    solver='liblinear',
    random_state=SEED,
    max_iter=200,
    class_weight='balanced',
)
# `threshold=1e-5` is handed to SelectFromModel as the importance cut-off that
# decides which features survive.
selector = SelectFromModel(l1_model, threshold=1e-5)
selector.fit(X_scaled_fs, y)

l1_selected_mask = selector.get_support()
num_l1_features = l1_selected_mask.sum()
print(f"Selected {num_l1_features} features.")
if num_l1_features == 0:
    # Fail here with a clear message instead of letting the scaler or the SVC
    # fail later on an empty feature matrix.
    raise ValueError(
        f"L1 feature selection with C={L1_C_VAL} kept no features; "
        "increase L1_C_VAL or lower the selection threshold."
    )

# The mask is applied to the unscaled frames; later stages fit their own
# scalers on the reduced feature set.
X_selected = X.loc[:, l1_selected_mask]
X_test_selected = X_test.loc[:, l1_selected_mask]
# --- Cross-validated AUC estimate ------------------------------------------
# Every fold re-runs the whole modelling procedure (scaling -> L1 feature
# selection -> linear SVC) on its training rows only. Selecting features once
# on the full training set and then cross-validating lets the validation
# labels influence which features are kept, which inflates the reported AUC.
# NOTE(review): X was mean-imputed upstream using all training rows; that step
# uses no labels but is still fitted outside the folds.
oof_auc_scores = []
cv = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
for train_idx, val_idx in cv.split(X, y):
    X_train_fold, X_val_fold = X.iloc[train_idx], X.iloc[val_idx]
    y_train_fold, y_val_fold = y.iloc[train_idx], y.iloc[val_idx]

    # A fresh pipeline per fold, so nothing fitted on one fold carries over.
    fold_pipeline = Pipeline([
        ("scale", StandardScaler()),
        ("select", SelectFromModel(
            LogisticRegression(
                C=L1_C_VAL,
                penalty='l1',
                solver='liblinear',
                random_state=SEED,
                max_iter=200,
                class_weight='balanced',
            ),
            threshold=1e-5,
        )),
        ("svc", SVC(**FIXED_PARAMS)),
    ])
    fold_pipeline.fit(X_train_fold, y_train_fold)

    # Column 1 of predict_proba is the second entry of the fitted classes,
    # i.e. label 1 once the training fold contains both 0 and 1.
    val_proba = fold_pipeline.predict_proba(X_val_fold)[:, 1]
    fold_auc = roc_auc_score(y_val_fold, val_proba)
    oof_auc_scores.append(fold_auc)

# Mean and population standard deviation of the per-fold AUCs.
avg_cv_auc = np.mean(oof_auc_scores) if oof_auc_scores else 0.0
std_cv_auc = np.std(oof_auc_scores) if oof_auc_scores else 0.0
print(f"Average CV AUC: {avg_cv_auc:.5f}")
print(f"Std Dev CV AUC: {std_cv_auc:.5f}")
# --- Final model on the full training set ----------------------------------
# The scaler is fitted on the selected training features and then applied
# unchanged to the test features.
final_scaler = StandardScaler()
X_train_scaled_final = final_scaler.fit_transform(X_selected)
X_test_scaled_final = final_scaler.transform(X_test_selected)

final_model = SVC(**FIXED_PARAMS)
final_model.fit(X_train_scaled_final, y)

# AUC on the training rows themselves: a sanity check only, not an estimate
# of performance on unseen data.
train_proba = final_model.predict_proba(X_train_scaled_final)[:, 1]
train_auc = roc_auc_score(y, train_proba)

# Probability of the positive class for each test row.
test_preds = final_model.predict_proba(X_test_scaled_final)[:, 1]
print(f"Final model Train AUC: {train_auc:.5f}")

# Nothing in the visible script reads this flag; the name is kept so the
# module-level state after a successful run matches the original.
training_successful = True
# --- Write the submission file ---------------------------------------------
# Two columns: the integer row ID read from the test file and the predicted
# probability of the positive class, written with 8 decimal places.
submission_df = pd.DataFrame({
    'ID': test_ids.astype(int),
    'Labels': test_preds.astype(float),
})
submission_df.to_csv(OUTPUT_FILENAME, index=False, float_format='%.8f')
print(f"Submission saved to: {OUTPUT_FILENAME}")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement