Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import os
import time

import joblib
import numpy as np
import pandas as pd
import psutil
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Benchmark an ExtraTreesClassifier with 2-fold stratified cross-validation.
# For each fold the script reports training time, prediction latency,
# memory/CPU usage during prediction and the size of the pickled model.

BYTES_PER_MB = 1024 * 1024

# Step 1: Load the data
file_path = "m5.csv"
df = pd.read_csv(file_path)

# 'label' is the target column; every other column is treated as a feature.
X = df.drop(columns=['label'])
y = df['label']

# Preprocessing: label-encode only the non-numeric feature columns.
# Numeric columns are left untouched: casting them to str first would rank
# values lexicographically ("10" < "9") and destroy their natural ordering.
# NOTE(review): missing values in numeric columns are now passed through
# unchanged; impute them first if the installed scikit-learn rejects NaN.
label_encoder = LabelEncoder()
for column in X.select_dtypes(exclude="number").columns:
    # fit_transform refits the encoder, so reusing one instance is safe.
    X[column] = label_encoder.fit_transform(X[column].astype(str))

# Step 2: Initialize the ExtraTreesClassifier
clf = ExtraTreesClassifier(random_state=42, n_jobs=-1)  # n_jobs=-1 uses all cores

# Step 3: Set up 2-Fold Cross-Validation
skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)

# Handle to the current process, used to read its resident memory.
process = psutil.Process(os.getpid())

# Step 4: Evaluate Model Using Cross-Validation
for fold, (train_index, test_index) in enumerate(skf.split(X, y), start=1):
    print(f"Fold {fold}...")
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Step 5: Train the model (fit() discards any state from a previous fold)
    start_time = time.perf_counter()
    clf.fit(X_train, y_train)
    training_time = time.perf_counter() - start_time
    print(f"Training time for fold {fold}: {training_time:.4f} seconds")

    # Steps 6-7: Predict once while measuring latency, memory and CPU.
    memory_before = process.memory_info().rss / BYTES_PER_MB  # in MB
    # Priming call: its return value is meaningless, but it resets psutil's
    # reference point so the next non-blocking call covers exactly the
    # prediction window.
    psutil.cpu_percent(interval=None)
    start_time = time.perf_counter()
    predictions = clf.predict(X_test)
    prediction_time = time.perf_counter() - start_time
    # System-wide CPU utilisation since the priming call. psutil recommends
    # at least ~0.1 s between calls, so very short predictions give a
    # coarse reading.
    cpu_usage = psutil.cpu_percent(interval=None)
    memory_after = process.memory_info().rss / BYTES_PER_MB  # in MB
    memory_usage = memory_after - memory_before

    # prediction_time covers the whole test fold; divide for the per-row cost.
    per_instance_time = prediction_time / len(X_test)
    print(
        f"Prediction time for fold {fold}: {prediction_time:.4f} seconds total "
        f"({per_instance_time * 1e6:.2f} microseconds per instance)"
    )
    print(f"Memory usage during prediction: {memory_usage:.4f} MB")
    print(f"CPU usage during prediction: {cpu_usage:.4f}%")

    # Step 8: Output classification report (disabled in the original script)
    # print("Classification Report:")
    # print(classification_report(y_test, predictions))

    # Step 9: Model size on disk
    model_filename = f'extratrees_model_fold{fold}.pkl'
    joblib.dump(clf, model_filename)
    model_size = os.path.getsize(model_filename) / BYTES_PER_MB  # in MB
    print(f"Model size for fold {fold}: {model_size:.4f} MB")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement