Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
"""Preprocess every CSV file found in the ``inputFiles`` directory.

For each ``*.csv`` file the script:

1. renames the last column to ``label``;
2. drops every row containing a null or infinite value;
3. min-max scales the numeric feature columns and rounds them to 4 decimals;
4. integer-encodes the ``label`` column;
5. writes the result to ``preprocessed/<name>_preprocessed.csv`` and the
   label-to-integer mapping to ``encoded_label/<name>_encoded_labels.csv``.

Progress is logged both to ``preprocessing.log`` and to the console.
"""

import logging
import os

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# Set up logging: one handler for the log file, one for the console.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    handlers=[
        logging.FileHandler('preprocessing.log'),
        logging.StreamHandler(),
    ],
)


def _clean_frame(df):
    """Return a copy of *df* ready for scaling.

    The last column is renamed to ``label`` and every row holding a null or
    infinite value is removed.

    The index is reset afterwards. Dropping rows leaves gaps in the index,
    and any later index-aligned operation (such as ``pd.concat(axis=1)``)
    against freshly built frames would otherwise pair up the wrong rows.
    """
    cleaned = df.rename(columns={df.columns[-1]: 'label'})
    cleaned = cleaned.replace([np.inf, -np.inf], np.nan).dropna()
    return cleaned.reset_index(drop=True)


def _scale_features(features):
    """Return *features* with its numeric columns min-max scaled.

    Numeric columns are scaled with ``MinMaxScaler`` and rounded to 4 decimal
    places; non-numeric columns are passed through untouched. In the returned
    frame the numeric columns come first, followed by the non-numeric ones.
    """
    numeric = features.select_dtypes(include=[np.number])
    non_numeric = features.select_dtypes(exclude=[np.number])

    # MinMaxScaler rejects an input with zero columns, so there is nothing
    # to scale when the file has no numeric features.
    if numeric.shape[1] == 0:
        return non_numeric.copy()

    # Re-use the source index so the scaled values stay aligned row-for-row
    # with the non-numeric columns when the two frames are concatenated.
    scaled = pd.DataFrame(
        MinMaxScaler().fit_transform(numeric),
        columns=numeric.columns,
        index=numeric.index,
    )
    # The scaler always yields floating-point output, so every column is
    # rounded; no per-column dtype check is needed.
    scaled = scaled.round(4)

    return pd.concat([scaled, non_numeric], axis=1)


# Create directories for the preprocessed files and label encodings if they do not exist
os.makedirs('preprocessed', exist_ok=True)
os.makedirs('encoded_label', exist_ok=True)

# Get the list of all files in the inputFiles directory (sorted so the
# processing and log order is deterministic across platforms).
input_files = sorted(os.listdir('inputFiles'))

for file_name in input_files:
    if not file_name.endswith('.csv'):  # Only CSV files are processed
        continue

    logging.info("Processing file: %s", file_name)
    df = _clean_frame(pd.read_csv(os.path.join('inputFiles', file_name)))

    # Neither the scaler nor the label encoder can be fitted on zero rows.
    if df.empty:
        logging.warning(
            "Skipping %s: no rows left after removing null/infinite values",
            file_name,
        )
        continue

    # Separate features and label, then scale the features
    processed_features = _scale_features(df.drop('label', axis=1))

    # Encode labels as consecutive integers
    label_encoder = LabelEncoder()
    processed_features['label'] = label_encoder.fit_transform(df['label'])

    # Strip only the real extension; str.replace would also rewrite any
    # other '.csv' occurring earlier in the file name.
    base_name = os.path.splitext(file_name)[0]

    # Save the processed dataset to a CSV file in the preprocessed directory
    preprocessed_file_path = os.path.join('preprocessed', base_name + '_preprocessed.csv')
    processed_features.to_csv(preprocessed_file_path, index=False)

    # Save the label encodings to a CSV file in the label directory; the
    # encoded value of a class is its position in label_encoder.classes_.
    label_file_path = os.path.join('encoded_label', base_name + '_encoded_labels.csv')
    label_mappings = pd.DataFrame(list(label_encoder.classes_), columns=['OriginalLabel'])
    label_mappings['EncodedLabel'] = label_mappings.index
    label_mappings.to_csv(label_file_path, index=False)

    logging.info("Processed dataset has been saved to %s", preprocessed_file_path)
    logging.info("Label encodings have been saved to %s", label_file_path)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement