- """
- This script processes all CSV files within the 'inputFiles' directory and applies several preprocessing steps:
- - It renames the last column of each CSV file to 'label'.
- - It removes rows with null or infinite values.
- - It applies Min-Max scaling to numeric features while leaving non-numeric features unchanged.
- - It encodes categorical labels into numeric values.
- The preprocessed data is saved in a new directory named 'preprocessed', with each file's name suffixed with '_preprocessed'.
- Additionally, a separate directory named 'label' is created to store CSV files containing mappings of original labels to their encoded numeric values, with each file's name suffixed with '_labels'.
- The script ensures the creation of 'preprocessed' and 'label' directories if they do not already exist and provides console output to indicate the progress and completion of file processing.
- """
import logging
import os

import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

# Set up logging to both a log file and the console
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    handlers=[
        logging.FileHandler('preprocessing.log'),
        logging.StreamHandler()
    ]
)
# Create output directories for the preprocessed files and label encodings if they do not exist
os.makedirs('preprocessed', exist_ok=True)
os.makedirs('encoded_label', exist_ok=True)

# Get the list of all files in the inputFiles directory
input_files = os.listdir('inputFiles')

# Iterate over each file
for file_name in input_files:
    if not file_name.endswith('.csv'):  # Skip anything that is not a CSV
        continue

    logging.info(f"Processing file: {file_name}")
    file_path = os.path.join('inputFiles', file_name)

    # Read the CSV file
    df = pd.read_csv(file_path)

    # Rename the last column to 'label'
    df.rename(columns={df.columns[-1]: 'label'}, inplace=True)

    # Remove rows containing null or infinite values
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    df.dropna(inplace=True)

    # Separate features and label
    features = df.drop('label', axis=1)
    labels = df['label']

    # Apply Min-Max scaling to numeric features only; non-numeric features pass through unchanged
    numeric_features = features.select_dtypes(include=[np.number])
    non_numeric_features = features.select_dtypes(exclude=[np.number])
    scaler = MinMaxScaler()
    scaled_numeric = pd.DataFrame(
        scaler.fit_transform(numeric_features),
        columns=numeric_features.columns,
    )

    # Recombine scaled numeric features with the untouched non-numeric features.
    # Using pd.concat instead of np.hstack keeps numeric columns numeric rather
    # than coercing the whole array to object dtype.
    processed_data = pd.concat(
        [scaled_numeric, non_numeric_features.reset_index(drop=True)],
        axis=1,
    )

    # Encode categorical labels into numeric values
    label_encoder = LabelEncoder()
    processed_data['label'] = label_encoder.fit_transform(labels)

    # Save the processed dataset to a CSV file in the preprocessed directory
    preprocessed_file_path = os.path.join('preprocessed', file_name.replace('.csv', '_preprocessed.csv'))
    processed_data.to_csv(preprocessed_file_path, index=False)

    # Save the original-label -> encoded-label mapping to the encoded_label directory
    label_file_path = os.path.join('encoded_label', file_name.replace('.csv', '_encoded_labels.csv'))
    label_mappings = pd.DataFrame({
        'OriginalLabel': label_encoder.classes_,
        'EncodedLabel': range(len(label_encoder.classes_)),
    })
    label_mappings.to_csv(label_file_path, index=False)

    logging.info(f"Processed dataset has been saved to {preprocessed_file_path}")
    logging.info(f"Label encodings have been saved to {label_file_path}")