# step_01_four_digit_float_pre-processing

import logging
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

# Configure the root logger once for the whole script: INFO and above are
# written both to 'preprocessing.log' and to the console, with a timestamp
# and level prefix on each line.
log_file_handler = logging.FileHandler('preprocessing.log')
console_handler = logging.StreamHandler()
logging.basicConfig(
    handlers=[log_file_handler, console_handler],
    datefmt='%Y-%m-%d %H:%M:%S',
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Make sure both output folders are present before anything is written;
# exist_ok=True turns this into a no-op when they already exist.
for output_dir in ('preprocessed', 'encoded_label'):
    os.makedirs(output_dir, exist_ok=True)

# Collect the names of every entry in the inputFiles directory
input_files = os.listdir('inputFiles')

# Preprocess every CSV in inputFiles independently. For each file:
#   1. treat the last column as the label,
#   2. drop rows containing null or infinite values,
#   3. min-max scale the numeric feature columns and round them to 4 decimals,
#   4. integer-encode the label column,
#   5. write the processed table to 'preprocessed/' and the label mapping to
#      'encoded_label/'.
# The scaler and the label encoder are fitted per file, so scaling ranges and
# label codes are not shared between files.
for file_name in input_files:
    # Only CSV files are processed; any other directory entry is ignored.
    if not file_name.endswith('.csv'):
        continue

    logging.info(f"Processing file: {file_name}")
    file_path = os.path.join('inputFiles', file_name)

    # Read the CSV file
    df = pd.read_csv(file_path)

    # Rename the last column to 'label'
    df.rename(columns={df.columns[-1]: 'label'}, inplace=True)

    # Remove null and infinity values. The index is reset afterwards so that
    # every frame derived from df shares the same contiguous index; without
    # this, the column-wise concat below aligns on mismatched indexes and
    # mis-pairs rows (or produces extra NaN rows) whenever a row was dropped.
    df = df.replace([np.inf, -np.inf], np.nan).dropna().reset_index(drop=True)

    # Nothing left to scale or encode: skip this file instead of letting the
    # scaler raise and abort the remaining files.
    if df.empty:
        logging.warning(
            f"Skipping {file_name}: no rows remain after removing null/infinite values"
        )
        continue

    # Separate features and label
    features = df.drop('label', axis=1)
    labels = df['label']

    numeric_features = features.select_dtypes(include=[np.number])
    non_numeric_features = features.select_dtypes(exclude=[np.number])

    if numeric_features.shape[1] > 0:
        # Apply Min-Max Scaling to numeric features only. The scaler returns
        # floating-point values for every column (integer columns included),
        # so the whole result is rounded to 4 decimal places.
        scaler = MinMaxScaler()
        scaled_numeric_df = pd.DataFrame(
            scaler.fit_transform(numeric_features),
            columns=numeric_features.columns,
            index=numeric_features.index,  # keep rows aligned with the other columns
        ).round(4)
    else:
        # No numeric columns: the scaler would reject a zero-column input,
        # so pass the (empty) selection through unchanged.
        scaled_numeric_df = numeric_features

    # Concatenate processed numeric and non-numeric features
    # (numeric columns first, then the untouched non-numeric ones)
    processed_features = pd.concat([scaled_numeric_df, non_numeric_features], axis=1)

    # Encode labels as integers and append them as the final column
    label_encoder = LabelEncoder()
    processed_features['label'] = label_encoder.fit_transform(labels)

    # Strip only the trailing '.csv' extension; str.replace would also rewrite
    # any earlier '.csv' occurrence inside the file name.
    base_name = file_name[:-len('.csv')]

    # Save the processed dataset to a CSV file in the preprocessed directory
    preprocessed_file_path = os.path.join('preprocessed', f"{base_name}_preprocessed.csv")
    processed_features.to_csv(preprocessed_file_path, index=False)

    # Save the label encodings to a CSV file in the label directory:
    # one row per original label, with its position in classes_ as the code
    label_file_path = os.path.join('encoded_label', f"{base_name}_encoded_labels.csv")
    label_mappings = pd.DataFrame(list(label_encoder.classes_), columns=['OriginalLabel'])
    label_mappings['EncodedLabel'] = label_mappings.index
    label_mappings.to_csv(label_file_path, index=False)

    logging.info(f"Processed dataset has been saved to {preprocessed_file_path}")
    logging.info(f"Label encodings have been saved to {label_file_path}")