- """
- This script processes all CSV files within the 'inputFiles' directory and applies several preprocessing steps:
- - It renames the last column of each CSV file to 'label'.
- - It removes rows with null or infinite values.
- - It applies Min-Max scaling to numeric features while leaving non-numeric features unchanged.
- - It encodes categorical labels into numeric values.
- The preprocessed data is saved in a new directory named 'preprocessed', with each file's name suffixed with '_preprocessed'.
- Additionally, a separate directory named 'label' is created to store CSV files containing mappings of original labels to their encoded numeric values, with each file's name suffixed with '_labels'.
- The script ensures the creation of 'preprocessed' and 'label' directories if they do not already exist and provides console output to indicate the progress and completion of file processing.
- """
import logging
import os

import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

# Set up logging to both a log file and the console
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    handlers=[
        logging.FileHandler('preprocessing.log'),
        logging.StreamHandler()
    ]
)
# Create output directories for the preprocessed files and label encodings if they do not exist
os.makedirs('preprocessed', exist_ok=True)
os.makedirs('encoded_label', exist_ok=True)

# Get the list of all files in the inputFiles directory
input_files = os.listdir('inputFiles')

# Iterate over each file
for file_name in input_files:
    if not file_name.endswith('.csv'):  # Skip anything that is not a CSV
        continue

    logging.info(f"Processing file: {file_name}")
    file_path = os.path.join('inputFiles', file_name)

    # Read the CSV file
    df = pd.read_csv(file_path)

    # Rename the last column to 'label'
    df.rename(columns={df.columns[-1]: 'label'}, inplace=True)

    # Remove rows containing null or infinite values
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    df.dropna(inplace=True)

    # Separate features and label
    features = df.drop('label', axis=1)
    labels = df['label']

    # Apply Min-Max scaling to numeric features only; non-numeric features pass through unchanged
    numeric_features = features.select_dtypes(include=[np.number])
    non_numeric_features = features.select_dtypes(exclude=[np.number])
    scaler = MinMaxScaler()
    scaled_numeric = pd.DataFrame(
        scaler.fit_transform(numeric_features),
        columns=numeric_features.columns,
    )

    # Recombine scaled numeric features with the untouched non-numeric features.
    # Using pd.concat instead of np.hstack keeps numeric columns numeric rather
    # than coercing the whole array to object dtype.
    processed_data = pd.concat(
        [scaled_numeric, non_numeric_features.reset_index(drop=True)],
        axis=1,
    )

    # Encode categorical labels into numeric values
    label_encoder = LabelEncoder()
    processed_data['label'] = label_encoder.fit_transform(labels)

    # Save the processed dataset to a CSV file in the preprocessed directory
    preprocessed_file_path = os.path.join('preprocessed', file_name.replace('.csv', '_preprocessed.csv'))
    processed_data.to_csv(preprocessed_file_path, index=False)

    # Save the original-label -> encoded-label mapping to the encoded_label directory
    label_file_path = os.path.join('encoded_label', file_name.replace('.csv', '_encoded_labels.csv'))
    label_mappings = pd.DataFrame({
        'OriginalLabel': label_encoder.classes_,
        'EncodedLabel': range(len(label_encoder.classes_)),
    })
    label_mappings.to_csv(label_file_path, index=False)

    logging.info(f"Processed dataset has been saved to {preprocessed_file_path}")
    logging.info(f"Label encodings have been saved to {label_file_path}")