Advertisement
mayankjoin3

step_01_large_float_pre-processing

Feb 12th, 2025
82
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 3.89 KB | None | 0 0
  1. """
  2. This script processes all CSV files within the 'inputFiles' directory and applies several preprocessing steps:
  3. - It renames the last column of each CSV file to 'label'.
  4. - It removes rows with null or infinite values.
  5. - It applies Min-Max scaling to numeric features while leaving non-numeric features unchanged.
  6. - It encodes categorical labels into numeric values.
  7.  
  8. The preprocessed data is saved in a new directory named 'preprocessed', with each file's name suffixed with '_preprocessed'.
  9. Additionally, a separate directory named 'label' is created to store CSV files containing mappings of original labels to their encoded numeric values, with each file's name suffixed with '_labels'.
  10.  
  11. The script ensures the creation of 'preprocessed' and 'label' directories if they do not already exist and provides console output to indicate the progress and completion of file processing.
  12. """
  13. import logging
  14.  
  15. # Set up logging
  16. logging.basicConfig(
  17.     level=logging.INFO,
  18.     format='%(asctime)s - %(levelname)s - %(message)s',
  19.     datefmt='%Y-%m-%d %H:%M:%S',
  20.     handlers=[
  21.         logging.FileHandler('preprocessing.log'),
  22.         logging.StreamHandler()
  23.     ]
  24. )
  25.  
  26. import os
  27. import numpy as np
  28. import pandas as pd
  29. from sklearn.preprocessing import MinMaxScaler, LabelEncoder
  30.  
  31. # Create directories for the preprocessed files and label encodings if they do not exist
  32. os.makedirs('preprocessed', exist_ok=True)
  33. os.makedirs('encoded_label', exist_ok=True)
  34.  
  35. # Get the list of all files in the inputFiles directory
  36. input_files = os.listdir('inputFiles')
  37.  
  38. # Iterate over each file
  39. for file_name in input_files:
  40.     if file_name.endswith('.csv'):  # Check if the file is a CSV
  41.         logging.info(f"Processing file: {file_name}")
  42.         file_path = os.path.join('inputFiles', file_name)
  43.        
  44.         # Read the CSV file
  45.         df = pd.read_csv(file_path)
  46.  
  47.         # Rename the last column to 'label'
  48.         df.rename(columns={df.columns[-1]: 'label'}, inplace=True)
  49.  
  50.         # Remove null and infinity values
  51.         df.replace([np.inf, -np.inf], np.nan, inplace=True)
  52.         df.dropna(inplace=True)
  53.  
  54.         # Separate features and label
  55.         features = df.drop('label', axis=1)
  56.         labels = df['label']
  57.  
  58.         # Apply Min-Max Scaling to numeric features only
  59.         numeric_features = features.select_dtypes(include=[np.number])
  60.         scaler = MinMaxScaler()
  61.         scaled_numeric_features = scaler.fit_transform(numeric_features)
  62.  
  63.         # Combine scaled numeric features with non-numeric features
  64.         non_numeric_features = features.select_dtypes(exclude=[np.number])
  65.         scaled_features = np.hstack((scaled_numeric_features, non_numeric_features))
  66.  
  67.         # Encode labels
  68.         label_encoder = LabelEncoder()
  69.         encoded_labels = label_encoder.fit_transform(labels)
  70.  
  71.         # Combine scaled features and encoded labels
  72.         feature_columns = numeric_features.columns.tolist() + non_numeric_features.columns.tolist()
  73.         processed_data = pd.DataFrame(scaled_features, columns=feature_columns)
  74.         processed_data['label'] = encoded_labels
  75.  
  76.         # Save the processed dataset to a CSV file in the preprocessed directory
  77.         preprocessed_file_path = os.path.join('preprocessed', file_name.replace('.csv', '_preprocessed.csv'))
  78.         processed_data.to_csv(preprocessed_file_path, index=False)
  79.  
  80.         # Save the label encodings to a CSV file in the label directory
  81.         label_file_path = os.path.join('encoded_label', file_name.replace('.csv', '_encoded_labels.csv'))
  82.         label_mappings = pd.DataFrame(list(label_encoder.classes_), columns=['OriginalLabel'])
  83.         label_mappings['EncodedLabel'] = label_mappings.index
  84.         label_mappings.to_csv(label_file_path, index=False)
  85.  
  86.         logging.info(f"Processed dataset has been saved to {preprocessed_file_path}")
  87.         logging.info(f"Label encodings have been saved to {label_file_path}")
  88.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement