Advertisement
mayankjoin3

step_01_four_digit_float_pre-processing

Feb 12th, 2025
91
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 3.30 KB | None | 0 0
  1. import logging
  2. import os
  3. import numpy as np
  4. import pandas as pd
  5. from sklearn.preprocessing import MinMaxScaler, LabelEncoder
  6.  
  7. # Set up logging
  8. logging.basicConfig(
  9.     level=logging.INFO,
  10.     format='%(asctime)s - %(levelname)s - %(message)s',
  11.     datefmt='%Y-%m-%d %H:%M:%S',
  12.     handlers=[
  13.         logging.FileHandler('preprocessing.log'),
  14.         logging.StreamHandler()
  15.     ]
  16. )
  17.  
  18. # Create directories for the preprocessed files and label encodings if they do not exist
  19. os.makedirs('preprocessed', exist_ok=True)
  20. os.makedirs('encoded_label', exist_ok=True)
  21.  
  22. # Get the list of all files in the inputFiles directory
  23. input_files = os.listdir('inputFiles')
  24.  
  25. # Iterate over each file
  26. for file_name in input_files:
  27.     if file_name.endswith('.csv'):  # Check if the file is a CSV
  28.         logging.info(f"Processing file: {file_name}")
  29.         file_path = os.path.join('inputFiles', file_name)
  30.        
  31.         # Read the CSV file
  32.         df = pd.read_csv(file_path)
  33.  
  34.         # Rename the last column to 'label'
  35.         df.rename(columns={df.columns[-1]: 'label'}, inplace=True)
  36.  
  37.         # Remove null and infinity values
  38.         df.replace([np.inf, -np.inf], np.nan, inplace=True)
  39.         df.dropna(inplace=True)
  40.  
  41.         # Separate features and label
  42.         features = df.drop('label', axis=1)
  43.         labels = df['label']
  44.  
  45.         # Apply Min-Max Scaling to numeric features only
  46.         numeric_features = features.select_dtypes(include=[np.number])
  47.         scaler = MinMaxScaler()
  48.         scaled_numeric_features = scaler.fit_transform(numeric_features)
  49.  
  50.         # Convert scaled numeric features back to DataFrame with original column names
  51.         scaled_numeric_df = pd.DataFrame(scaled_numeric_features, columns=numeric_features.columns)
  52.  
  53.         # Round float values to 4 decimal places while keeping integers unchanged
  54.         for col in scaled_numeric_df.columns:
  55.             if np.issubdtype(scaled_numeric_df[col].dtype, np.floating):
  56.                 scaled_numeric_df[col] = scaled_numeric_df[col].round(4)
  57.  
  58.         # Extract non-numeric features
  59.         non_numeric_features = features.select_dtypes(exclude=[np.number])
  60.        
  61.         # Concatenate processed numeric and non-numeric features
  62.         processed_features = pd.concat([scaled_numeric_df, non_numeric_features], axis=1)
  63.  
  64.         # Encode labels
  65.         label_encoder = LabelEncoder()
  66.         encoded_labels = label_encoder.fit_transform(labels)
  67.         processed_features['label'] = encoded_labels
  68.  
  69.         # Save the processed dataset to a CSV file in the preprocessed directory
  70.         preprocessed_file_path = os.path.join('preprocessed', file_name.replace('.csv', '_preprocessed.csv'))
  71.         processed_features.to_csv(preprocessed_file_path, index=False)
  72.  
  73.         # Save the label encodings to a CSV file in the label directory
  74.         label_file_path = os.path.join('encoded_label', file_name.replace('.csv', '_encoded_labels.csv'))
  75.         label_mappings = pd.DataFrame(list(label_encoder.classes_), columns=['OriginalLabel'])
  76.         label_mappings['EncodedLabel'] = label_mappings.index
  77.         label_mappings.to_csv(label_file_path, index=False)
  78.  
  79.         logging.info(f"Processed dataset has been saved to {preprocessed_file_path}")
  80.         logging.info(f"Label encodings have been saved to {label_file_path}")
  81.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement