Advertisement
mayankjoin3

Problematic Values nan inf drop from csv

Nov 13th, 2024
48
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.91 KB | None | 0 0
  1.  
  2. inputfile='Wednesday-workingHours.pcap_ISCX.csv'
  3.  
  4.  
  5. import pandas as pd
  6. import numpy as np
  7. import logging
  8.  
  9. # Set up logging to output to a file
  10. logging.basicConfig(filename='error_log.txt', level=logging.INFO, format='%(message)s')
  11.  
  12. # Load the CSV file
  13. try:
  14.     df = pd.read_csv(inputfile)
  15. except FileNotFoundError:
  16.     logging.error("Error: 'input.csv' not found.")
  17.     exit()
  18.  
  19. # Select only numeric columns to avoid issues with string data
  20. numeric_df = df.select_dtypes(include=[np.number])
  21.  
  22. # Identify problematic values
  23. infinity_mask = numeric_df.isin([np.inf, -np.inf])
  24. threshold = np.finfo(np.float64).max
  25. large_value_mask = numeric_df.abs() > threshold
  26.  
  27. # Combine masks for all problematic values
  28. problematic_mask = infinity_mask | large_value_mask
  29.  
  30. # Log problematic values
  31. if problematic_mask.any().any():
  32.     logging.info("Problematic values found in 'input.csv':")
  33.     problematic_indices = np.where(problematic_mask)
  34.     for row, col in zip(*problematic_indices):
  35.         logging.info(f"Row: {row}, Column: '{numeric_df.columns[col]}', Value: {numeric_df.iat[row, col]}")
  36. else:
  37.     logging.info("No infinity or extremely large values found in 'input.csv'.")
  38.  
  39. # Remove rows with any problematic values
  40. # First, replace `inf`, `-inf`, and extremely large values with NaN, then drop rows with NaN
  41. df.replace([np.inf, -np.inf], np.nan, inplace=True)
  42. numeric_columns = df.select_dtypes(include=[np.number])
  43. problematic_mask = (numeric_columns.abs() > threshold) | numeric_columns.isna()
  44. rows_to_drop = problematic_mask.any(axis=1)  # Identify rows to drop
  45.  
  46. # Drop the identified rows
  47. df_cleaned = df[~rows_to_drop]
  48.  
  49. # Save cleaned DataFrame to a new CSV file without problematic rows
  50. df_cleaned.to_csv('cleaned_output.csv', index=False)
  51. print("Data cleaning complete. Rows with problematic values have been removed. Output saved to 'cleaned_output.csv' and errors logged to 'error_log.txt'.")
  52.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement