# Source: Pastebin paste by makispaiktis
# Title:  ML - Lab 6 - Preprocessing
# Date:   Oct 20th, 2022 (edited)
# Format: Python, 2.10 KB
  1. import numpy as np
  2. import pandas as pd
  3. import matplotlib.pyplot as plt
  4. import sklearn
  5.  
  6.  
  7. # Read data
  8. engdata = pd.read_csv("./engdata.txt")
  9. pdata = engdata.loc[:, ["Age", "Salary"]]       # Like 'X'
  10. print("********************************************")
  11. print("pdata: ")
  12. print(pdata)
  13. print()
  14. print("pdata summary: ")
  15. print(pdata.describe())
  16. print("********************************************")
  17. print()
  18.  
  19.  
  20. # Preprocessing and cleansing data
  21. # 1. Drop Duplicates
  22. print("Data length with duplicates = " + str(len(pdata)))
  23. pdata = pdata.drop_duplicates()
  24. print("Data length without duplicates = " + str(len(pdata)))
  25. print()
  26.  
  27. # 2. Subtract mean, divide by std (Zero-mean Data with std_new = 1)
  28. from sklearn.preprocessing import StandardScaler
  29. scaler = StandardScaler()
  30. scaler = scaler.fit(pdata)
  31. transformed = pd.DataFrame(scaler.transform(pdata), columns=["Age", "Salary"])
  32. print("********************************************")
  33. print("transformed pdata: ")
  34. print(transformed)
  35. print()
  36. print("transformed pdata summary: ")
  37. print(transformed.describe())
  38. print("********************************************")
  39. print()
  40.  
  41. plt.figure()
  42. plt.subplot(1, 2, 1)
  43. plt.title("Initial Data")
  44. plt.scatter(pdata["Age"], pdata["Salary"])
  45. plt.subplot(1, 2, 2)
  46. plt.title("Transformed Data (mean' = 0, std' = 1)")
  47. plt.scatter(transformed["Age"], transformed["Salary"])
  48. plt.show()
  49.  
  50.  
  51.  
  52. # Sampling
  53. n = 150
  54. data_sample = pdata.sample(n=n, random_state=1, replace=True)
  55.  
  56. plt.figure()
  57. plt.subplot(1, 2, 1)
  58. plt.title("Initial Data with length = " + str(len(pdata)))
  59. plt.scatter(pdata["Age"], pdata["Salary"])
  60. plt.subplot(1, 2, 2)
  61. plt.title("Sampled Data with length = " + str(n))
  62. plt.scatter(data_sample["Age"], data_sample["Salary"])
  63. plt.show()
  64.  
  65.  
  66.  
  67. # Discretization
  68. discAge = pd.cut(pdata.Age, [0, 10, 20, 30, 40, 50, 60, 70, 80])
  69. discSalary = pd.cut(pdata.Salary, pd.interval_range(start=0, freq=400, end=4000))
  70. print("********************************************")
  71. print("discAge = ")
  72. print(discAge)
  73. print()
  74. print("discSalary = ")
  75. print(discSalary)
  76. print("********************************************")
  77. print()
  78.  
# (End of paste; Pastebin page footer text omitted.)