makispaiktis

ML - Lab 6 - PCA

Oct 20th, 2022 (edited)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn

'''
# Create a DataFrame
X = [1, 0, -1, 0, -1, 1]
Y = [0, 1, 1, -1, 0, -1]
Z = [-1, -1, 0, 1, 1, 0]
labels = ["x1", "x2", "x3", "x4", "x5", "x6"]
pdata = pd.DataFrame({"X": X, "Y": Y, "Z": Z}, index=labels)
print("pdata = ")
print(pdata)
print()


# Plots - 3D scatter
fig = plt.figure()
ax = fig.add_subplot(projection='3d')
ax.scatter(pdata.X, pdata.Y, pdata.Z)
for i in range(len(pdata.index)):
    ax.text(pdata.loc[labels[i], "X"], pdata.loc[labels[i], "Y"], pdata.loc[labels[i], "Z"], '%s' % (str(labels[i])), size=20, zorder=1)
ax.set_xlabel('X Axis')
ax.set_ylabel('Y Axis')
ax.set_zlabel('Z Axis')
plt.show()
'''


# Read Data
engdata = pd.read_csv("./engdata.txt")
print("engdata = ")
print(engdata)
print()
print("engdata summary = ")
print(engdata.describe())
print()
# Find the Class
location = engdata.Location
engdata = engdata.drop(["Location"], axis=1)

# Plot data with different colors
plt.figure()
plt.scatter(engdata[location == "EU"].Age, engdata[location == "EU"].Salary, color="red", marker="+", label="EU")
plt.scatter(engdata[location == "US"].Age, engdata[location == "US"].Salary, color="blue", marker="o", label="US")
plt.title("Age - Salary")
plt.xlabel("Age")
plt.ylabel("Salary")
plt.legend()
plt.show()
print("Correlation Matrix (between 2 features) = ")
print(engdata.corr())
print()
print("We observe that Age and WorkExp have a high correlation.")
print("This means that the correlations Age-FeatureX and WorkExp-FeatureX also have approximately the same values.")
print("We could drop the WorkExp column, but we will continue with 4 features.")
print()
print()
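# Sketch only (illustration, not used below): dropping the redundant column
# would look like this. `engdata_reduced` is a hypothetical name; the lab
# itself keeps all 4 features.
engdata_reduced = engdata.drop(["WorkExp"], axis=1)
print("engdata_reduced columns = " + str(list(engdata_reduced.columns)))
print()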


# PCA: X features ---> X eigenvalues and eigenvectors. NEEDS A SCALER
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
# Scaler
scaler = StandardScaler()
scaler = scaler.fit(engdata)
transformed = pd.DataFrame(scaler.transform(engdata), columns=engdata.columns)
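# Sanity-check sketch: StandardScaler standardizes each column to zero mean
# and unit (population) variance, so the scaled columns should show
# mean ~ 0 and std(ddof=0) ~ 1 up to floating-point noise.
print("Scaled means = ")
print(transformed.mean().round(6))
print("Scaled stds (ddof=0) = ")
print(transformed.std(ddof=0).round(6))
print()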


# PCA - 4 components
pca = PCA()
pca = pca.fit(transformed)
pca_transformed = pca.transform(transformed)
# Eigen
eigenvalues = pca.explained_variance_
eigenvectors = pca.components_
print("**************************************************************")
print("Eigenvalues = " + str(eigenvalues))
print("Eigenvectors = ")
print(eigenvectors)
print("**************************************************************")
print()
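# Cross-check sketch: explained_variance_ equals the eigenvalues of the
# sample covariance matrix of the scaled data (np.cov and sklearn both use
# the ddof=1 convention), so the two computations should agree.
cov = np.cov(transformed.T)
print("Covariance eigenvalues (descending) = " + str(np.sort(np.linalg.eigvalsh(cov))[::-1]))
print()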
# Principal Components
plt.figure()
plt.bar(range(1, len(eigenvalues)+1), 100 * eigenvalues/sum(eigenvalues))
plt.title("Principal components with 4 features")
plt.xlabel("Eigenvalue #")
plt.ylabel("Component's weight (%)")
plt.show()
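# Optional sketch: the cumulative explained variance makes it easy to see
# how many components are needed to retain a given share of the variance.
cumulative = np.cumsum(100 * eigenvalues / sum(eigenvalues))
print("Cumulative explained variance (%) = " + str(cumulative))
print()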


# PCA - 2 components
pca = PCA(n_components=2)
pca_transformed = pd.DataFrame(pca.fit_transform(transformed))
plt.figure()
plt.scatter(pca_transformed.loc[:, 0], pca_transformed.loc[:, 1])
plt.title("2-D data (from 4-D data)")
plt.xlabel("New var 1")
plt.ylabel("New var 2")
plt.show()
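# The 2-component fit also reports the retained variance directly, which
# should match the sum of the first two bars in the chart above.
print("Variance retained by 2 components (%) = " + str(100 * pca.explained_variance_ratio_.sum()))
print()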


# PCA - 2 components - Go back (inverse transform) and compare the 2-D plots.
# Note: the inverse-transformed data lives in the original 4-D feature space,
# so the 2-D scores are re-plotted here for side-by-side comparison.
pca_inverse = pd.DataFrame(pca.inverse_transform(pca_transformed), columns=engdata.columns)
plt.figure()
plt.scatter(pca_transformed.loc[:, 0], pca_transformed.loc[:, 1])
plt.title("2-D scores (their 4-D inverse transform is printed below)")
plt.xlabel("New var 1")
plt.ylabel("New var 2")
plt.show()
print()
print()
print("transformed = ")
print(transformed)
print()
print("inverse-transformed (having kept 2 of the 4 components) = ")
print(pca_inverse)
print()
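# Reconstruction-error sketch: the mean squared difference between the
# scaled data and its 2-component reconstruction quantifies what the
# dropped components threw away.
mse = ((transformed - pca_inverse) ** 2).mean().mean()
print("Reconstruction MSE = " + str(mse))
print()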


# New plot with new variables Age' and Salary'
plt.figure()
plt.scatter(pca_inverse[(location == "EU")].Age, pca_inverse[(location == "EU")].Salary, c="red", marker="+", label="EU")
plt.scatter(pca_inverse[(location == "US")].Age, pca_inverse[(location == "US")].Salary, c="blue", marker="o", label="US")
plt.title("Age' - Salary'")
plt.xlabel("Age'")
plt.ylabel("Salary'")
plt.legend()
plt.show()

# Info loss
info_loss = (eigenvalues[2] + eigenvalues[3]) / sum(eigenvalues)
print("Info loss = " + str(100 * info_loss) + " %")
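# Equivalent sketch using the 2-component fit's explained_variance_ratio_,
# which should reproduce the same percentage.
print("Info loss (via ratio) = " + str(100 * (1 - pca.explained_variance_ratio_.sum())) + " %")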