Advertisement
Not a member of Pastebin yet?
Sign up — it unlocks many cool features!
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn

# Disabled demo, neutralized inside a bare triple-quoted string (acts as a
# block comment and has no runtime effect): builds a tiny labelled DataFrame
# and renders it as an annotated 3-D scatter.  Kept verbatim so it can be
# re-enabled by deleting the quotes.
'''
# Create a DataFrame
X = [1, 0, -1, 0, -1, 1]
Y = [0, 1, 1, -1, 0, -1]
Z = [-1, -1, 0, 1, 1, 0]
labels = ["x1", "x2", "x3", "x4", "x5", "x6"]
pdata = pd.DataFrame({"X": X, "Y": Y, "Z": Z}, index=labels)
print("pdata = ")
print(pdata)
print()
# Plots - 3D scatter
fig = plt.figure()
ax = fig.add_subplot(projection='3d')
ax.scatter(pdata.X, pdata.Y, pdata.Z)
for i in range(len(pdata.index)):
    ax.text(pdata.loc[labels[i], "X"], pdata.loc[labels[i], "Y"], pdata.loc[labels[i], "Z"], '%s' % (str(labels[i])), size=20, zorder=1)
ax.set_xlabel('X Axis')
ax.set_ylabel('Y Axis')
ax.set_zlabel('Z Axis')
plt.show()
'''
# Read Data
# Load the engineers dataset (CSV with a header row) from beside the script,
# then echo it and its summary statistics.
engdata = pd.read_csv("./engdata.txt")
print("engdata = ")
print(engdata)
print()
print("engdata summary = ")
print(engdata.describe())
print()
# Find the Class
# "Location" is the class label: keep it aside as a Series, then drop the
# column so engdata holds only the numeric features.
location = engdata.Location
engdata = engdata.drop(["Location"], axis=1)
# Plot data with different colors
# Age vs Salary, one colour/marker per class.  The boolean masks built from
# `location` still align with engdata rows because both share the default
# RangeIndex produced by read_csv.
plt.figure()
plt.scatter(engdata[location == "EU"].Age, engdata[location == "EU"].Salary, color="red", marker="+", label="EU")
plt.scatter(engdata[location == "US"].Age, engdata[location == "US"].Salary, color="blue", marker="o", label="US")
plt.title("Age - Salary")
plt.xlabel("Age")
plt.ylabel("Salary")
plt.legend()
plt.show()
# Pairwise Pearson correlation between the remaining numeric feature
# columns, used to argue that WorkExp is nearly redundant with Age.
print("Correlation Matrix (between 2 features) = ")
print(engdata.corr())
print()
print("We observe that Age and WorkExp have a high correlation.")
# Fixed typo in the printed message: "correalation" -> "correlation".
print("This means that also the correlation Age-FeatureX and WorkExp-FeatureX have approximately the same value.")
print("We could drop the WorkExp column, but we will continue with 4 features.")
print()
print()
# PCA: X features ---> X eigenvalues and eigenvectors. NEEDS A SCALER
# PCA is variance-based, so every feature is first standardized (zero mean,
# unit variance); otherwise large-magnitude columns would dominate the
# components.
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
# Scaler
scaler = StandardScaler()
scaler = scaler.fit(engdata)  # fit() returns the fitted scaler itself
transformed = pd.DataFrame(scaler.transform(engdata), columns=engdata.columns)
# PCA - 4 components
pca = PCA()  # no n_components given: keep every component
pca = pca.fit(transformed)
pca_transformed = pca.transform(transformed)
# Eigen
# explained_variance_ holds the eigenvalues of the covariance matrix;
# components_ holds the matching eigenvectors, one per row.
eigenvalues = pca.explained_variance_
eigenvectors = pca.components_
print("**************************************************************")
print("Eigenvalues = " + str(eigenvalues))
print("Eigenvectors = ")
print(eigenvectors)
print("**************************************************************")
print()
# Principal Components
# Scree plot: one bar per principal component, showing its share (%) of the
# total variance for the full 4-component decomposition.
component_ids = range(1, len(eigenvalues) + 1)
variance_share_pct = 100 * eigenvalues / eigenvalues.sum()
plt.figure()
plt.bar(component_ids, variance_share_pct)
plt.title("Principal components with 4 features")
plt.xlabel("Eigenvalue #")
plt.ylabel("Component's weight (%)")
plt.show()
# PCA - 2 components
# Refit PCA keeping only the first two components and plot the projected
# data.  `pca` and `pca_transformed` are rebound here and reused by the
# inverse-transform section further down.
pca = PCA(n_components=2)
projected = pca.fit_transform(transformed)
pca_transformed = pd.DataFrame(projected)
plt.figure()
plt.scatter(pca_transformed.iloc[:, 0], pca_transformed.iloc[:, 1])
plt.title("2-D data (from 4-D data)")
plt.xlabel("New var 1")
plt.ylabel("New var 2")
plt.show()
# PCA - 2 components - Go back and compare the 2-D plots
# Map the 2-D scores back into the original 4-feature space; column names
# are restored from engdata.
pca_inverse = pd.DataFrame(pca.inverse_transform(pca_transformed), columns=engdata.columns)
plt.figure()
# NOTE(review): this scatter re-plots pca_transformed, producing a figure
# identical to the previous one, while the title talks about the inversed
# data.  It looks like a copy-paste slip and was probably meant to use
# columns of pca_inverse — confirm the intent before changing it.
plt.scatter(pca_transformed.loc[:, 0], pca_transformed.loc[:, 1])
plt.title("Inversed 2-D data (from 4-D data) can now be depicted")
plt.xlabel("New var 1")
plt.ylabel("New var 2")
plt.show()
print()
print()
print("transformed = ")
print(transformed)
print()
print("inversed (having kept 2-D instead of 4-D) = ")
print(pca_inverse)
print()
# New plot with new variables Age' and Salary'
# Same Age-vs-Salary scatter as at the start, but on the reconstructed
# (inverse-transformed) features.  The `location` masks align with
# pca_inverse rows because both carry a default RangeIndex.
plt.figure()
plt.scatter(pca_inverse[(location == "EU")].Age, pca_inverse[(location == "EU")].Salary, c="red", marker="+", label="EU")
plt.scatter(pca_inverse[(location == "US")].Age, pca_inverse[(location == "US")].Salary, c="blue", marker="o", label="US")
plt.title("Age' - Salary'")
plt.xlabel("Age'")
plt.ylabel("Salary'")
plt.legend()
plt.show()
# Info loss
# Variance carried by the 3rd and 4th components (the two discarded when
# keeping n_components=2), as a fraction of the total variance.
discarded_variance = eigenvalues[2] + eigenvalues[3]
info_loss = discarded_variance / sum(eigenvalues)
print(f"Info loss = {100 * info_loss} %")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement