Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import numpy as np
- import pandas as pd
- import matplotlib.pyplot as plt
- import sklearn
# Load the raw dataset and keep only the two columns analysed below.
engdata = pd.read_csv("./engdata.txt")
pdata = engdata[["Age", "Salary"]]  # feature matrix, i.e. 'X'

# Show the selected data, then its descriptive statistics, between separators.
print("********************************************")
print("pdata: ", pdata, sep="\n")
print()
print("pdata summary: ", pdata.describe(), sep="\n")
print("********************************************")
print()
# Preprocessing and cleansing data
# 1. Remove repeated rows, reporting the row count before and after.
print(f"Data length with duplicates = {len(pdata)}")
pdata = pdata.drop_duplicates()
print(f"Data length without duplicates = {len(pdata)}")
print()
# 2. Standardise each column: subtract its mean and divide by its standard
#    deviation, so every feature ends up with mean 0 and std 1.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
# fit_transform() returns a bare ndarray, so the row and column labels must be
# supplied explicitly. Re-using pdata's own index matters: drop_duplicates()
# leaves gaps in the index, and a fresh 0..n-1 index would no longer line up
# row-by-row with pdata when the two frames are compared or joined.
transformed = pd.DataFrame(
    scaler.fit_transform(pdata),
    index=pdata.index,
    columns=pdata.columns,
)

print("********************************************")
print("transformed pdata: ")
print(transformed)
print()
# NOTE: describe() reports the sample std (ddof=1) while StandardScaler divides
# by the population std (ddof=0), so the printed std is slightly above 1.
print("transformed pdata summary: ")
print(transformed.describe())
print("********************************************")
print()
# Side-by-side scatter plots: raw data on the left, standardised on the right.
fig = plt.figure()

left = fig.add_subplot(1, 2, 1)
left.set_title("Initial Data")
left.scatter(pdata["Age"], pdata["Salary"])

right = fig.add_subplot(1, 2, 2)
right.set_title("Transformed Data (mean' = 0, std' = 1)")
right.scatter(transformed["Age"], transformed["Salary"])

plt.show()
# Sampling: draw n rows with replacement, using a fixed seed so the draw is
# reproducible.
n = 150
data_sample = pdata.sample(n=n, replace=True, random_state=1)

# Compare the full data set (left) with the drawn sample (right).
fig = plt.figure()

left = fig.add_subplot(1, 2, 1)
left.set_title(f"Initial Data with length = {len(pdata)}")
left.scatter(pdata["Age"], pdata["Salary"])

right = fig.add_subplot(1, 2, 2)
right.set_title(f"Sampled Data with length = {n}")
right.scatter(data_sample["Age"], data_sample["Salary"])

plt.show()
# Discretization: bucket each numeric column into fixed-width intervals.
age_edges = [0, 10, 20, 30, 40, 50, 60, 70, 80]  # decade boundaries
salary_bins = pd.interval_range(start=0, end=4000, freq=400)

discAge = pd.cut(pdata["Age"], bins=age_edges)
discSalary = pd.cut(pdata["Salary"], bins=salary_bins)

print("********************************************")
print("discAge = ", discAge, sep="\n")
print()
print("discSalary = ", discSalary, sep="\n")
print("********************************************")
print()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement