Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from sklearn.cluster import KMeans
- from sklearn.decomposition import PCA
- import matplotlib.pyplot as plt
- from IPython.display import clear_output
- import pandas as pd
- import numpy as np
- """
- KMeans Cluster Algorithm steps:
- 1.Read data from csv
- 2.Prepare data - clean data and rename
- 3.Scaling/normalize data - min max scaling
- 4.Initialize random CENTROIDS
- 5.Label each data point - Calculate the distance between every data point and each CENTROID, then assign the nearest CENTROID
- 6.Update CENTROIDS - take the geometric mean of the data points assigned to each CENTROID
- 7.Repeat step 5 and 6 until CENTROIDS stop changing
- """
# The elbow method helps to pick the number of clusters for the K-Means algorithm.
def elbow_method1(scaled_data, max_clusters=11):
    """Plot K-Means inertia against the number of clusters.

    A scikit-learn ``KMeans`` model is fitted for every cluster count from 1
    to ``max_clusters`` and its ``inertia_`` is plotted, so the "elbow" of the
    curve can be read off the chart.

    Args:
        scaled_data: Feature data to cluster.
        max_clusters: Largest cluster count to try (inclusive). The default of
            11 reproduces the previously hard-coded ``range(1, 12)``.
    """
    cluster_counts = range(1, max_clusters + 1)
    # KMeans.fit() returns the fitted estimator, so inertia_ can be read
    # directly from the call chain.
    inertias = [
        KMeans(n_clusters=k, random_state=42).fit(scaled_data).inertia_
        for k in cluster_counts
    ]

    plt.title('Elbow Method for Optimal Clusters')
    plt.plot(cluster_counts, inertias, 'o-')
    plt.xlabel('Number of Clusters')
    plt.ylabel('Inertia')
    plt.grid(True)
    plt.show()
def rename_data(df):
    """Rename the raw CSV columns to the short names used by this script.

    The frame is modified in place and the same object is returned.
    """
    column_aliases = {
        'selling_price': 'price',
        'engine': 'engine_cc',
        'km_driven': 'kms',
        'max_power': 'horsepower',
    }
    df.rename(columns=column_aliases, inplace=True)
    return df
def clean_car_data(df):
    """Return a cleaned copy of ``df`` with numeric feature columns.

    * ``price`` is divided by 10.
    * ``engine_cc``, ``horsepower`` and ``mileage`` are text columns; the
      first number found in each value is extracted and converted to float.
      Values without any number become NaN.

    The input frame is not modified.
    """
    cleaned_df = df.copy()
    # Keep an optional fractional part: the previous pattern '([0-9]+)'
    # stopped at the decimal point, so '103.52 bhp' was truncated to 103.0.
    number_pattern = r'([0-9]+(?:\.[0-9]+)?)'

    cleaned_df['price'] = cleaned_df['price'].apply(lambda price: price / 10)
    for column in ('engine_cc', 'horsepower', 'mileage'):
        # expand=False returns a Series (one capture group) instead of a
        # one-column DataFrame.
        cleaned_df[column] = (
            cleaned_df[column].str.extract(number_pattern, expand=False).astype(float)
        )
    return cleaned_df
# Load the listing, switch to the short column names, drop every row that
# misses one of the clustering features, then clean a copy of those features.
features = ['year', 'price', 'kms', 'engine_cc', 'horsepower', 'seats', 'mileage']
cars = rename_data(pd.read_csv('selling_cars_list.csv'))
cars = cars.dropna(subset=features)
data = cars[features].copy()
car_data = clean_car_data(data)

# Sample of the cleaned data (excerpt of the columns):
#
#      year     price      kms      engine     hp
# ------------------------------------------------------------
# 0    2014  |  45000.0  | 145500  |  1248.0  |  74.0
# 1    2014  |  37000.0  | 120000  |  1498.0  |  103.0
# 2    2006  |  15800.0  | 140000  |  1497.0  |  78.0
# 3    2010  |  22500.0  | 127000  |  1396.0  |  90.0
# 4    2007  |  13000.0  | 120000  |  1298.0  |  88.0
# -------------------------------------------------------------
# Min-max scaling into the range 1..10.
# The scaled values must be strictly positive (no 0 and no negatives) because
# new_centroids() takes np.log of the data; ranges such as (0.1 to 1),
# (1 to 10) or (10 to 100) all work.
data = ((car_data - car_data.min()) / (car_data.max() - car_data.min())) * 9 + 1
# Alternative scaling into 0.1..1:
# data = ((car_data - car_data.min()) / (car_data.max() - car_data.min())) * 0.9 + 0.1

# Run the elbow method on the SCALED data. Passing the unscaled car_data (as
# before) lets the columns with the largest raw magnitudes dominate the inertia.
elbow_method1(data)

# Sample of the scaled data (excerpt of the columns):
#
#      year         price        kms         engine        hp
# ------------------------------------------------------------------------
# 0    7.923077  |  1.379138  |  1.554762  |  2.884564  |  2.027174
# 1    7.923077  |  1.306922  |  1.457535  |  3.639597  |  2.736413
# 2    5.153846  |  1.115548  |  1.533791  |  3.636577  |  2.125000
# 3    6.538462  |  1.176029  |  1.484225  |  3.331544  |  2.418478
# 4    5.500000  |  1.090272  |  1.457535  |  3.035570  |  2.369565
# ------------------------------------------------------------------------

print(f'\nCleaned data: \n', car_data.head())
print(f'\nScaled data: \n', data.head())
def create_random_centroids(data, k):
    """Build ``k`` random starting centroids from the values present in ``data``.

    For every centroid, one random value is drawn independently from each
    column of ``data``.

    Returns:
        DataFrame with one row per column of ``data`` and one column per
        centroid (labelled 0 .. k-1).
    """
    def pick_random_value(column):
        # sample() draws one random entry of the column.
        return float(column.sample().iloc[0])

    # Each apply() yields a Series (one value per feature); concatenating the
    # Series side by side turns them into the columns of a DataFrame.
    centroid_series = [data.apply(pick_random_value) for _ in range(k)]
    return pd.concat(centroid_series, axis=1)
- # K = Number of Clusters
- # k = 6
- # centroids = create_random_centroids(data, k)
- # print(f'\nCentroids: ')
- # print(centroids)
- # Finding distance between data points and centroid using Pythagorean theorem
- # distance = np.sqrt((data - centroids.iloc[:, 0]) ** 2).sum(axis=1)
def find_labels(data, centroids):
    """Assign every row of ``data`` to its nearest centroid.

    Args:
        data: DataFrame with one row per data point and one column per feature.
        centroids: DataFrame with one column per centroid, indexed by the
            feature names (the layout produced by ``create_random_centroids``).

    Returns:
        Series indexed like ``data`` holding, for each row, the column label
        of the closest centroid.
    """
    # Euclidean distance: sum the squared differences first, THEN take the
    # square root. Taking the root before summing (as the code did before)
    # reduces to the sum of absolute differences, i.e. Manhattan distance.
    distances = centroids.apply(
        lambda centroid: np.sqrt(((data - centroid) ** 2).sum(axis=1))
    )
    return distances.idxmin(axis=1)
- # labels = find_labels(data, centroids)
# Recompute the centroids as the geometric mean of the points in each cluster.
def new_centroids(data, labels):
    """Return updated centroids: the per-cluster geometric mean of every feature.

    Args:
        data: DataFrame of data points (the values are passed to ``np.log``).
        labels: Cluster label of every row of ``data``.

    Returns:
        DataFrame with one row per feature and one column per cluster label.
    """
    def geometric_mean(cluster_points):
        # exp(mean(log(x))) is the geometric mean of x.
        return np.exp(np.log(cluster_points).mean())

    per_cluster = data.groupby(labels).apply(geometric_mean)
    # Transpose so that centroids are columns, matching create_random_centroids.
    return per_cluster.T
# Visualisation
#
# PCA - Principal Component Analysis.
# The PCA class transforms the multi-dimensional feature data into
# 2-dimensional data, which is much easier to display.
def plot_clusters(data, labels, centroids, iteration):
    """Draw the data points and centroids of one K-Means iteration in 2D.

    Args:
        data: Feature data, one row per data point.
        labels: Cluster label per row of ``data``; used to colour the points.
        centroids: Centroids stored as columns (they are transposed before
            being projected).
        iteration: Iteration number shown in the plot title.
    """
    projector = PCA(n_components=2)
    # Fit the projection on the data points, then reuse the same projection
    # for the centroids so both end up in the same 2D space.
    points_2d = projector.fit_transform(data)
    centers_2d = projector.transform(centroids.T)

    clear_output(wait=True)
    plt.title(f'Iteration {iteration}')
    plt.scatter(x=points_2d[:, 0], y=points_2d[:, 1], c=labels)
    plt.scatter(x=centers_2d[:, 0], y=centers_2d[:, 1], color='red', s=75)
    plt.show()
def k_means_algorithm(max_iterations=50, centroid_count=5):
    """Cluster the module-level scaled ``data`` with a hand-written K-Means loop.

    Starting from random centroids, each pass labels every row with its
    nearest centroid, recomputes the centroids and plots the current state.
    The loop stops when the centroids no longer change or after
    ``max_iterations`` passes.

    Args:
        max_iterations: Maximum number of label/update passes to run.
        centroid_count: Number of random starting centroids.

    Returns:
        Series with the cluster label of every row of ``data``.
    """
    centroids = create_random_centroids(data, centroid_count)
    print(f'\nFirst Centroid:\n', centroids)
    old_centroids = pd.DataFrame()
    labels = None
    max_i = 0

    # range(1, max_iterations + 1) allows exactly max_iterations passes; the
    # previous `while iteration < max_iterations` stopped one pass early.
    for iteration in range(1, max_iterations + 1):
        if centroids.equals(old_centroids):
            break  # converged: the last update did not move any centroid
        old_centroids = centroids
        labels = find_labels(data, centroids)
        centroids = new_centroids(data, labels)
        plot_clusters(data, labels, centroids, iteration)  # Visualization function of how clusters changing
        max_i = iteration

    if labels is None:
        # No pass ran (max_iterations < 1): label against the initial
        # centroids instead of failing on an unbound `labels`.
        labels = find_labels(data, centroids)

    print(f'\nLast Centroids:\n', centroids)
    print(f'after {max_i} iterations')
    print(f'\nCount of cars going to every cluster category:\n', labels.value_counts())
    return labels
# Run the clustering: at most 50 iterations, 6 starting centroids.
labels = k_means_algorithm(50, 6)

# Cluster ids ordered from the most to the least populated cluster
# (value_counts() sorts by count, descending). The counts are computed once
# instead of once per loop iteration.
top_clusters = [int(cluster_id) for cluster_id in labels.value_counts().index]
# Categories (clusters ranked by size, largest first)
# 1 - Best deal car (balanced stats, good year, middle price, low kms, good engine cc)
# 2 - Normal city every day car (balanced stats, low consume)
# 3 - Normal city every day car (balanced stats, low consume)
# 4 - Big old car(High engine cc, high consume, 5+ seats = Vans, bus, mini-vans etc...)
# 5 - Fast Sport luxury car(very high stats - year, engine, horsepower)
# 6 - Cheap and old and small, last chance cars, bad stats
#
# The second and third largest clusters share one description, so the code
# below merges them: the six ranked clusters become five numbered categories.
df_list1, df_list2, df_list3, df_list4, df_list5 = [], [], [], [], []

# Category number written into the 'cluster' field, by size rank of the cluster.
category_by_rank = (1, 2, 2, 3, 4, 5)
category_of_cluster = dict(zip(top_clusters, category_by_rank))
rows_of_category = {
    1: df_list1,
    2: df_list2,
    3: df_list3,
    4: df_list4,
    5: df_list5,
}

for car_index, cluster in labels.items():
    category = category_of_cluster.get(cluster)
    if category is None:
        # Cluster ranked below the first six: not part of any category.
        continue
    row = cars.loc[car_index].copy()
    row['cluster'] = category
    rows_of_category[category].append(row)
# Build one cleaned DataFrame per category.
# A category stays empty when fewer clusters survive than expected; an empty
# row list would give a column-less DataFrame on which clean_car_data() fails
# with KeyError('price'), so empty categories get an empty frame that still
# carries the expected columns.
_cluster_columns = list(cars.columns) + ['cluster']
(
    new_cars_cluster1,
    new_cars_cluster2,
    new_cars_cluster3,
    new_cars_cluster4,
    new_cars_cluster5,
) = [
    clean_car_data(pd.DataFrame(rows)) if rows else pd.DataFrame(columns=_cluster_columns)
    for rows in (df_list1, df_list2, df_list3, df_list4, df_list5)
]

# Pie chart of how many cars fall into each category.
y = np.array([
    len(cluster_frame)
    for cluster_frame in (
        new_cars_cluster1,
        new_cars_cluster2,
        new_cars_cluster3,
        new_cars_cluster4,
        new_cars_cluster5,
    )
])
mylabels = ["Best offers", "Balanced cars", "Big cars/jeeps/4x4", "Fast luxury cars", "Cheap and old cars", ]
myexplode = [0.2, 0, 0, 0, 0]  # pull the first slice ("Best offers") out of the pie
plt.pie(y, labels=mylabels, explode=myexplode)
plt.show()
# Print every category and write it to its own CSV file, one after another.
cluster_reports = (
    ('\nBest deal car (balanced stats, good year, middle price, low kms, good engine cc) \n',
     new_cars_cluster1, 'new_cars_cluster1.csv'),
    ('\nNormal city every day car (balanced stats)\n',
     new_cars_cluster2, 'new_cars_cluster2.csv'),
    ('\nBig old car(High engine cc,5+ seats = Vans, bus,jeep, 4x4, mini-vans etc...)\n',
     new_cars_cluster3, 'new_cars_cluster3.csv'),
    ('\nFast Sport luxury car(new year, very high price, very fast)\n',
     new_cars_cluster4, 'new_cars_cluster4.csv'),
    ('\nCheap and old and small, last chance cars\n',
     new_cars_cluster5, 'new_cars_cluster5.csv'),
)
for report_title, cluster_frame, csv_name in cluster_reports:
    print(report_title, cluster_frame)
    cluster_frame.to_csv(csv_name)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement