Advertisement
GeorgiLukanov87

KMeansClusterAlg

Aug 10th, 2024 (edited)
169
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 8.71 KB | None | 0 0
  1. from sklearn.cluster import KMeans
  2. from sklearn.decomposition import PCA
  3. import matplotlib.pyplot as plt
  4. from IPython.display import clear_output
  5.  
  6. import pandas as pd
  7. import numpy as np
  8.  
  9. """
  10. KMeans Cluster Algorithm steps:
  11.  
  12. 1.Read data from csv
  13. 2.Prepare data - clean data and rename
  14. 3.Scaling/normalize data - min max scaling
  15. 4.Initialize random CENTROIDS
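  16. 5.Label each data point - Calculate the distance between every data point and each CENTROID and assign the point to the nearest CENTROID
  17. 6.Update CENTROIDS - each CENTROID becomes the geometric mean of the data points assigned to it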
  18. 7.Repeat step 5 and 6 until CENTROIDS stop changing
  19. """
  20.  
  21.  
  22. # Elbow method is to find the best count of the clusters we need for the KMeans Cluster Algorithm
  23. def elbow_method1(scaled_data):
  24.     means = range(1, 12)
  25.     inertias = []
  26.     for k in means:
  27.         km = KMeans(n_clusters=k, random_state=42)
  28.         km.fit(scaled_data)
  29.         inertias.append(km.inertia_)
  30.  
  31.     plt.title('Elbow Method for Optimal Clusters')
  32.     plt.plot(means, inertias, 'o-')
  33.     plt.xlabel('Number of Clusters')
  34.     plt.ylabel('Inertia')
  35.     plt.grid(True)
  36.     plt.show()
  37.  
  38.  
  39. def rename_data(df):  # Rename columns
  40.     df.rename(
  41.         columns={
  42.             'selling_price': 'price',
  43.             'engine': 'engine_cc',
  44.             'km_driven': 'kms',
  45.             'max_power': 'horsepower'
  46.         }, inplace=True)
  47.     return df
  48.  
  49.  
  50. def clean_car_data(df):  # Clean data
  51.     cleaned_df = df.copy()
  52.     re_pattern = f'([0-9]+)'
  53.  
  54.     cleaned_df['price'] = cleaned_df['price'].apply(lambda x: x / 10)
  55.     cleaned_df['engine_cc'] = cleaned_df['engine_cc'].str.extract(re_pattern).astype(float)
  56.     cleaned_df['horsepower'] = cleaned_df['horsepower'].str.extract(re_pattern).astype(float)
  57.     cleaned_df['mileage'] = cleaned_df['mileage'].str.extract(re_pattern).astype(float)
  58.  
  59.     return cleaned_df
  60.  
  61.  
  62. # Read data
  63. cars = pd.read_csv('selling_cars_list.csv')
  64. features = ['year', 'price', 'kms', 'engine_cc', 'horsepower', 'seats', 'mileage']
  65. cars = rename_data(cars)
  66. cars = cars.dropna(subset=features)
  67. data = cars[features].copy()
  68. car_data = clean_car_data(data)
  69.  
  70. """
  71.                        Cleaned data:
  72.    year         price        kms        engine         hp
  73. ------------------------------------------------------------
  74. 0  2014    |    45000.0   |   145500   |   1248.0   |    74.0
  75. 1  2014    |    37000.0   |   120000   |   1498.0   |    103.0
  76. 2  2006    |    15800.0   |   140000   |   1497.0   |    78.0
  77. 3  2010    |    22500.0   |   127000   |   1396.0   |    90.0
  78. 4  2007    |    13000.0   |   120000   |   1298.0   |    88.0
  79. -------------------------------------------------------------
  80. """
  81. # Scaling data, Min Max Scaling
  82. # (No negative value or 0) the values must be between (0.1 to 1) or (1 to 10) or (10 to 100) etc...
  83. data = ((car_data - car_data.min()) / (car_data.max() - car_data.min())) * 9 + 1
  84. # data = ((car_data - car_data.min()) / (car_data.max() - car_data.min())) * 0.9 + 0.1
  85. elbow_method1(car_data)
  86. """
  87.                        Scaled data:
  88.    year            price            kms          engine          hp
  89. ------------------------------------------------------------------------
  90. 0  7.923077   |    1.379138   |   1.554762   |   2.884564   |   2.027174
  91. 1  7.923077   |    1.306922   |   1.457535   |   3.639597   |   2.736413
  92. 2  5.153846   |    1.115548   |   1.533791   |   3.636577   |   2.125000
  93. 3  6.538462   |    1.176029   |   1.484225   |   3.331544   |   2.418478
  94. 4  5.500000   |    1.090272   |   1.457535   |   3.035570   |   2.369565
  95. ------------------------------------------------------------------------
  96. """
  97. print(f'\nCleaned data: \n', car_data.head())
  98. print(f'\nScaled data: \n', data.head())
  99.  
  100.  
  101. def create_random_centroids(data, k):
  102.     random_centroids = []
  103.     for _ in range(k):
  104.         centroid = data.apply(lambda x: float(x.sample().iloc[0]))  # sample() get a random value from each column
  105.         # Type of centroid <class 'pandas.core.series.Series'>
  106.         random_centroids.append(centroid)
  107.  
  108.     # Convert into <class 'pandas.core.frame.DataFrame'>
  109.     convert_to_df = pd.concat(random_centroids, axis=1)
  110.     return convert_to_df
  111.  
  112.  
  113. # K = Number of Clusters
  114. # k = 6
  115. # centroids = create_random_centroids(data, k)
  116. # print(f'\nCentroids: ')
  117. # print(centroids)
  118.  
  119.  
  120. # Finding distance between data points and centroid using Pythagorean theorem
  121. # distance = np.sqrt((data - centroids.iloc[:, 0]) ** 2).sum(axis=1)
  122. def find_labels(data, centroids):
  123.     distances = centroids.apply(lambda x: np.sqrt((data - x) ** 2).sum(axis=1))
  124.     return distances.idxmin(axis=1)
  125.  
  126.  
  127. # labels = find_labels(data, centroids)
  128.  
  129.  
  130. # Create new centroids updated with geometric mean data
  131. def new_centroids(data, labels):
  132.     return data.groupby(labels).apply(lambda x: np.exp(np.log(x).mean())).T
  133.  
  134.  
  135. # Visualisation
  136. """
  137. # PCA - Principal Component Analysis.
  138. The PCA class helps us to transform the multi-dimensional feature data into 2-dimensional data.
  139. It's much easier to display 2 dimensions instead of 5 or more.
  140. """
  141.  
  142.  
  143. def plot_clusters(data, labels, centroids, iteration):
  144.     pca = PCA(n_components=2)
  145.     data_2d = pca.fit_transform(data)
  146.     centroids_2d = pca.transform(centroids.T)
  147.     clear_output(wait=True)
  148.     plt.title(f'Iteration {iteration}')
  149.     plt.scatter(x=data_2d[:, 0], y=data_2d[:, 1], c=labels)
  150.     plt.scatter(x=centroids_2d[:, 0], y=centroids_2d[:, 1], color='red', s=75)
  151.     plt.show()
  152.  
  153.  
  154. def k_means_algorithm(max_iterations=50, centroid_count=5):
  155.     centroids = create_random_centroids(data, centroid_count)
  156.     print(f'\nFirst Centroid:\n', centroids)
  157.     old_centroids = pd.DataFrame()
  158.     iteration = 1
  159.     max_i = 0
  160.     while iteration < max_iterations and not centroids.equals(old_centroids):
  161.         old_centroids = centroids
  162.  
  163.         labels = find_labels(data, centroids)
  164.         centroids = new_centroids(data, labels)
  165.         plot_clusters(data, labels, centroids, iteration)  # Visualization function of how clusters changing
  166.         max_i = iteration
  167.         iteration += 1
  168.  
  169.     print(f'\nLast Centroids:\n', centroids)
  170.     print(f'after {max_i} iterations')
  171.     print(f'\nCount of cars going to every cluster category:\n', labels.value_counts())
  172.     return labels
  173.  
  174.  
  175. labels = k_means_algorithm(50, 6)
  176.  
  177. top_clusters = []
  178. for i in range(len(labels.value_counts())):
  179.     top_clusters.append(int(labels.value_counts().index[i]))
  180.  
  181. """
  182. # Categories
  183. #1 - Best deal car (balanced stats, good year, middle price, low kms, good engine cc)
  184. #2 - Normal city every day car (balanced stats, low consume)
  185. #3 - Normal city every day car (balanced stats, low consume)
  186. #4 - Big old car(High engine cc, high consume, 5+ seats = Vans, bus, mini-vans etc...)
  187. #5 - Fast Sport luxury car(very high stats - year, engine, horsepower)
  188. #6 - Cheap and old and small, last chance cars, bad stats
  189. """
  190.  
  191. df_list1, df_list2, df_list3, df_list4, df_list5 = [], [], [], [], []
  192.  
  193. for car_index, cluster in labels.items():
  194.     row = cars.loc[car_index].copy()
  195.  
  196.     if cluster == top_clusters[0]:
  197.         row['cluster'] = 1
  198.         df_list1.append(row)
  199.  
  200.     elif cluster == top_clusters[1] or cluster == top_clusters[2]:
  201.         row['cluster'] = 2
  202.         df_list2.append(row)
  203.  
  204.     elif cluster == top_clusters[3]:
  205.         row['cluster'] = 3
  206.         df_list3.append(row)
  207.  
  208.     elif cluster == top_clusters[4]:
  209.         row['cluster'] = 4
  210.         df_list4.append(row)
  211.  
  212.     elif cluster == top_clusters[5]:
  213.         row['cluster'] = 5
  214.         df_list5.append(row)
  215.  
  216. new_cars_cluster1 = clean_car_data(pd.DataFrame(df_list1))
  217. new_cars_cluster2 = clean_car_data(pd.DataFrame(df_list2))
  218. new_cars_cluster3 = clean_car_data(pd.DataFrame(df_list3))
  219. new_cars_cluster4 = clean_car_data(pd.DataFrame(df_list4))
  220. new_cars_cluster5 = clean_car_data(pd.DataFrame(df_list5))
  221.  
  222. y = np.array([len(new_cars_cluster1), len(new_cars_cluster2), len(new_cars_cluster3), len(new_cars_cluster4),
  223.               len(new_cars_cluster5)])
  224. mylabels = ["Best offers", "Balanced cars", "Big cars/jeeps/4x4", "Fast luxury cars", "Cheap and old cars", ]
  225. myexplode = [0.2, 0, 0, 0, 0]
  226. plt.pie(y, labels=mylabels, explode=myexplode)
  227. plt.show()
  228.  
  229. print('\nBest deal car (balanced stats, good year, middle price, low kms, good engine cc) \n', new_cars_cluster1)
  230. new_cars_cluster1.to_csv('new_cars_cluster1.csv')
  231.  
  232. print('\nNormal city every day car (balanced stats)\n', new_cars_cluster2)
  233. new_cars_cluster2.to_csv('new_cars_cluster2.csv')
  234.  
  235. print('\nBig old car(High engine cc,5+ seats = Vans, bus,jeep, 4x4, mini-vans etc...)\n', new_cars_cluster3)
  236. new_cars_cluster3.to_csv('new_cars_cluster3.csv')
  237.  
  238. print('\nFast Sport luxury car(new year, very high price, very fast)\n', new_cars_cluster4)
  239. new_cars_cluster4.to_csv('new_cars_cluster4.csv')
  240.  
  241. print('\nCheap and old and small, last chance cars\n', new_cars_cluster5)
  242. new_cars_cluster5.to_csv('new_cars_cluster5.csv')
  243.  
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement