mirosh111000

The Apriori Method

Feb 24th, 2024
import time
from itertools import combinations

import numpy as np
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules


def EDA(df):
    """Print a quick overview: dimensions, total NA values, and per-column
    dtype, distinct count, and NA count."""
    total_na = df.isna().sum().sum()
    print("Dimensions : %d rows, %d columns" % (df.shape[0], df.shape[1]))
    print("Total NA Values : %d " % total_na)
    print("%38s %10s %14s %10s" % ("Column Name", "Data Type", "Count Distinct", "NA Values"))
    dtypes = df.dtypes
    uniq = df.nunique()
    na_val = df.isna().sum()
    for col in df.columns:
        # Index the summary Series by label rather than position (positional
        # indexing of a labeled Series is deprecated in recent pandas).
        print("%38s %10s %14s %10s" % (col, dtypes[col], uniq[col], na_val[col]))


def data_to_1_and_0(df):
    """One-hot encode a transactions table where each cell holds an item name:
    output columns are the distinct items, values are 1/0 for presence."""
    df = df.fillna("-")
    items = np.unique([i for i in df.stack().unique() if i != "-"])
    data = pd.DataFrame(columns=items)

    for index, row in df.iterrows():
        data.loc[index] = [1 if word in row.values else 0 for word in data.columns]

    # Cast away the object dtype that row-wise .loc assignment produces.
    return data.astype(int)
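
# The row-by-row loop above is easy to read but slow on large tables. A
# vectorized alternative (a sketch, not part of the original script, assuming
# the same "item names as cell values" layout and NaN-dropping stack()):
#
#     stacked = df.stack()
#     data = pd.crosstab(stacked.index.get_level_values(0), stacked).clip(upper=1)
#
# pd.crosstab counts item occurrences per row; clip(upper=1) turns counts
# into presence flags.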


class Apriori:
    """From-scratch Apriori: mines frequent itemsets above min_support, then
    derives association rules above min_confidence."""

    def __init__(self, min_support=0.25, min_confidence=0.3):
        self.min_support = min_support
        self.min_confidence = min_confidence
        self.columns_ = None
        self._transactions = None
        self.N = None

    def _get_frequent_itemsets(self, df):
        self.columns_ = df.columns
        self.N = len(df)
        transactions = np.copy(df.values)
        self._transactions = transactions
        names_itemsets = {}
        n_col = transactions.shape[1]
        frequent = []  # frequent itemsets as sorted lists of column indices
                       # (renamed from `combinations` to avoid shadowing the
                       # itertools import)
        total = 0      # number of candidates evaluated (diagnostic only)

        # Level 1: single items whose support clears the threshold.
        for i in range(n_col):
            total += 1
            sup = np.sum(transactions[:, i] == 1) / self.N
            if sup >= self.min_support:
                frequent.append([i])
                names_itemsets[f'{[self.columns_[i]]}'] = sup

        # Level k: extend each itemset from the previous level with items of
        # strictly larger index, so every candidate is generated exactly once,
        # in sorted order; support is counted directly on the transactions.
        floor = 2
        count_comb = len(frequent)
        while frequent and len(frequent[-1]) != n_col and floor <= n_col:
            n = count_comb
            if n == 0:
                break

            count_comb = 0
            for comb in frequent[-n:]:
                for l in range(comb[-1] + 1, n_col):
                    total += 1
                    candidate = comb + [l]
                    # A row supports the candidate iff its sum over the
                    # candidate's columns equals the candidate's size.
                    sum_ = transactions[:, candidate].sum(axis=1)
                    sup = np.sum(sum_ == len(candidate)) / self.N

                    if sup >= self.min_support:
                        frequent.append(candidate)
                        names_itemsets[f'{[self.columns_[i] for i in candidate]}'] = sup
                        count_comb += 1

            floor += 1

        frequent_itemsets = pd.DataFrame(list(names_itemsets.values()),
                                         index=names_itemsets.keys(), columns=['Support'])
        frequent_itemsets['Itemsets'] = [', '.join(str(self.columns_[col]) for col in itemset)
                                         for itemset in frequent]
        frequent_itemsets.index = range(len(frequent_itemsets))
        frequent_itemsets['Num_Itemsets'] = frequent

        return frequent_itemsets

    def _generate_rules(self, input_frame):
        rules = []
        for idx, row in input_frame.iterrows():
            items = row['Num_Itemsets']
            support = row['Support']
            if len(items) > 1:
                # Every proper, non-empty subset of a frequent itemset is a
                # candidate antecedent; the remaining items form the consequent.
                for r in range(1, len(items)):
                    for antecedent in combinations(items, r):
                        consequent = [item for item in items if item not in antecedent]

                        # confidence(A -> B) = support(A ∪ B) / support(A),
                        # with both supports expressed as transaction counts.
                        count_A_B = round(support * self.N)
                        sum_A = self._transactions[:, list(antecedent)].sum(axis=1)
                        count_A = np.sum(sum_A == len(antecedent))
                        conf = count_A_B / count_A
                        if conf >= self.min_confidence:
                            rules.append((', '.join(str(self.columns_[col]) for col in antecedent),
                                          ', '.join(str(self.columns_[col]) for col in consequent),
                                          support, conf))

        output_frame = pd.DataFrame(rules, columns=['Antecedents', 'Consequents', 'Support', 'Confidence'])
        return output_frame.sort_values(by='Confidence', ascending=False)

    def fit(self, df):
        support_df = self._get_frequent_itemsets(df)
        rules_df = self._generate_rules(support_df)
        support_df = support_df[['Support', 'Itemsets']]

        return support_df, rules_df
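
# Worked example (a sketch on made-up data, not from the original dataset):
# with min_support=0.5, {bread, milk} is frequent (support 2/4 = 0.5), and
# confidence(bread -> milk) = sup(bread & milk) / sup(bread) = 0.5 / 0.75 ≈ 0.67.
#
#     toy = pd.DataFrame({'bread': [1, 1, 0, 1],
#                         'milk':  [1, 1, 1, 0],
#                         'eggs':  [0, 1, 1, 1]})
#     toy_support, toy_rules = Apriori(min_support=0.5, min_confidence=0.6).fit(toy)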


# df = pd.read_csv('Market_Basket_Optimisation.csv')
df = pd.read_csv("retail_dataset.csv")
print(df)
EDA(df)
df = data_to_1_and_0(df)
print(df)
EDA(df)

# Time the from-scratch implementation.
start_time = time.time()
method_apriori = Apriori(min_support=0.15, min_confidence=0.7)
support, my_rules = method_apriori.fit(df)
my_time = time.time() - start_time
print(support)
print(my_rules)

# Time the mlxtend reference implementation; it expects a boolean (or 0/1)
# one-hot frame, and casting to bool avoids its deprecation warning.
start_time = time.time()
frequent_itemsets = apriori(df.astype(bool), min_support=0.15, use_colnames=True)
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.7)
rules = rules[['antecedents', 'consequents', 'support', 'confidence']]
rules = rules.sort_values(by='confidence', ascending=False)
mlx_time = time.time() - start_time
print(frequent_itemsets)
print(rules)

print(f'\nCustom algorithm runtime: {round(my_time, 6)} seconds.\nBuilt-in algorithm runtime: {round(mlx_time, 6)} seconds.')
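
# Optional sanity check (a sketch, not in the original script): the two rule
# sets should agree up to ordering and float precision. mlxtend stores
# antecedents/consequents as frozensets, so normalize both sides first:
#
#     mine = set(zip(my_rules['Antecedents'], my_rules['Consequents']))
#     mlx = {(', '.join(sorted(a)), ', '.join(sorted(c)))
#            for a, c in zip(rules['antecedents'], rules['consequents'])}
#     print(mine == mlx)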