import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from mlxtend.frequent_patterns import apriori, association_rules
from itertools import combinations
import time
def EDA(df):
    # Quick overview: shape, total missing values, and per-column dtype / distinct / NA counts.
    total_na = df.isna().sum().sum()
    print("Dimensions : %d rows, %d columns" % (df.shape[0], df.shape[1]))
    print("Total NA Values : %d " % (total_na))
    print("%38s %10s %14s %10s" % ("Column Name", "Data Type", "Count Distinct", "NA Values"))
    col_name = df.columns
    dtypes = df.dtypes
    uniq = df.nunique()
    na_val = df.isna().sum()
    for i in range(len(df.columns)):
        # use explicit positional access; plain integer [] on a label-indexed Series is deprecated
        print("%38s %10s %14s %10s" % (col_name[i], dtypes.iloc[i], uniq.iloc[i], na_val.iloc[i]))
def data_to_1_and_0(df):
    # One-hot encode the transaction table: one column per distinct item,
    # 1 if the item appears in the row's basket, 0 otherwise.
    df = df.fillna("-")
    data = pd.DataFrame(columns=np.unique([i for i in df.stack().unique() if i != "-"]))
    for index, row in df.iterrows():
        row_ = []
        for word in data.columns:
            if word in row.values:
                row_.append(1)
            else:
                row_.append(0)
        data.loc[index] = row_
    # ensure an integer 0/1 frame (row-wise .loc assignment can leave object dtype)
    return data.astype(int)
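# Optional alternative, not used below: a vectorized one-hot encoder sketch.
# It assumes the same wide layout (one transaction per row, items spread across
# the columns); the helper name data_to_1_and_0_fast is illustrative only.
def data_to_1_and_0_fast(df):
    # Stack the items into one long Series indexed by transaction, then
    # cross-tabulate transaction against item; clip so duplicates still encode as 1.
    stacked = df.stack().reset_index(level=1, drop=True)
    return pd.crosstab(stacked.index, stacked).clip(upper=1)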
class Apriori:
    def __init__(self, min_support=0.25, min_confidence=0.3):
        self.min_support = min_support
        self.min_confidence = min_confidence
        self.columns_ = None
        self._transactions = None
        self.N = None
    def _get_frequent_itemsets(self, df):
        # Level-wise search: start from frequent single items, then grow each frequent
        # itemset one column at a time (only columns to the right of the itemset's last
        # index are added, so every combination is generated once).
        self.columns_ = df.columns
        df = df.copy()
        self.N = len(df)
        transactions = np.copy(df.values)
        self._transactions = transactions
        names_itemsets = {}
        n_col = transactions.shape[1]
        combinations = []  # note: shadows itertools.combinations inside this method only
        total = 0  # number of candidate itemsets evaluated (not used further)
        # frequent 1-itemsets
        for i in range(n_col):
            total += 1
            sum_ = np.copy(transactions[:, i])
            sup = np.sum(sum_[sum_ == 1]) / self.N
            list_sup = []
            if sup >= self.min_support:
                list_sup.append(i)
                names_itemsets[f'{[self.columns_[i] for i in list_sup]}'] = sup
                combinations.append(list_sup)
        # frequent k-itemsets, k >= 2
        floor = 2
        count_comb = len(combinations)
        while combinations and len(combinations[-1]) != n_col and floor < n_col:
            n = count_comb
            if n == 0:
                break
            count_comb = 0
            for comb in combinations[-n:]:
                if comb[-1] == n_col - 1 and comb[0] == n_col - 1:
                    break
                for l in range(comb[-1] + 1, n_col):
                    total += 1
                    list_sup = [i for i in comb]
                    count = len(comb) + 1
                    sum_ = np.zeros_like(np.copy(transactions[:, comb[0]]))
                    for k in comb:
                        sum_ += np.copy(transactions[:, k])
                    sum_ += np.copy(transactions[:, l])
                    # a transaction contains the candidate iff all `count` columns are 1
                    sup = (np.sum(sum_[sum_ == count]) / count) / self.N
                    if sup >= self.min_support:
                        list_sup.append(l)
                        names_itemsets[f'{[self.columns_[i] for i in list_sup]}'] = sup
                        combinations.append(list_sup)
                        count_comb += 1
            floor += 1
        frequent_itemsets = pd.DataFrame(list(names_itemsets.values()), index=names_itemsets.keys(), columns=['Support'])
        frequent_itemsets['Itemsets'] = [', '.join(map(str, [self.columns_[col] for col in itemset])) for itemset in combinations]
        frequent_itemsets.index = range(len(frequent_itemsets))
        frequent_itemsets['Num_Itemsets'] = combinations
        return frequent_itemsets
    def _generate_rules(self, input_frame):
        # For every frequent itemset, split it into antecedent/consequent pairs and keep
        # the rules whose confidence = support(A and B) / support(A) clears the threshold.
        rules = []
        for idx, row in input_frame.iterrows():
            items = row['Num_Itemsets']
            support = row['Support']
            if len(items) > 1:
                for r in range(1, len(items)):
                    antecedents = list(combinations(items, r))
                    for antecedent in antecedents:
                        consequent = [item for item in items if item not in antecedent]
                        sum_A = np.zeros_like(self._transactions[:, 0])
                        sum_A_B = round(support * self.N)  # transactions containing the whole itemset
                        for l in antecedent:
                            sum_A += self._transactions[:, l]
                        # transactions containing every antecedent item
                        sum_A = np.sum(sum_A[sum_A == len(antecedent)]) / len(antecedent)
                        conf = sum_A_B / sum_A
                        if conf >= self.min_confidence:
                            rules.append((', '.join(map(str, [self.columns_[col] for col in antecedent])),
                                          ', '.join(map(str, [self.columns_[col] for col in consequent])),
                                          support, conf))
        output_frame = pd.DataFrame(rules, columns=['Antecedents', 'Consequents', 'Support', 'Confidence'])
        output_frame = output_frame.sort_values(by='Confidence', ascending=False)
        return output_frame
    def fit(self, df):
        support_df = self._get_frequent_itemsets(df)
        rules_df = self._generate_rules(support_df)
        support_df = support_df[['Support', 'Itemsets']]
        return support_df, rules_df
# df = pd.read_csv('Market_Basket_Optimisation.csv')
df = pd.read_csv("retail_dataset.csv")
print(df)
EDA(df)

df = data_to_1_and_0(df)
print(df)
EDA(df)

# Custom implementation
start_time = time.time()
method_apriori = Apriori(min_support=0.15, min_confidence=0.7)
support, my_rules = method_apriori.fit(df)
end_time = time.time()
my_time = end_time - start_time
print(support)
print(my_rules)
# Reference implementation from mlxtend for comparison
start_time = time.time()
frequent_itemsets = apriori(df, min_support=0.15, use_colnames=True)
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.7)
rules = rules[['antecedents', 'consequents', 'support', 'confidence']]
rules = rules.sort_values(by='confidence', ascending=False)
end_time = time.time()
mlx_time = end_time - start_time
print(frequent_itemsets)
print(rules)
print(f'\nRuntime of the custom algorithm: {round(my_time, 6)} seconds.\nRuntime of the built-in algorithm: {round(mlx_time, 6)} seconds.')