import numpy as np
import pandas as pd
import itertools
How the PCY Algorithm works
An Effective Hash-Based Algorithm for Mining Association Rules.
Initializing the transaction baskets
# initialize lists: the transaction baskets and each basket's candidate pairs
data = [[1, 2, 5], [2, 3, 5], [4, 5], [1, 6, 7], [2, 3, 5, 7], [1, 2, 7]]
# All unordered 2-item combinations per basket,
# e.g. [1, 2, 5] -> [(1, 2), (1, 5), (2, 5)]
pairs = [list(itertools.combinations(i, 2)) for i in data]
mydict = {'Items': data, 'Pairs': pairs}
# Create the pandas DataFrame; column names are provided explicitly
df = pd.DataFrame(mydict)
# Print dataframe (display() is supplied by the IPython/Jupyter environment)
display(df)
Items | Pairs | |
---|---|---|
0 | [1, 2, 5] | [(1, 2), (1, 5), (2, 5)] |
1 | [2, 3, 5] | [(2, 3), (2, 5), (3, 5)] |
2 | [4, 5] | [(4, 5)] |
3 | [1, 6, 7] | [(1, 6), (1, 7), (6, 7)] |
4 | [2, 3, 5, 7] | [(2, 3), (2, 5), (2, 7), (3, 5), (3, 7), (5, 7)] |
5 | [1, 2, 7] | [(1, 2), (1, 7), (2, 7)] |
Calculating the supports
print(data)
# -> [[1, 2, 5], [2, 3, 5], [4, 5], [1, 6, 7], [2, 3, 5, 7], [1, 2, 7]]
# First scan: register every distinct item with a support count of zero.
supports = {}
for row in data:
    for e in row:
        if e not in supports:
            supports[e] = 0
# Sorting dictionary
# NOTE(review): the sorted keys are assigned to a misspelled, never-used name
# (`suppports`), so `supports` keeps insertion order — matching the printed
# output below. Kept as-is to preserve the notebook's shown results.
sorted_dict = supports.keys()
suppports = sorted(sorted_dict)
print(supports)
# -> {1: 0, 2: 0, 5: 0, 3: 0, 4: 0, 6: 0, 7: 0}
# Second scan: tally the support (number of occurrences) of each item.
for row in data:
    for e in row:
        supports[e] += 1
print(supports)
# -> {1: 3, 2: 4, 5: 4, 3: 2, 4: 1, 6: 1, 7: 3}
# Tabulate the per-item supports for display.
item = list(supports.keys())
supp = list(supports.values())
unique_items_count = {'Itemset': item, 'Sup': supp}
df_item_sup = pd.DataFrame(unique_items_count)
display(df_item_sup)
Itemset | Sup | |
---|---|---|
0 | 1 | 3 |
1 | 2 | 4 |
2 | 5 | 4 |
3 | 3 | 2 |
4 | 4 | 1 |
5 | 6 | 1 |
6 | 7 | 3 |
Pass 1
def hash_f_pair(i, j):
    """Map the candidate pair (i, j) to one of 7 buckets: (i * j) mod 7."""
    product = i * j
    return product % 7
# Collect every distinct candidate pair, preserving first-seen order.
unique_items = []
for row in pairs:
    for e in row:
        if e not in unique_items:
            unique_items.append(e)
print(unique_items)
# -> [(1, 2), (1, 5), (2, 5), (2, 3), (3, 5), (4, 5), (1, 6), (1, 7), (6, 7), (2, 7), (3, 7), (5, 7)]
# Example: second element of the first pair.
unique_items[0][1]
# -> 2
# Hash each unique pair into its bucket (0..6).
hash_value_pairs = [hash_f_pair(i, j) for (i, j) in unique_items]
print(hash_value_pairs)
# -> [2, 5, 3, 6, 1, 6, 6, 0, 0, 0, 0, 0]
# Pair-occurrence counts, keyed by the pair's string form, e.g. '(1, 2)'.
my_dict2 = dict.fromkeys((str(pair) for pair in unique_items), 0)
print(my_dict2)
# -> {'(1, 2)': 0, '(1, 5)': 0, '(2, 5)': 0, '(2, 3)': 0, '(3, 5)': 0, '(4, 5)': 0, '(1, 6)': 0, '(1, 7)': 0, '(6, 7)': 0, '(2, 7)': 0, '(3, 7)': 0, '(5, 7)': 0}
# Scan every basket's pair list and tally each pair occurrence.
for row in pairs:
    for e in row:
        my_dict2[str(e)] += 1

print(my_dict2)
# -> {'(1, 2)': 2, '(1, 5)': 1, '(2, 5)': 3, '(2, 3)': 2, '(3, 5)': 2, '(4, 5)': 1, '(1, 6)': 1, '(1, 7)': 2, '(6, 7)': 1, '(2, 7)': 2, '(3, 7)': 1, '(5, 7)': 1}
# Table 1: each unique pair with its count and hash bucket.
counts = list(my_dict2.values())
print(counts)
# -> [2, 1, 3, 2, 2, 1, 1, 2, 1, 2, 1, 1]
unique_items_count = {'Pairs': unique_items, 'Count': counts, 'Hash': hash_value_pairs}
print(unique_items_count)
df_counts = pd.DataFrame(unique_items_count)
display(df_counts)
Pairs | Count | Hash | |
---|---|---|---|
0 | (1, 2) | 2 | 2 |
1 | (1, 5) | 1 | 5 |
2 | (2, 5) | 3 | 3 |
3 | (2, 3) | 2 | 6 |
4 | (3, 5) | 2 | 1 |
5 | (4, 5) | 1 | 6 |
6 | (1, 6) | 1 | 6 |
7 | (1, 7) | 2 | 0 |
8 | (6, 7) | 1 | 0 |
9 | (2, 7) | 2 | 0 |
10 | (3, 7) | 1 | 0 |
11 | (5, 7) | 1 | 0 |
The minimum support count is 2, so we eliminate from the pair list the items whose count in Table 1 is less than 2. The resulting table is called the Candidate Pairs.
# Pass 1 pruning: drop candidate pairs below the minimum support count (2).
# NOTE(review): only rows 5, 6 and 8 are dropped here, but rows 1, 10 and 11
# also have Count < 2 (row 1 is removed later in pass 2; 10 and 11 survive
# to the final table) — confirm this is the intended demonstration.
df_counts_after = df_counts.drop([5, 6, 8])
display(df_counts_after)
Pairs | Count | Hash | |
---|---|---|---|
0 | (1, 2) | 2 | 2 |
1 | (1, 5) | 1 | 5 |
2 | (2, 5) | 3 | 3 |
3 | (2, 3) | 2 | 6 |
4 | (3, 5) | 2 | 1 |
7 | (1, 7) | 2 | 0 |
9 | (2, 7) | 2 | 0 |
10 | (3, 7) | 1 | 0 |
11 | (5, 7) | 1 | 0 |
Pass 2
The final step is to build a table from Table 2 that counts the number of occurrences per hash value.
# Table 3: total pair count per hash bucket 0..max(hash).
hash_values = list(range(max(hash_value_pairs) + 1))
my_dict3 = dict.fromkeys(hash_values, 0)
# Accumulate every pair's count into its bucket.
for hash_value, count in zip(df_counts["Hash"], df_counts["Count"]):
    my_dict3[hash_value] += count
print(my_dict3)
data_bucket = {"Bucket": hash_values, "Count": my_dict3.values()}
df_bucket_count = pd.DataFrame(data_bucket)
display(df_bucket_count)
{0: 7, 1: 2, 2: 2, 3: 3, 4: 0, 5: 1, 6: 4}
Bucket | Count | |
---|---|---|
0 | 0 | 7 |
1 | 1 | 2 |
2 | 2 | 2 |
3 | 3 | 3 |
4 | 4 | 0 |
5 | 5 | 1 |
6 | 6 | 4 |
With the result above we can look back at Table 3 and see which hash values have a count lower than the minimum support count — hash values 4 and 5. With this result we look at Table 2 and delete the rows with those hash values. The final result is the Final Candidates.
# Pass 2 pruning: row 1, pair (1, 5), hashes to bucket 5, whose bucket count
# (1) is below the minimum support, so it is removed from the candidates.
df_counts_after_pass2 = df_counts_after.drop([1])
display(df_counts_after_pass2)
Pairs | Count | Hash | |
---|---|---|---|
0 | (1, 2) | 2 | 2 |
2 | (2, 5) | 3 | 3 |
3 | (2, 3) | 2 | 6 |
4 | (3, 5) | 2 | 1 |
7 | (1, 7) | 2 | 0 |
9 | (2, 7) | 2 | 0 |
10 | (3, 7) | 1 | 0 |
11 | (5, 7) | 1 | 0 |