Skip to content

Instantly share code, notes, and snippets.

@cpearce
Created March 14, 2020 23:00
Show Gist options
  • Save cpearce/e9e3de30307bb65908a60a8471ded105 to your computer and use it in GitHub Desktop.
Save cpearce/e9e3de30307bb65908a60a8471ded105 to your computer and use it in GitHub Desktop.
Three ways to mine frequent patterns...
#!pip install mlxtend
import pandas as pd
import csv
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
def csv_to_tabular(csv_path):
    """
    Convert a CSV "market basket" file into a tabular 0/1 dataset.

    Given a path to a CSV data file, return a tabular dataset and its
    column names.

    For example, given a CSV file containing:
        eggs,cheese,bread
        eggs,milk
    Return:
        column_names=['bread', 'cheese', 'eggs', 'milk']
        tabular_dataset=[[1, 1, 1, 0], [0, 0, 1, 1]]

    That is, convert the CSV rows into a num_rows x num_items matrix of
    1=present, 0=absent values. Rows with no non-empty items are skipped.

    Column names are sorted so the column order is deterministic across
    runs; iterating a plain set (as the original did) varies with string
    hash randomization, making repeated runs print columns in a
    different order.
    """
    dataset = []
    column_names = set()
    with open(csv_path, "r") as csvfile:
        reader = csv.reader(csvfile, delimiter=",", quotechar='"')
        for items in reader:
            # Drop empty items (e.g. from trailing commas).
            items = [i for i in items if i]
            if not items:
                # No non-empty items in this row, skip it!
                continue
            transaction = set(items)
            dataset.append(transaction)
            # Accumulate unique item names as column names.
            column_names |= transaction
    # Sort to lock the column name order deterministically.
    column_names = sorted(column_names)
    tabular_dataset = [
        [1 if col_name in transaction else 0 for col_name in column_names]
        for transaction in dataset
    ]
    return tabular_dataset, column_names
def one():
    """Way one: build the 0/1 matrix ourselves, then run apriori on it."""
    tabular_dataset, column_names = csv_to_tabular("supermarket_trans.csv")
    # csv_to_tabular already produces a 0/1 matrix, so no TransactionEncoder
    # pass is needed: in the original, te.fit(...).transform(...) was dead
    # code whose result (te_ary) was never used.
    df = pd.DataFrame(tabular_dataset, columns=column_names)
    frequent_itemsets = apriori(df, min_support=0.1, use_colnames=True)
    print(frequent_itemsets)
def two():
    """Way two: let mlxtend's TransactionEncoder build the 0/1 matrix."""

    def csv_lines(csv_path_name):
        """Yield each CSV row as a list of its non-empty values."""
        with open(csv_path_name) as csv_file:
            for row in csv.reader(csv_file):
                # Drop empty values (e.g. from trailing commas).
                values = [value for value in row if value]
                if values:
                    # Skip rows with no items (e.g. blank lines). The
                    # original yielded [] for blank lines, inflating the
                    # transaction count and deflating every itemset's
                    # support.
                    yield values

    dataset = list(csv_lines("supermarket_trans.csv"))
    te = TransactionEncoder()
    te_ary = te.fit(dataset).transform(dataset)
    df = pd.DataFrame(te_ary, columns=te.columns_)
    frequent_itemsets = apriori(df, min_support=0.1, use_colnames=True)
    print(frequent_itemsets)
def three():
    """Way three: inline CSV read, then TransactionEncoder + apriori."""
    dataset = []
    with open('supermarket_trans.csv', 'r') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='"')
        for row in reader:
            # Drop empty values inside the row. The original only checked
            # len(row) > 0, so empty strings from trailing commas were
            # kept and '' got counted as an item — inconsistent with the
            # other two approaches.
            items = [value for value in row if value]
            if items:
                dataset.append(items)
    te = TransactionEncoder()
    te_ary = te.fit(dataset).transform(dataset)
    df = pd.DataFrame(te_ary, columns=te.columns_)
    frequent_itemsets = apriori(df, min_support=0.1, use_colnames=True)
    print(frequent_itemsets)
def one():
    """Way one: build the 0/1 matrix ourselves, then run apriori on it.

    NOTE(review): this duplicates (and, being later in the file, shadows)
    the earlier definition of one() above — one of the two copies should
    be deleted.
    """
    tabular_dataset, column_names = csv_to_tabular("supermarket_trans.csv")
    # csv_to_tabular already returns a 0/1 matrix; the TransactionEncoder
    # fit/transform here was dead code whose result (te_ary) was unused.
    df = pd.DataFrame(tabular_dataset, columns=column_names)
    frequent_itemsets = apriori(df, min_support=0.1, use_colnames=True)
    print(frequent_itemsets)
if __name__ == "__main__":
    # Guard the demo calls so importing this module doesn't run them.
    # Each approach reads supermarket_trans.csv and prints the frequent
    # itemsets it finds at min_support=0.1.
    one()
    two()
    three()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment