Created
March 14, 2020 23:00
-
-
Save cpearce/e9e3de30307bb65908a60a8471ded105 to your computer and use it in GitHub Desktop.
Three ways to mine frequent patterns...
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!pip install mlxtend | |
import pandas as pd | |
import csv | |
from mlxtend.preprocessing import TransactionEncoder | |
from mlxtend.frequent_patterns import apriori | |
def csv_to_tabular(csv_path):
    """Convert a transaction CSV file into a 0/1 tabular dataset.

    Given a path to a CSV data file, return a tabular dataset and its
    column names.

    For example, given a CSV file containing:
        eggs,cheese,bread
        eggs,milk
    Return:
        column_names=['bread','cheese','eggs','milk']
        tabular_dataset=[[1,1,1,0],[0,0,1,1]]

    That is, convert the CSV rows into a num_rows x num_items matrix of
    1=present, 0=absent values.  Column names are sorted so the column
    order is deterministic across runs.
    """
    dataset = []
    column_names = set()
    with open(csv_path, "r") as csvfile:
        reader = csv.reader(csvfile, delimiter=",", quotechar='"')
        for items in reader:
            # Drop empty items.
            items = [i for i in items if i]
            if not items:
                # No non-empty items in this row, skip it!
                continue
            transaction = set(items)
            dataset.append(transaction)
            # Add any unique item names to our set of column names.
            column_names |= transaction
    # Lock the column name order.  Sorting is the fix: iterating a raw
    # set is non-deterministic between runs, so the original list(set)
    # produced an unreproducible column order.
    column_names = sorted(column_names)
    tabular_dataset = [
        [1 if col_name in transaction else 0 for col_name in column_names]
        for transaction in dataset
    ]
    return tabular_dataset, column_names
def one():
    """Mine frequent itemsets from a hand-built 0/1 tabular matrix.

    csv_to_tabular already produces the one-hot matrix, so the original
    TransactionEncoder fit/transform here was dead code: its result
    (te_ary) was never used and the DataFrame was built directly from
    tabular_dataset.  The dead encoder step has been removed.
    """
    tabular_dataset, column_names = csv_to_tabular("supermarket_trans.csv")
    df = pd.DataFrame(tabular_dataset, columns=column_names)
    frequent_itemsets = apriori(df, min_support=0.1, use_colnames=True)
    print(frequent_itemsets)
def two():
    """Mine frequent itemsets, letting TransactionEncoder one-hot encode.

    Streams the CSV with a generator and hands the raw transaction lists
    to mlxtend's TransactionEncoder.
    """

    def csv_lines(csv_path_name):
        """Yield each non-empty CSV row as a list of non-empty values."""
        with open(csv_path_name) as csv_file:
            reader = csv.reader(csv_file)
            for row in reader:
                # Drop empty values.
                values = [value for value in row if value]
                # Skip rows with no non-empty values: keeping them as []
                # would inflate the transaction count and deflate every
                # itemset's support (consistent with one() and three()).
                if values:
                    yield values

    dataset = list(csv_lines("supermarket_trans.csv"))
    te = TransactionEncoder()
    te_ary = te.fit(dataset).transform(dataset)
    df = pd.DataFrame(te_ary, columns=te.columns_)
    # print(df)
    frequent_itemsets = apriori(df, min_support=0.1, use_colnames=True)
    print(frequent_itemsets)
def three():
    """Mine frequent itemsets, reading the CSV with explicit dialect args."""
    dataset = []
    with open('supermarket_trans.csv', 'r') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='"')
        for row in reader:
            # Drop empty values: a row like "eggs,," would otherwise
            # contribute '' as an item, giving TransactionEncoder a
            # bogus empty-string column (consistent with one() and two()).
            items = [value for value in row if value]
            if items:
                dataset.append(items)
    # print(len(dataset))
    te = TransactionEncoder()
    te_ary = te.fit(dataset).transform(dataset)
    df = pd.DataFrame(te_ary, columns=te.columns_)
    # print(df)
    frequent_itemsets = apriori(df, min_support=0.1, use_colnames=True)
    print(frequent_itemsets)
# NOTE(review): duplicate definition — this second one() shadows the
# one() defined earlier in the file; only this copy ever runs.  Keep a
# single copy.
def one():
    """Mine frequent itemsets from a hand-built 0/1 tabular matrix.

    csv_to_tabular already produces the one-hot matrix, so the original
    TransactionEncoder fit/transform here was dead code: its result
    (te_ary) was never used.  The dead encoder step has been removed.
    """
    tabular_dataset, column_names = csv_to_tabular("supermarket_trans.csv")
    df = pd.DataFrame(tabular_dataset, columns=column_names)
    # print(df)
    frequent_itemsets = apriori(df, min_support=0.1, use_colnames=True)
    print(frequent_itemsets)
# Script entry point: run all three mining variants.  The __main__
# guard keeps the mining from running as a side effect of import.
if __name__ == "__main__":
    one()
    two()
    three()
Sign up for free to join this conversation on GitHub.
Already have an account? Sign in to comment.