Skip to content

Instantly share code, notes, and snippets.

@mpolatcan
Last active February 17, 2020 14:52
Show Gist options
  • Save mpolatcan/cc0214563313ae3ecdab834ae9651ed1 to your computer and use it in GitHub Desktop.
Save mpolatcan/cc0214563313ae3ecdab834ae9651ed1 to your computer and use it in GitHub Desktop.
Match external market products to your product categories and get the n best matches among your own products using TF-IDF and cosine similarity.
import numpy as np
import pandas as pd
from scipy.spatial.distance import cosine
from sklearn.feature_extraction.text import TfidfVectorizer
def get_best_n_match(external_products_df, internal_products_dfs, count=3):
    """Return the ``count`` closest internal products for each external product.

    A ``TfidfVectorizer`` is fitted on the internal product names only, so both
    catalogues are embedded in the internal vocabulary. Candidates are ranked
    by cosine distance (``1 - cosine similarity``), smallest first; ties keep
    the row order of ``internal_products_dfs``.

    Args:
        external_products_df: DataFrame with ``name`` and ``category`` columns.
        internal_products_dfs: DataFrame with a ``name`` column.
        count: Number of matches to keep per external product.

    Returns:
        A new DataFrame with ``name``, ``category`` and ``best_match`` columns,
        exploded so that every row carries a single
        ``(internal_product_name, distance)`` tuple. The original index labels
        of ``external_products_df`` are repeated for each match. Neither input
        frame is modified.
    """
    internal_names = internal_products_dfs["name"].tolist()

    # Create the vectorizer according to the internal product space.
    vectorizer = TfidfVectorizer()
    vectorizer.fit(internal_names)

    # Vectorise each catalogue in one call instead of one row at a time.
    external_tfidf = vectorizer.transform(external_products_df["name"])
    internal_tfidf = vectorizer.transform(internal_names)

    # TfidfVectorizer L2-normalises every row by default, so the dot product of
    # two rows is their cosine similarity. A name that shares no token with the
    # vocabulary is an all-zero row: similarity 0, distance 1 (never NaN).
    similarity = external_tfidf @ internal_tfidf.T

    matches = []
    for row in range(similarity.shape[0]):
        # Clip to absorb floating-point error around identical vectors.
        distances = np.clip(1.0 - similarity[row].toarray().ravel(), 0.0, 1.0)
        # A stable sort keeps internal row order for equal distances.
        closest = np.argsort(distances, kind="stable")[:count]
        matches.append(
            [(internal_names[i], float(distances[i])) for i in closest]
        )

    # Build the result on a copy so the caller's frames are left untouched.
    result = external_products_df[["name", "category"]].copy()
    result["best_match"] = matches
    return result.explode(column="best_match")
# External market catalogue: one (name, category) record per product.
external_products_df = pd.DataFrame(
    data=[
        ("Detan 30 gece Elektrolikit Cihaz + Yedek", "Temizlik"),
        ("Detan 60 gece Elektrolikit Cihaz + Yedek", "Temizlik"),
        ("Detan Sinekkıran Su Bazlı 275 ml", "Temizlik"),
        ("Polonez Hindi Füme Eti 60 g", "Et"),
        ("Polenez Hindi Jambon 60 g", "Et"),
        ("Polonez Fıstıklı Salam 110 gr", "Et"),
    ],
    columns=["name", "category"],
)

# Internal catalogue the external products are matched against.
internal_products_dfs = pd.DataFrame(
    data=[
        ("Detan Elektrolikit Cihazı + Yedek 30 Gece", "Temizlik"),
        ("Detan Su Bazlı Sinekkıran 275 ml", "Temizlik"),
        ("Polonez Hindi Büme Et 60 g", "Et"),
        ("Detan Sinekkıran Su Bazlı 150 ml", "Temizlik"),
        ("Detan Karafatmakıran Su Bazlı 150 ml", "Temizlik"),
        ("Polonez Hindi Jambon 60 g", "Et"),
        ("Polonez Fıstıklı Salam 110 g", "Et"),
    ],
    columns=["name", "category"],
)

# Keep the two closest internal products per external product and export them.
best_matches = get_best_n_match(external_products_df, internal_products_dfs, count=2)
best_matches.to_csv("res.csv", index=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment