Last active
February 17, 2020 14:52
-
-
Save mpolatcan/cc0214563313ae3ecdab834ae9651ed1 to your computer and use it in GitHub Desktop.
Match external market products against your own product catalogue and get the n best matches for each of them using TF-IDF and cosine similarity.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
from sklearn.feature_extraction.text import TfidfVectorizer | |
from scipy.spatial.distance import cosine | |
def get_best_n_match(external_products_df, internal_products_dfs, count=3):
    """Return the ``count`` closest internal products for every external product.

    Product names are embedded as TF-IDF vectors whose vocabulary is learned
    from the internal product names only, then compared by cosine distance
    (0.0 = same direction, 1.0 = no shared vocabulary).

    Args:
        external_products_df: DataFrame with at least the columns ``name``
            and ``category``; one row per product to be matched.
        internal_products_dfs: DataFrame with at least a ``name`` column;
            the catalogue that is searched for matches.
        count: Maximum number of matches kept per external product.

    Returns:
        A new DataFrame with the columns ``name``, ``category`` and
        ``best_match``. Each external product appears once per match, and
        ``best_match`` holds an ``(internal_product_name, distance)`` tuple,
        ordered from closest to farthest. Neither input frame is modified.
    """
    # Create vectorizer according to internal product space
    vectorizer = TfidfVectorizer()
    internal_tfidf = vectorizer.fit_transform(internal_products_dfs["name"])
    external_tfidf = vectorizer.transform(external_products_df["name"])

    # TfidfVectorizer L2-normalises every row by default, so the dot product
    # of two rows already is their cosine similarity. A name that shares no
    # token with the internal vocabulary is an all-zero row: its similarity
    # is 0 and its distance 1.0, instead of the NaN a 0/0 division yields.
    similarities = (external_tfidf @ internal_tfidf.T).toarray()
    # Clip away floating point noise such as -1e-16 for identical names.
    distances = (1.0 - similarities).clip(0.0, 2.0)

    internal_names = internal_products_dfs["name"].tolist()

    # A stable sort keeps the internal catalogue order for equal distances,
    # which is the tie-breaking behaviour of the builtin sorted().
    best_matches = [
        [
            (internal_names[position], float(row[position]))
            for position in row.argsort(kind="stable")[:count]
        ]
        for row in distances
    ]

    # Work on a copy so the caller's frames do not gain helper columns.
    result = external_products_df[["name", "category"]].copy()
    result["best_match"] = pd.Series(best_matches, index=result.index, dtype=object)
    return result.explode(column="best_match")
# Sample external (market) products as (name, category) records
external_products_df = pd.DataFrame(
    data=[
        ("Detan 30 gece Elektrolikit Cihaz + Yedek", "Temizlik"),
        ("Detan 60 gece Elektrolikit Cihaz + Yedek", "Temizlik"),
        ("Detan Sinekkıran Su Bazlı 275 ml", "Temizlik"),
        ("Polonez Hindi Füme Eti 60 g", "Et"),
        ("Polenez Hindi Jambon 60 g", "Et"),
        ("Polonez Fıstıklı Salam 110 gr", "Et"),
    ],
    columns=["name", "category"],
)

# Sample internal products (the catalogue searched for matches), same layout
internal_products_dfs = pd.DataFrame(
    data=[
        ("Detan Elektrolikit Cihazı + Yedek 30 Gece", "Temizlik"),
        ("Detan Su Bazlı Sinekkıran 275 ml", "Temizlik"),
        ("Polonez Hindi Büme Et 60 g", "Et"),
        ("Detan Sinekkıran Su Bazlı 150 ml", "Temizlik"),
        ("Detan Karafatmakıran Su Bazlı 150 ml", "Temizlik"),
        ("Polonez Hindi Jambon 60 g", "Et"),
        ("Polonez Fıstıklı Salam 110 g", "Et"),
    ],
    columns=["name", "category"],
)

# Keep the two best matches per external product and store them without the index column
best_matches_df = get_best_n_match(external_products_df, internal_products_dfs, count=2)
best_matches_df.to_csv("res.csv", index=False)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment