Last active
August 29, 2015 14:15
-
-
Save wy36101299/35b71ab48f38ac890212 to your computer and use it in GitHub Desktop.
collaborative filtering
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
# linalg:Linear algebra | |
# 日式 中式 美式 泰式 韓式 | |
# --------------------------- | |
# sam 2 0 0 4 4 | |
#john 5 5 5 3 3 | |
# tim 2 4 2 1 2 | |
#以下矩陣可看此 user 對 item 進行評分 | |
#藉由協同過濾 猜出 sam對中式和美式的評價後,推薦sam吃什麼 | |
def loadItemScore(): | |
tmp = [[4, 4, 0, 2, 2], | |
[4, 0, 0, 3, 3], | |
[4, 0, 0, 1, 1], | |
[1, 1, 1, 2, 0], | |
[2, 2, 2, 0, 0], | |
[5, 5, 5, 0, 0], | |
[1, 1, 1, 0, 0]] | |
itemScore = np.array(tmp,dtype=np.float) | |
return itemScore | |
def loadItemScore2(): | |
tmp = [[0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 5], | |
[0, 0, 0, 3, 0, 4, 0, 0, 0, 0, 3], | |
[0, 0, 0, 0, 4, 0, 0, 1, 0, 4, 0], | |
[3, 3, 4, 0, 0, 0, 0, 2, 2, 0, 0], | |
[5, 4, 5, 0, 0, 0, 0, 5, 5, 0, 0], | |
[0, 0, 0, 0, 5, 0, 1, 0, 0, 5, 0], | |
[4, 3, 4, 0, 0, 0, 0, 5, 5, 0, 1], | |
[0, 0, 0, 4, 0, 4, 0, 0, 0, 0, 4], | |
[0, 0, 0, 2, 0, 2, 5, 0, 0, 1, 2], | |
[0, 0, 0, 0, 5, 0, 0, 0, 0, 4, 0], | |
[1, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0]] | |
itemScore = np.array(tmp,dtype=np.float) | |
return itemScore | |
# SVD 矩陣奇異值分解 | |
# 利用SVD奇異值分解可以大幅減低矩陣的儲存量 | |
U,sigma,VT = np.linalg.svd(loadItemScore2()) | |
# 矩陣能量為 sum(sigma**2) 需保留90%為典型作法 | |
sig2 = sigma**2 | |
print("矩陣總能量為:") | |
print(sum(sig2)) | |
print("90%矩陣能量:") | |
print(sum(sig2)*0.9) | |
print("轉換為二維矩陣能量:") | |
print(sum(sig2[:2])) | |
print("轉換三維矩陣能量:") | |
print(sum(sig2[:3])) | |
# 轉換為二維能量太低,需為三維 svdMatrix則為還原後的矩陣 | |
Sig4 = np.mat(np.eye(3)*sigma[:3]) | |
svdMatrix = U[:,:3] * Sig4 * VT[:3,:] | |
# 資料來源:http://en.wikipedia.org/wiki/Singular_value_decomposition | |
# Euclid Similarity | |
def eulidSim(a,b): | |
eulid = np.linalg.norm(a-b) | |
norEulid = 1.0/(1.0 + eulid) | |
return norEulid | |
# Adjusted Cosine Similarity | |
# 不超過3 Similarity =1 | |
def adjCosSim(a,b): | |
if len(a)<3: | |
return 1 | |
else: | |
a = a-a.mean() | |
b = b-b.mean() | |
cos = float(np.inner(a,b))/(np.linalg.norm(a)*np.linalg.norm(b)) | |
norCos = 0.5+0.5*cos | |
return norCos | |
# Cosine Similarity | |
def cosSim(a,b): | |
cos = float(np.inner(a,b))/(np.linalg.norm(a)*np.linalg.norm(b)) | |
norCos = 0.5+0.5*cos | |
return norCos | |
# Person Correlation Coefficient Similarity | |
# 不超過3 Similarity =1 | |
def pearsSim(a,b): | |
if len(a)<3: | |
return 1 | |
else: | |
corcoef = np.corrcoef(a,b,rowvar=0)[0][1] | |
norCorcoef = 0.5+0.5*corcoef | |
return norCorcoef | |
# 資料來源:http://en.wikipedia.org/wiki/Pearson_product-moment_correlation_coefficient | |
# Person Correlation Coefficient Similarity 和 Adjusted Cosine Similarity 比較 | |
# In pearson correlation,減去該物品ㄉ | |
# In adjusted cosine correlation 不同使用者可能對物品會有較高或是較低的評分表現 因此減去使用者平均的評分 e.x.(4,5)(1,2)視為行為相似 | |
# 資料來源:http://www.cs.carleton.edu/cs_comps/0607/recommend/recommender/itembased.html | |
# 概念藉由使用者都評分物品的分數做相似度運算,其結果*該未評分之物品 總和取平均 即為可能的分數 | |
def itemSimRec(itemScore, user, simMethods, item): | |
# user number : shape(itemScore)[0] item number : shape(itemScore)[1] | |
n = np.shape(itemScore)[1] | |
simTotal = 0.0; ratSimTotal = 0.0 | |
for i in range(n): | |
userItemRating = itemScore[user,i] | |
# pass norating item | |
if userItemRating == 0 or i==item: | |
pass | |
else: | |
# find same rating item | |
sameRatingItem = np.nonzero(np.logical_and(itemScore[:,item] > 0,itemScore[:,i] > 0))[0] | |
if len(sameRatingItem) == 0: | |
similarity = 0 | |
else: | |
similarity = simMethods(itemScore[sameRatingItem,item],itemScore[sameRatingItem,i]) | |
# print(itemScore[sameRatingItem,item]) | |
# print(itemScore[sameRatingItem,i]) | |
# print 'the %d and %d similarity is: %f' % (item, i, similarity) | |
simTotal += similarity | |
ratSimTotal += similarity * userItemRating | |
if simTotal == 0: | |
return 0 | |
else: | |
return ratSimTotal/simTotal | |
def recommend(itemScore, user, N=3, simMethods=cosSim, estMethod=itemSimRec): | |
#find unrated items | |
unratedItems = np.nonzero(itemScore[user,:] ==0)[0] | |
if len(unratedItems) == 0: | |
return 'you rated everything' | |
else: | |
itemScores = [] | |
for item in unratedItems: | |
estimatedScore = estMethod(itemScore, user, simMethods, item) | |
itemScores.append((item, estimatedScore)) | |
recItem = sorted(itemScores, key=lambda x: x[1], reverse=True)[:N] | |
return recItem | |
# SVD分解再還原 只有有多出SVD的部分,其餘和itemSimRec算法相同 | |
def svdItemSimRec(itemScore, user, simMethods, item): | |
n = np.shape(itemScore)[1] | |
simTotal = 0.0; ratSimTotal = 0.0 | |
U,Sigma,VT = np.linalg.svd(itemScore) | |
Sig4 = np.mat(np.eye(3)*Sigma[:3]) #arrange Sig4 into a diagonal matrix | |
xformedItems = U[:,:3] * Sig4 * VT[:3,:] #create transformed items | |
for i in range(n): | |
userItemRating = itemScore[user,i] | |
# pass norating item | |
if userItemRating == 0 or i==item: | |
pass | |
else: | |
# find same rating item | |
sameRatingItem = np.nonzero(np.logical_and(itemScore[:,item] > 0,itemScore[:,i] > 0))[0] | |
if len(sameRatingItem) == 0: | |
similarity = 0 | |
else: | |
similarity = simMethods(itemScore[sameRatingItem,item],itemScore[sameRatingItem,i]) | |
# print(itemScore[sameRatingItem,item]) | |
# print(itemScore[sameRatingItem,i]) | |
# print 'the %d and %d similarity is: %f' % (item, i, similarity) | |
simTotal += similarity | |
ratSimTotal += similarity * userItemRating | |
if simTotal == 0: | |
return 0 | |
else: | |
return ratSimTotal/simTotal | |
# [(4, 5.0), (9, 5.0), (10, 4.7297297297297298)] | |
# 物品4 分數5 物品9 分數5 物品10 分數4.7 依序推薦 | |
print("estMethod=itemSimRec") | |
print("--------------------") | |
print("simMethods=eulidSim") | |
print(recommend(loadItemScore2(),4,simMethods=eulidSim,estMethod=itemSimRec)) | |
print("simMethods=cosSim") | |
print(recommend(loadItemScore2(),4,simMethods=cosSim,estMethod=itemSimRec)) | |
print("simMethods=adjCosSim") | |
print(recommend(loadItemScore2(),4,simMethods=adjCosSim,estMethod=itemSimRec)) | |
print("simMethods=pearsSim") | |
print(recommend(loadItemScore2(),4,simMethods=pearsSim,estMethod=itemSimRec)) | |
# 照理說經過SVD分解後,可大幅減少儲存量,但對於矩陣還原可能會失真,故會比itemSimRec中有更多誤差存在 | |
print("estMethod=svdItemSimRec") | |
print("-----------------------") | |
print("simMethods=eulidSim") | |
print(recommend(loadItemScore2(),4,simMethods=eulidSim,estMethod=svdItemSimRec)) | |
print("simMethods=cosSim") | |
print(recommend(loadItemScore2(),4,simMethods=cosSim,estMethod=svdItemSimRec)) | |
print("simMethods=adjCosSim") | |
print(recommend(loadItemScore2(),4,simMethods=adjCosSim,estMethod=svdItemSimRec)) | |
print("simMethods=pearsSim") | |
print(recommend(loadItemScore2(),4,simMethods=pearsSim,estMethod=svdItemSimRec)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment