Created
January 25, 2015 07:50
-
-
Save wy36101299/185d95442a1f83d4ee29 to your computer and use it in GitHub Desktop.
Bayes-classify 貝氏分類器
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import math | |
import numpy as np | |
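# Training data: frequency of each keyword in each news item.
# Keyword class            P              P    S           S             T             T
# News item         Class  Mercedes-Benz  BMW  Basketball  Road running  Mobile phone  App
# ----------------------------------------------------------------------------------------
# C63 launch event  P      15             25   0           5             8             3
# BMW i8            P      35             40   1           3             3             2
# Jeremy Lin        S      5              0    35          50            0             0
# Lakers            S      1              5    32          15            0             0
# Android 5.0       T      10             5    7           0             2             30
# iPhone6           T      5              5    5           15            8             32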
# Raw training rows: the class label followed by one frequency per keyword.
lines = [
    ['p', 15, 25, 0, 5, 8, 3],
    ['p', 35, 40, 1, 3, 3, 2],
    ['s', 5, 0, 35, 50, 0, 0],
    ['s', 1, 5, 32, 15, 0, 0],
    ['t', 10, 5, 7, 0, 2, 30],
    ['t', 5, 5, 5, 15, 8, 32],
]

# label[i] is the class label of training row i.
label = [line[0] for line in lines]

# Build the float matrix directly from the numeric part of every row.
# (Appending ``map(float, ...)`` results only works on Python 2; on Python 3
# ``map`` is lazy and numpy would wrap the iterator objects themselves.)
dataSet = np.array([line[1:] for line in lines], dtype=float)

# Number of keyword columns (features) per training row.
dimension = dataSet.shape[1]
def Bayestrain(classifyRange, data=None):
    """Estimate class priors and Laplace-smoothed keyword likelihoods.

    Args:
        classifyRange: Mapping of class name -> ``[start, stop]`` half-open
            index pair.  Only ``value[0]`` and ``value[1]`` are read.
        data: Optional 2-D array-like of keyword frequencies (one row per
            training item, one column per keyword).  When omitted, the
            module-level ``dataSet`` is used.

    Returns:
        A tuple ``(classifyProbabilityList, featureProbabilityList)``, both
        ordered like iteration over ``classifyRange``:

        * ``classifyProbabilityList[i]`` is the prior of class ``i``.
        * ``featureProbabilityList[i][a]`` is the smoothed likelihood of
          keyword ``a`` given class ``i``.
    """
    if data is None:
        data = dataSet
    data = np.asarray(data, dtype=float)

    keywordCount = data.shape[1]
    totalFrequency = data.sum()

    classifyProbabilityList = []
    featureProbabilityList = []
    # ``.values()`` exists on both Python 2 and 3 (``iteritems`` is Python 2
    # only) and iterates in the same order as the keys.
    for bounds in classifyRange.values():
        start, stop = bounds[0], bounds[1]

        # P(class) = frequency total of the columns start:stop
        #            / frequency total of the whole training set.
        # NOTE(review): the prior slices COLUMNS while the likelihood below
        # slices ROWS with the same range.  The two only line up when each
        # class owns as many training rows as keyword columns, at the same
        # offsets -- confirm whether the prior should use rows as well.
        prior = data[:, start:stop].sum() / totalFrequency
        classifyProbabilityList.append(prior)

        # P(keyword | class) = (keyword frequency in the class rows + 1)
        #                      / (total frequency in the class rows
        #                         + number of keywords).
        # The +1 / +keywordCount smoothing keeps every likelihood non-zero.
        classRows = data[start:stop]
        likelihoods = (classRows.sum(axis=0) + 1) / (classRows.sum() + keywordCount)
        featureProbabilityList.append(list(likelihoods))

    return classifyProbabilityList, featureProbabilityList
def Bayespredict(classifyProbabilityList, featureProbabilityList, classifyRange, predictList):
    """Return the class name with the highest log-posterior score.

    Args:
        classifyProbabilityList: Prior probability of each class, ordered
            like iteration over ``classifyRange``.
        featureProbabilityList: For each class (same order), the likelihood
            of every keyword given that class.
        classifyRange: Mapping whose keys are the class names; only the keys
            and their iteration order are used here.
        predictList: Keyword frequencies of the item to classify, aligned
            with the keyword order of ``featureProbabilityList``.

    Returns:
        The key of ``classifyRange`` with the largest score.  On a tie the
        first class in iteration order wins.

    Raises:
        ValueError: If a prior or a used likelihood is not positive (from
            ``math.log10``), or if there are no classes to compare.
    """
    # Iterating the mapping yields its keys on Python 2 and 3 alike
    # (``iteritems`` is Python 2 only).
    classNames = list(classifyRange)

    scores = []
    for prior, likelihoods in zip(classifyProbabilityList, featureProbabilityList):
        # Sum logarithms instead of multiplying probabilities so that many
        # small factors do not underflow to zero.
        score = math.log10(prior)
        for frequency, likelihood in zip(predictList, likelihoods):
            score += frequency * math.log10(likelihood)
        scores.append(score)

    return classNames[scores.index(max(scores))]
# 2 and 4 are the thresholds that split the indices into classes:
# class p -> [:2], class s -> [2:4], class t -> [4:dimension].
classifyRange = {
    'p': [0, 2],
    's': [2, 4],
    't': [4, 6],
}

# Train once; both returned lists are the results of training.
classifyProbabilityList, featureProbabilityList = Bayestrain(classifyRange)

# Keyword frequencies of the item to classify.
predictList = [10, 2, 50, 56, 8, 5]

# Inputs: the two trained lists, the class thresholds, and the features to
# predict.
predict = Bayespredict(
    classifyProbabilityList,
    featureProbabilityList,
    classifyRange,
    predictList,
)

# Bare expression: displays the predicted class in an interactive session.
predict
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment