-
-
Save tsoporan/1268224 to your computer and use it in GitHub Desktop.
Manually train an NLTK NaiveBayes Classifier
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from nltk.probability import DictionaryProbDist, ELEProbDist, FreqDist | |
from nltk import NaiveBayesClassifier | |
from nltk import FreqDist, ConditionalFreqDist | |
from nltk import BigramAssocMeasures | |
from collections import defaultdict | |
# Hand-labelled training corpus: sentence -> sentiment label ('pos'/'neg').
# Deliberately balanced: each negative sentence has a positive counterpart.
train_samples = {
    "I hate you and you are a bad person": "neg",
    "I love you and you are a good person": "pos",
    "I fail at everything and I want to kill people": "neg",
    "I win at everything and I want to love people": "pos",
    "sad are things are heppening. fml": "neg",
    "good are things are heppening. gbu": "pos",
    "I am so poor": "neg",
    "I am so rich": "pos",
    "I hate you mommy ! You are my terrible person": "neg",
    "I love you mommy ! You are my amazing person": "pos",
    "I want to kill butterflies since they make me sad": "neg",
    "I want to chase butterflies since they make me happy": "pos",
    "I want to hurt bunnies": "neg",
    "I want to hug bunnies": "pos",
    "You make me frown": "neg",
    "You make me smile": "pos",
}
# Unlabelled sentences used to eyeball the trained classifier's output.
test_samples = [
    "You are a terrible person and everything you do is bad",
    "I love you all and you make me happy",
    "I frown whenever I see you in a poor state of mind",
    "Finally getting rich from my ideas. They make me smile.",
    "My mommy is poor",
    "I love butterflies. Yay for happy",
    "Everything is fail today and I hate stuff",
]
def gen_bow(text):
    """Return a bag-of-words feature dict for *text*.

    Whitespace-tokenizes the sentence and maps each lowercased token to
    True, the boolean-presence feature encoding NaiveBayesClassifier expects.
    Duplicate words collapse into a single key.
    """
    return {token.lower(): True for token in text.split()}
def get_label_freqdist(samples):
    """Count per-label word frequencies over a labelled corpus.

    Parameters
    ----------
    samples : dict
        Mapping of sentence text -> label ('pos'/'neg'), e.g. train_samples.

    Returns
    -------
    ConditionalFreqDist
        cfd[label][word] = number of times the lowercased word appears in
        sentences carrying that label.
    """
    label_freqdist = ConditionalFreqDist()
    # Bug fix: iterate the *samples* argument instead of the global
    # train_samples, so the function actually honours its parameter.
    for words, label in samples.items():
        for word in words.split():
            # NLTK 3 removed FreqDist.inc(); increment via item assignment.
            label_freqdist[label][word.lower()] += 1
    return label_freqdist
def get_feature_probdist(label_freqdist):
    """Build the per-(label, feature) probability distributions.

    Mirrors what NaiveBayesClassifier.train() computes internally: every
    word seen under a label becomes a boolean-presence feature whose
    frequency is smoothed with ELE.

    Parameters
    ----------
    label_freqdist : ConditionalFreqDist
        Output of get_label_freqdist().

    Returns
    -------
    dict
        (label, fname) -> ELEProbDist, suitable as the feature_probdist
        argument of NaiveBayesClassifier.

    NOTE(review): reads the module-level train_samples to size the counts,
    assuming a perfectly balanced corpus (half pos, half neg).
    """
    feature_freqdist = defaultdict(FreqDist)
    feature_values = defaultdict(set)
    # Integer division: counts must stay ints on Python 3 ('/' yields float).
    num_samples = len(train_samples) // 2
    # Public API (conditions()) instead of the private _fdists attribute.
    for label in label_freqdist.conditions():
        for fname in label_freqdist[label]:
            # NLTK 3 removed FreqDist.inc(count); add via item assignment.
            feature_freqdist[label, fname][None] += num_samples
            feature_values[fname].add(None)
    feature_probdist = {}
    for (label, fname), freqdist in feature_freqdist.items():
        probdist = ELEProbDist(freqdist, bins=len(feature_values[fname]))
        feature_probdist[label, fname] = probdist
    return feature_probdist
def get_label_probdist(label_freqdist):
    """Build the prior probability distribution over labels.

    Reproduces the original (odd but intentional) weighting: each label's
    prior count equals its number of *distinct* words, not its number of
    training sentences.

    Parameters
    ----------
    label_freqdist : ConditionalFreqDist
        Output of get_label_freqdist().

    Returns
    -------
    ELEProbDist
        Smoothed prior over the 'pos' and 'neg' labels.
    """
    label_fd = FreqDist()
    for label in ['pos', 'neg']:
        # len(FreqDist) == number of distinct word types under this label;
        # equivalent to the original one-increment-per-item loop, using
        # item assignment because NLTK 3 removed FreqDist.inc().
        label_fd[label] += len(label_freqdist[label])
    label_probdist = ELEProbDist(label_fd)
    return label_probdist
# Assemble the classifier by hand from the manually built distributions,
# then classify each test sentence and dump the most useful features.
label_freqdist = get_label_freqdist(train_samples)
label_probdist = get_label_probdist(label_freqdist)
feature_probdist = get_feature_probdist(label_freqdist)
classifier = NaiveBayesClassifier(label_probdist, feature_probdist)

for sample in test_samples:
    # Python 3: print is a function, not a statement.
    print("%s | %s" % (sample, classifier.classify(gen_bow(sample))))

classifier.show_most_informative_features()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment