Created
September 11, 2012 21:25
-
-
Save meqif/3702194 to your computer and use it in GitHub Desktop.
Twitter account spam/ham classifier
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
""" | |
Simple Naive Bayes tweet classifier. | |
It analyses a number of tweets of a given user and determines if that user is | |
a spammer. | |
""" | |
from __future__ import division | |
import json | |
import re | |
import os | |
import pickle | |
import requests | |
import numpy | |
from sklearn.naive_bayes import GaussianNB | |
from sklearn.cross_validation import StratifiedKFold | |
from sklearn import cross_validation | |
def get_user_data(user, count=2):
    """Retrieve the latest tweets of a user, given a username.

    Args:
        user: Twitter screen name (without the leading '@').
        count: number of recent tweets to fetch (default 2).

    Returns:
        The decoded JSON payload from the timeline endpoint — a list of
        tweet dicts on success.

    Raises:
        requests.HTTPError: if the API responds with a non-2xx status.
    """
    url = 'https://api.twitter.com/1/statuses/user_timeline.json'
    # Let requests build and URL-encode the query string instead of
    # interpolating values into the URL by hand (a raw '%s' would break
    # on any screen name needing escaping).
    req = requests.get(url, params={
        'include_entities': 'true',
        'include_rts': 'true',
        'screen_name': user,
        'count': count,
    })
    # Fail loudly on API errors. The original used `assert`, which is
    # silently stripped under `python -O`, and accessed `req.json` as an
    # attribute — it has been a method in requests for a long time.
    req.raise_for_status()
    return req.json()
def analyse(tweet):
    """Convert a pre-extracted tweet dict into a numeric feature vector.

    Args:
        tweet: dict with keys 'text', 'entities', 'in_reply_to',
            'screen_name', 'followers_count' and 'friends_count'
            (the shape produced by main()'s extract helper).

    Returns:
        A list of 7 numbers: [reply-prefix-to-text length ratio,
        URL count, screen-name length, count of 'x' in the screen name,
        count of digits in the screen name, follower count, friend count].
    """
    text_length = len(tweet['text'])
    urls = tweet['entities']['urls']

    # Spam replies tend to consist mostly of "@victim <link>"; measure how
    # much of the tweet that prefix occupies. The ratio is zero when there
    # is no URL, no reply target (original used `!= None`), or — guarding a
    # latent ZeroDivisionError — an empty tweet text.
    ratio = 0
    if urls and tweet['in_reply_to'] is not None and text_length:
        reply_prefix = '@' + tweet['in_reply_to'] + ' ' + urls[0]['url']
        ratio = len(reply_prefix) / text_length

    screen_name = tweet['screen_name']
    return [
        ratio,
        len(urls),
        len(screen_name),
        screen_name.count('x'),
        len(re.findall(r'\d', screen_name)),
        tweet['followers_count'],
        tweet['friends_count'],
    ]
def main(user, count, verbose=False):
    """Classify *user* as spammer or not from their last *count* tweets.

    Loads a previously pickled model if one exists; otherwise trains a
    Gaussian Naive Bayes classifier from spam.json / ham.json, reports
    cross-validation results, and pickles the trained model.

    NOTE(review): this is Python 2 code (print statements, list-returning
    map(), the long-removed sklearn.cross_validation module). Indentation
    was lost in extraction and has been reconstructed here; the placement
    of the verbose training report is inferred from the fact that `spam`
    and `ham` only exist on the training path — confirm against the
    original gist.
    """
    def extract(x):
        """Extract relevant characteristics from a tweet."""
        # Flatten the raw Twitter API payload down to just the fields
        # that analyse() reads.
        return {
            u'screen_name': x['user']['screen_name'],
            u'text': x['text'],
            u'in_reply_to': x['in_reply_to_screen_name'],
            u'entities': x['entities'],
            u'followers_count': x['user']['followers_count'],
            u'friends_count': x['user']['friends_count']
        }
    # Fetch the user's recent tweets and turn each one into a feature
    # vector. Under Python 2, map() returns a list (len() is taken below).
    user_data = get_user_data(user, count)
    user_data = map(extract, user_data)
    user_data = map(analyse, user_data)
    model_file = 'curbstomp_model.pkl'
    if os.path.exists(model_file):
        # Reuse a previously trained model instead of retraining.
        with open(model_file) as f:
            model = pickle.load(f)
    else:
        # Train from the labelled corpora shipped alongside the script.
        with open('spam.json') as f:
            spam = map(extract, json.loads(f.read()))
        spam = map(analyse, spam)
        with open('ham.json') as f:
            ham = map(extract, json.loads(f.read()))
        ham = map(analyse, ham)
        training_data = numpy.array(spam + ham)
        # Class labels: 1 = spam, 0 = ham, aligned with the concatenation
        # order above.
        classes = numpy.array(len(spam) * [1] + len(ham) * [0])
        # Old sklearn API: StratifiedKFold(labels, n_folds) yields
        # (train, test) index pairs directly when iterated.
        skf = StratifiedKFold(classes, 2)
        gnb = GaussianNB()
        for train_index, test_index in skf:
            x_train, x_test = training_data[train_index], training_data[test_index]
            y_train, y_test = classes[train_index], classes[test_index]
            # gnb.fit() returns the estimator itself, so `model` ends up
            # being the fit from the LAST fold only.
            model = gnb.fit(x_train, y_train)
            pred = model.predict(x_test)
            print "Number of mislabeled points : %d" % (y_test != pred).sum()
            # print "Verdict: %s" % model.predict(user_data).sum()
        # Export model
        with open(model_file, 'w') as f:
            pickle.dump(model, f)
        if verbose:
            # Sanity check: predictions over the training corpora plus a
            # 2-fold cross-validation score.
            print "Verdict (spam): %s" % model.predict(spam)
            print "Verdict (ham): %s" % model.predict(ham)
            scores = cross_validation.cross_val_score(gnb, training_data, classes, cv=2)
            print scores
            print "Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() / 2)
    # Actually use model with input
    if verbose:
        print "Analysis result: %s" % model.predict(user_data)
    # Fraction of the user's tweets predicted spam (labels are 0/1, so the
    # sum is the spam count); true division via __future__ import.
    print "Verdict: {:.2%} probability of being a spammer".format(sum(model.predict(user_data)) / len(user_data))
if __name__ == '__main__':
    import argparse

    # Command-line interface: a mandatory username plus optional
    # verbosity and tweet-count switches.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        'username',
        help="Twitter user whose tweets will be analysed")
    arg_parser.add_argument(
        '-v', '--verbose',
        action='store_true',
        help="show each tweet's probability of being spam")
    arg_parser.add_argument(
        '-n',
        type=int,
        default=50,
        dest='count',
        help="how many tweets should be used for analysis")
    options = arg_parser.parse_args()
    main(options.username, options.count, options.verbose)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment