# Collect tweets
# https://github.com/sixohsix/twitter
# The API tokens are kept in environment variables
import os
from twitter import *
import time

# Morphological analysis with Janome
# https://github.com/mocobeta/janome
from janome.tokenizer import Tokenizer
from janome.analyzer import Analyzer
from janome.charfilter import *
from janome.tokenfilter import *
# Symbols, particles, and auxiliary verbs are dropped
# Verbs are converted to their base form

# Decision tree and visualization with scikit-learn
from sklearn import tree
from io import StringIO
import pydotplus
from IPython.display import Image
import numpy as np
# Twitter credentials
TOKEN = os.environ["DTA_TWITTER_TOKEN"]
TOKEN_SECRET = os.environ["DTA_TWITTER_TOKEN_SECRET"]
CONSUMER_KEY = os.environ["DTA_TWITTER_CONSUMER_KEY"]
CONSUMER_SECRET = os.environ["DTA_TWITTER_CONSUMER_SECRET"]
t = Twitter(
    auth=OAuth(TOKEN, TOKEN_SECRET, CONSUMER_KEY, CONSUMER_SECRET))

# Settings
NUM_TWEET = 200  # tweets collected per user
NUM_USER = 50    # users fetched from each Twitter list
def get_word_count(analyzer, tweets, word_list):
    """
    Count word occurrences across a list of tweets.
    Known words increment their slot in word_count; unseen words are
    appended to word_list and start counting from 1.
    """
    word_count = [0] * len(word_list)
    for text in tweets:
        #print("###BASE###" + text)
        for token in analyzer.analyze(text):
            #print(token)
            if '動詞' in token.part_of_speech.split(','):
                # verbs: use the base (dictionary) form
                word = token.base_form
            elif '形容詞' in token.part_of_speech.split(','):
                # adjectives: use the base form as well
                word = token.base_form
            else:
                word = token.surface
            if word in word_list:
                word_index = word_list.index(word)
                word_count[word_index] += 1
            else:
                word_list.append(word)
                word_count.append(1)
    #print(word_list)
    #print(word_count)
    return word_count
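# Illustrative sketch (not part of the original gist): with a throwaway
# Analyzer and an empty word list, a single tweet both fills the count vector
# and extends the shared word list in place. The sample sentence and the
# demo_* names are assumptions for illustration only.
#demo_words = []
#demo_counts = get_word_count(Analyzer([], Tokenizer(), []), [u'今日はPythonで決定木を書いた'], demo_words)
#print(demo_words)   # tokens discovered in the sample tweet
#print(demo_counts)  # one count per entry in demo_words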
# Fetch the user lists
engineer = [user['screen_name'] for user in t.lists.members(owner_screen_name="EnsekiTT", slug="engineer", count=NUM_USER)['users']]
not_engineer = [user['screen_name'] for user in t.lists.members(owner_screen_name="EnsekiTT", slug="notengineer", count=NUM_USER)['users']]
users = list(engineer)
users.extend(not_engineer)
print(users)

word_list = []
user_vectors = {}
user_vectors_raw = {}
last_id = 0
for user in users:
    tweets = t.statuses.user_timeline(screen_name=user, count=200, include_rts=False, exclude_replies=True)
    #print("user :" + user)
    while len(tweets) < NUM_TWEET:
        max_id = tweets[-1]['id']
        if max_id == last_id:
            # No older tweets came back, so give up on this user
            print("now :" + str(len(tweets)) + ', ' + str(tweets[-1]['id']) + ', ' + tweets[-1]['text'])
            print("Break!!!" + user)
            break
        last_id = max_id
        # max_id is inclusive, so ask for ids strictly older than the last tweet seen
        tweets.extend(t.statuses.user_timeline(screen_name=user, count=200, include_rts=False, exclude_replies=True, max_id=max_id - 1))
        time.sleep(1)  # statuses.user_timeline allows 900 calls per 15 minutes, so the wait is a little longer than strictly necessary
    user_vectors_raw[user] = [tweet['text'] for tweet in tweets[:NUM_TWEET]]
# Save the raw tweets so the (rate-limited) collection step does not have to be repeated
from datetime import datetime
import json
ts = datetime.now().strftime("%Y%m%d_%H%M%S")
path = ts + '_tweets.json'
with open(path, 'w') as f:
    json.dump(user_vectors_raw, f)
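# The dump above can be reloaded in a later session (illustrative sketch,
# assuming the same file layout written by json.dump):
#with open(path) as f:
#    user_vectors_raw = json.load(f)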
print(len(user_vectors_raw[user]))  # quick check on the last user processed
# Janome setup
char_filters = [UnicodeNormalizeCharFilter()
                , RegexReplaceCharFilter(u'[ー()()*/\n:゚∀.&;|%д@_○!,?・]', u'')
                , RegexReplaceCharFilter(u"http[:\/A-Za-z0-9\n]*", u"")]
tokenizer = Tokenizer()
token_filters = [CompoundNounFilter(), POSStopFilter(['動詞', '記号', '助詞', '助動詞', '接頭詞', '数', 'フィラー']), LowerCaseFilter()]
#token_filters = [POSKeepFilter('名詞'), LowerCaseFilter()]
analyzer = Analyzer(char_filters, tokenizer, token_filters)
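# Quick sanity check of the pipeline (the sample sentence is illustrative,
# not data from the gist): the filters above should drop verbs, symbols,
# particles, auxiliary verbs, prefixes, numbers, and fillers, and lowercase
# whatever remains.
for demo_token in analyzer.analyze(u'今日はPythonで決定木を書いてみた'):
    print(demo_token.surface, demo_token.part_of_speech)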
# Build a word-count vector for every user; word_list is shared and grows as new words appear
for user in users:
    user_vectors[user] = get_word_count(analyzer, user_vectors_raw[user], word_list)
    #break

# Pad every vector with zeros so they all match the length of the final word_list
max_len = max([len(user_vectors[key]) for key in user_vectors.keys()])
for key in user_vectors.keys():
    user_len = len(user_vectors[key])
    user_vectors[key].extend([0] * (max_len - user_len))
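# After padding, every user's vector lines up with word_list, which is what
# scikit-learn expects for a fixed-width feature matrix.
assert all(len(v) == max_len for v in user_vectors.values())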
user_list = []
vectors = []
labels = []
print(engineer)
print(len(engineer))
print(not_engineer)
print(len(not_engineer))
# Label not-engineers as 0 and engineers as 1
for key in user_vectors.keys():
    user_list.append(key)
    if key in engineer:
        labels.append(1)
    elif key in not_engineer:
        labels.append(0)
    vectors.append(user_vectors[key])
print(labels)
print(len(vectors))
# Fit a shallow decision tree and check accuracy on the training data
clf = tree.DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=3, min_samples_leaf=2)
clf = clf.fit(vectors, labels)
predicted = clf.predict(vectors)
print(predicted)
print(sum(predicted == labels) / len(labels))
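# The score above is measured on the same data the tree was trained on, so it
# is optimistic. A held-out split gives a fairer estimate; this is a sketch
# using a separate classifier so the tree visualized below is unchanged.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(vectors, labels, test_size=0.3, random_state=0)
clf_holdout = tree.DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=3, min_samples_leaf=2)
clf_holdout = clf_holdout.fit(X_train, y_train)
print(clf_holdout.score(X_test, y_test))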
# Render the fitted tree with Graphviz
dot_data = StringIO()
tree.export_graphviz(clf, out_file=dot_data, feature_names=word_list,
                     class_names=['not engineer', 'engineer'],
                     filled=True, rounded=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())
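# Outside a Jupyter notebook, the same graph can be written to a file instead
# (the filename is just an example):
#graph.write_png('engineer_tree.png')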
# Non-engineers are 0 and engineers are 1, so if a user tweeted the word, its count should be greater than 0
oppai = 'おっぱい'
if oppai in word_list:
    oppai_index = word_list.index(oppai)
    oppai_predicted = np.array([int(vector[oppai_index] > 0) for vector in vectors])
    print(sum(oppai_predicted))
    print(sum(oppai_predicted == labels) / len(labels))
# 10-fold cross-validation for a less optimistic accuracy estimate
from sklearn.model_selection import cross_val_score
scores = cross_val_score(clf, vectors, labels, cv=10)
print(scores.mean(), scores)