Last active
August 29, 2015 14:19
-
-
Save tmtysk/f83ab394bd4236178c25 to your computer and use it in GitHub Desktop.
Jubatusでテキストに含まれる特徴語の傾向を学習し、入力テキストをカテゴライズする ref: http://qiita.com/tmtysk/items/4d177ba27c8d2484b206
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"method": "NHERD", | |
"parameter": { | |
"regularization_weight": 0.001 | |
}, | |
"converter": { | |
"num_filter_types": { | |
}, | |
"num_filter_rules": [ | |
], | |
"string_filter_types": { | |
}, | |
"string_filter_rules": [ | |
], | |
"num_types": { | |
}, | |
"num_rules": [ | |
], | |
"string_types": { | |
"bigram": { "method": "ngram", "char_num": "2" }, | |
"mecab": { | |
"method": "dynamic", | |
"path": "libmecab_splitter.so", | |
"function": "create" | |
} | |
}, | |
"string_rules": [ | |
{ "key": "*", "type": "mecab", "sample_weight": "bin", "global_weight": "idf" } | |
] | |
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
import sys | |
from jubatus.classifier import client | |
from jubatus.common import Datum | |
# Connection settings for the Jubatus classifier server.
port = 9199
host = "127.0.0.1"
# Cluster name; only needs a value when Jubatus runs in distributed mode.
instance_name = ""
def estimate_blog_category_for(text, limit=3):
    """Classify *text* with the Jubatus server and print the best labels.

    Args:
        text: The text to categorize; sent as the ``text`` key of a Datum.
        limit: Maximum number of labels to print, highest score first.
            Defaults to 3.

    The result is printed to stdout; nothing is returned.
    """
    classifier = client.Classifier(host, port, instance_name)

    # One datum is sent, so only the first entry of the reply is relevant.
    results = classifier.classify([Datum({'text': text})])
    estimates = results[0] if results else []

    if not estimates:
        # No estimation results; maybe we haven't trained enough
        print("No estimation results available.")
        print("Train more data or try using another text.")
        return

    # Highest score first, then keep only the leading `limit` entries.
    ranked = sorted(estimates, key=lambda e: e.score, reverse=True)

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("Estimated Category for %s:" % text)
    for estimate in ranked[:limit]:
        print(" " + estimate.label + " (" + str(estimate.score) + ")")
if __name__ == '__main__':
    # Expect exactly one argument: the text to classify.
    if len(sys.argv) == 2:
        estimate_blog_category_for(sys.argv[1])
    else:
        # Report misuse on stderr with a non-zero status so that shell
        # callers can detect the failure.
        sys.stderr.write("Usage: %s data\n" % sys.argv[0])
        sys.exit(1)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
$ sudo rpm -Uvh http://download.jubat.us/yum/rhel/6/stable/x86_64/jubatus-release-6-1.el6.x86_64.rpm | |
$ sudo yum install jubatus jubatus-client |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
$ git clone https://github.com/jubatus/jubatus-example.git |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
$ mysql -uuser -p -N db < blog.sql > blog.txt |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
$ jubaclassifier -f blog_category.json -t 0 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
$ cat blog.txt | ./train.py |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
$ ./classify.py "はじめまして。田中といいます。" | |
Estimated Category for はじめまして。田中といいます。: | |
自己紹介 (0.231856495142) | |
日記 (0.0823381990194) | |
お知らせ (0.0661180838943) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
import sys | |
import json | |
import re | |
from jubatus.classifier import client | |
from jubatus.common import Datum | |
# Connection settings for the Jubatus classifier server.
port = 9199
host = "127.0.0.1"
# Cluster name; only needs a value when Jubatus runs in distributed mode.
instance_name = ""
def print_color(color, msg, end="\n"):
    """Write *msg* to stdout wrapped in an ANSI color escape sequence.

    Args:
        color: Numeric code placed after ``ESC[`` (31 and 32 are the codes
            used by print_red and print_green).
        msg: Value to print; converted with ``str()``.
        end: Text written after the color reset. Defaults to a newline,
            matching the wrappers' default.
    """
    start = '\033[' + str(color) + 'm'
    reset = '\033[0m'
    sys.stdout.write(start + str(msg) + reset + str(end))
def print_red(msg, end="\n"):
    """Print *msg* in red (ANSI color code 31), followed by *end*."""
    print_color(color=31, msg=msg, end=end)
def print_green(msg, end="\n"):
    """Print *msg* in green (ANSI color code 32), followed by *end*."""
    print_color(color=32, msg=msg, end=end)
def train():
    """Train the Jubatus classifier from tab-separated lines on stdin.

    Each input line is expected to hold a category name, a TAB character,
    and the text body. Every well-formed line is sent to the server as one
    training example and echoed to stdout (category in green, then the
    body). Lines without a TAB are reported in red and skipped.
    """
    classifier = client.Classifier(host, port, instance_name)
    for line in sys.stdin:
        # Drop the line terminator so it is neither trained as part of the
        # text nor printed a second time by print() below.
        record = line.rstrip("\r\n")

        # Split on the first TAB only, so a body containing TABs no longer
        # breaks the two-value unpacking.
        category_name, separator, body = record.partition("\t")
        if not separator:
            # Blank or malformed line: there is no label/body pair to train.
            print_red("Skipped malformed line: " + record)
            continue

        classifier.train([(category_name, Datum({'text': body}))])

        # Print trained entry
        print_green(category_name, ' ')
        print(body)
    # To back up the trained model after training, enable the following:
    # classifier.save("foo")
if __name__ == '__main__':
    try:
        train()
    except KeyboardInterrupt:
        # Ctrl-C aborts the run with a short message instead of a traceback.
        # Single-argument print(...) is valid on both Python 2 and Python 3.
        print("Stopped.")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment