Skip to content

Instantly share code, notes, and snippets.

@tmtysk
Last active August 29, 2015 14:19
Show Gist options
  • Save tmtysk/f83ab394bd4236178c25 to your computer and use it in GitHub Desktop.
Save tmtysk/f83ab394bd4236178c25 to your computer and use it in GitHub Desktop.
Jubatusでテキストに含まれる特徴語の傾向を学習し、入力テキストをカテゴライズする ref: http://qiita.com/tmtysk/items/4d177ba27c8d2484b206
{
"method": "NHERD",
"parameter": {
"regularization_weight": 0.001
},
"converter": {
"num_filter_types": {
},
"num_filter_rules": [
],
"string_filter_types": {
},
"string_filter_rules": [
],
"num_types": {
},
"num_rules": [
],
"string_types": {
"bigram": { "method": "ngram", "char_num": "2" },
"mecab": {
"method": "dynamic",
"path": "libmecab_splitter.so",
"function": "create"
}
},
"string_rules": [
{ "key": "*", "type": "mecab", "sample_weight": "bin", "global_weight": "idf" }
]
}
}
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys
from jubatus.classifier import client
from jubatus.common import Datum
# Jubatus configuration
# These three values are passed, in this order, to client.Classifier().
# Address of the Jubatus classifier server (loopback: the local machine).
host = "127.0.0.1"
# TCP port the client connects to.
# NOTE(review): presumably the port jubaclassifier listens on by default - confirm.
port = 9199
# Name of the Jubatus instance to talk to; left empty here.
instance_name = "" # required only when using distributed mode
def estimate_blog_category_for(text, top_n=3):
    """Classify *text* with the Jubatus classifier and print the best labels.

    Connects to the classifier configured by the module-level ``host``,
    ``port`` and ``instance_name``, submits *text* as the ``text`` string
    feature of a single datum, and prints the highest-scoring labels in
    descending score order. When the server returns no estimates at all,
    a hint to train more data is printed instead.

    Args:
        text: The text to categorize.
        top_n: Maximum number of labels to print (default 3, which matches
            the previous hard-coded limit).

    Returns:
        None. The result is written to standard output.
    """
    classifier = client.Classifier(host, port, instance_name)

    # Create datum for Jubatus
    datum = Datum({'text': text})

    # Send estimation query to Jubatus. A single datum is submitted, so only
    # the first entry of the reply is read.
    estimates = classifier.classify([datum])[0]

    if not estimates:
        # No estimation results; maybe we haven't trained enough
        print("No estimation results available.")
        print("Train more data or try using another text.")
        return

    # Sort results by score, best first
    ranked = sorted(estimates, key=lambda e: e.score, reverse=True)

    # Print the result; slicing replaces the manual counter-and-break loop
    print("Estimated Category for %s:" % text)
    for estimate in ranked[:top_n]:
        print(" " + estimate.label + " (" + str(estimate.score) + ")")
if __name__ == '__main__':
    # The script takes exactly one argument: the text to classify.
    # Anything else gets the usage line instead.
    if len(sys.argv) != 2:
        print("Usage: %s data" % sys.argv[0])
    else:
        estimate_blog_category_for(sys.argv[1])
$ sudo rpm -Uvh http://download.jubat.us/yum/rhel/6/stable/x86_64/jubatus-release-6-1.el6.x86_64.rpm
$ sudo yum install jubatus jubatus-client
$ git clone https://github.com/jubatus/jubatus-example.git
$ mysql -uuser -p -N db < blog.sql > blog.txt
$ jubaclassifier -f blog_category.json -t 0
$ cat blog.txt | ./train.py
$ ./classify.py "はじめまして。田中といいます。"
Estimated Category for はじめまして。田中といいます。:
自己紹介 (0.231856495142)
日記 (0.0823381990194)
お知らせ (0.0661180838943)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys
import json
import re
from jubatus.classifier import client
from jubatus.common import Datum
# Jubatus Configuration
# These three values are passed, in this order, to client.Classifier().
# Address of the Jubatus classifier server (loopback: the local machine).
host = "127.0.0.1"
# TCP port the client connects to.
# NOTE(review): presumably the port jubaclassifier listens on by default - confirm.
port = 9199
# Name of the Jubatus instance to talk to; left empty here.
instance_name = "" # required only when using distributed mode
def print_color(color, msg, end):
    """Write *msg* to stdout wrapped in an ANSI color escape sequence.

    Args:
        color: Numeric ANSI SGR code placed between ``ESC[`` and ``m``.
        msg: Value to print; converted with ``str()``.
        end: Terminator written after the color reset; converted with
            ``str()``.
    """
    pieces = (
        '\033[', str(color), 'm',  # switch the color on
        str(msg),
        '\033[0m',                 # reset attributes
        str(end),
    )
    sys.stdout.write(''.join(pieces))
def print_red(msg, end="\n"):
    """Print *msg* through print_color with color code 31, followed by *end*."""
    print_color(color=31, msg=msg, end=end)
def print_green(msg, end="\n"):
    """Print *msg* through print_color with color code 32, followed by *end*."""
    print_color(color=32, msg=msg, end=end)
def train():
    """Train the Jubatus classifier from tab-separated records on stdin.

    Each input line is expected to be ``<category>\\t<body>``. The body is
    sent to the classifier as the ``text`` string feature of a datum,
    labelled with the category, and the trained entry is echoed to stdout
    (category in green, then the body).

    Only the first tab separates category from body, so a body that itself
    contains tabs is kept intact rather than raising ``ValueError``. The
    line terminator is stripped, so it is neither trained as part of the
    text nor echoed as an extra blank line. Blank lines are ignored, and
    non-blank lines without a tab are reported in red and skipped instead of
    aborting the whole run.

    Returns:
        None.
    """
    classifier = client.Classifier(host, port, instance_name)

    for line in sys.stdin:
        # Drop only the line terminator; other whitespace may be meaningful.
        record = line.rstrip("\r\n")

        # partition() splits on the first tab only and never raises.
        category_name, separator, body = record.partition("\t")
        if not separator:
            if record.strip():
                print_red("Skipped malformed line (no tab separator): " + record)
            continue

        d = Datum({'text': body})
        classifier.train([(category_name, d)])

        # Print trained entry
        print_green(category_name, ' ')
        print(body)

    # Enable the following line to back up the trained model after training:
    # classifier.save("foo")
if __name__ == "__main__":
    try:
        train()
    except KeyboardInterrupt:
        # Ctrl-C during training: end with a short notice rather than a
        # traceback.
        print("Stopped.")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment