Created
May 9, 2011 03:38
-
-
Save seikichi/962015 to your computer and use it in GitHub Desktop.
サークルの講座用(2011/5/9)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
  サークルの講座用に書いた何か.
  `もう/もう 何/なに も/も 怖/コワ く/く な/な い/い 。/。`
  という形式のコーパスを読み込んで仮名漢字変換っぽい何かをします.
  未知語とか何も考えてない上にスムージングのパラメータも適当でこれはひどい.
  「EUCだから1文字2バイトだろ」とか決め打ちでこれはひどい.
  全体的に富豪気味でこれはひどい.
  using namespace std; もぐもぐ!
  gcc 4.4で動作確認.
  % echo "もうなにもこわくない" | nkf -e | ./a.out corpus/all | nkf -w
  => もう何も怖くない
*/
#include <iostream> | |
#include <vector> | |
#include <string> | |
#include <fstream> | |
#include <sstream> | |
#include <tr1/unordered_map> | |
#include <algorithm> | |
#include <queue> | |
#include <cmath> | |
using namespace std; | |
// Generic string -> number table; the building block for every model table.
typedef std::tr1::unordered_map<std::string, double> StringDoubleMap;
// word -> relative frequency of that word.
typedef StringDoubleMap Unigram;
// previous word -> (word -> relative frequency of word after previous word).
typedef std::tr1::unordered_map<std::string, StringDoubleMap> Bigram;
// reading -> (word -> share of that word's occurrences written with the reading).
typedef std::tr1::unordered_map<std::string, StringDoubleMap> YomiProb;
struct LM { | |
double lambda1, lambda2; | |
Unigram unigram; | |
Bigram bigram; | |
LM(double l1, double l2) : lambda1(l1), lambda2(l2) {} | |
double prob(const string& prev, const string& w) const; | |
}; | |
void get_yomi(istream& in, YomiProb* yomi); | |
void make_model(istream& in, Unigram* uni, Bigram* bg); | |
string henkan(const LM& model, const YomiProb& yomi, const string& hira); | |
int main(int argc, char **argv) { | |
if (argc != 2) { | |
cerr << "Usage: kakan corpus" << endl; | |
return -1; | |
} | |
LM model(0.3, 0.7); | |
YomiProb yomi; | |
char *filename = argv[1]; | |
{ | |
ifstream in(filename); | |
get_yomi(in, &yomi); | |
} { | |
ifstream in(filename); | |
make_model(in, &model.unigram, &model.bigram); | |
} | |
string raw; | |
while (cin >> raw) { | |
cout << "=> " << henkan(model, yomi, raw) << endl; | |
} | |
} | |
double LM::prob(const string& prev, const string& w) const { | |
double p1 = 0.0, p2 = 0.0; | |
Unigram::const_iterator ui = unigram.find(w); | |
if (ui != unigram.end()) { p1 = ui->second; } | |
Bigram::const_iterator bi = bigram.find(prev); | |
if (bi != bigram.end()) { | |
Unigram::const_iterator it = bi->second.find(w); | |
if (it != bi->second.end()) { p2 = it->second; } | |
} | |
return lambda1*p1 + lambda2*p2; | |
} | |
// Splits a `word/reading` token at its first '/'.
// Returns (text before the slash, text after the slash). When the token
// contains no '/', find() yields npos, npos + 1 wraps to 0, and both halves
// are the whole token.
std::pair<std::string, std::string> split(const std::string& s) {
  const std::string::size_type slash = s.find('/');
  const std::string head(s, 0, slash);
  const std::string tail(s, slash + 1);
  return std::make_pair(head, tail);
}
void get_yomi(istream& in, YomiProb* yomi) { | |
YomiProb tmp; | |
string line, wordyomi; | |
while (getline(in, line)) { | |
istringstream iss(line); | |
while (iss >> wordyomi) { | |
string w, y; | |
pair<string, string> p = split(wordyomi); | |
w = p.first; | |
y = p.second; | |
tmp[w][y] += 1.0; | |
} | |
} | |
for (YomiProb::iterator it = tmp.begin(); | |
it != tmp.end(); | |
++it) { | |
double sum = 0.0; | |
for (StringDoubleMap::iterator jt = it->second.begin(); | |
jt != it->second.end(); | |
++jt) { | |
sum += jt->second; | |
} | |
for (StringDoubleMap::iterator jt = it->second.begin(); | |
jt != it->second.end(); | |
++jt) { | |
(*yomi)[jt->first][it->first] = jt->second / sum; | |
} | |
} | |
(*yomi)["BT"]["BT"] = 1.0; | |
} | |
void make_model(istream& in, Unigram* uni, Bigram* bg) { | |
string line, wordyomi; | |
while (getline(in, line)) { | |
string prevw = "BT", w, y; | |
istringstream iss(line + " BT/BT"); | |
while (iss >> wordyomi) { | |
pair<string, string> p = split(wordyomi); | |
w = p.first; | |
y = p.second; | |
(*uni)[w] += 1.0; | |
(*bg)[prevw][w] += 1.0; | |
prevw = w; | |
} | |
} | |
double usum = 0.0; | |
for (Unigram::iterator it = uni->begin(); | |
it != uni->end(); | |
++it) { | |
usum += it->second; | |
} | |
for (Unigram::iterator it = uni->begin(); | |
it != uni->end(); | |
++it) { | |
it->second /= usum; | |
} | |
for (Bigram::iterator it = bg->begin(); | |
it != bg->end(); | |
++it) { | |
double sum = 0.0; | |
for (Unigram::iterator jt = it->second.begin(); | |
jt != it->second.end(); | |
++jt) { | |
sum += jt->second; | |
} | |
for (Unigram::iterator jt = it->second.begin(); | |
jt != it->second.end(); | |
++jt) { | |
jt->second /= sum; | |
} | |
} | |
} | |
// One partial conversion held in the search queue.
struct Node {
  unsigned int index;  // number of input bytes consumed so far
  double prob;         // accumulated score (sum of log2 probabilities)
  std::string prev;    // last word chosen; context for the bigram model
  std::string all;     // concatenation of every word chosen so far

  Node(int index_, double prob_,
       const std::string& prev_, const std::string& all_)
      : index(index_),
        prob(prob_),
        prev(prev_),
        all(all_) {}

  // Orders by score only, so std::priority_queue<Node> pops the
  // highest-scoring node first.
  bool operator<(const Node& rhs) const {
    return rhs.prob > prob;
  }
};
string henkan(const LM& model, | |
const YomiProb& yomi, | |
const string& hira) { | |
string ret = "!!ERROR: couldn't translate `" + hira + "` !!!"; | |
const string raw = hira + "BT"; | |
priority_queue<Node> Q; | |
Q.push(Node(0, 0.0, "BT", "")); | |
tr1::unordered_map<int, tr1::unordered_map<string, double> > memo; | |
while (!Q.empty()) { | |
Node n = Q.top(); Q.pop(); | |
memo[n.index][n.prev] = n.prob; | |
if (n.index > 0 && n.prev == "BT") { | |
ret = n.all.substr(0, n.all.length()-2); | |
break; | |
} | |
for (int i=2; n.index+i<=raw.length(); i+=2) { | |
string rword = raw.substr(n.index, i); | |
YomiProb::const_iterator f = yomi.find(rword); | |
if (f != yomi.end()) { | |
StringDoubleMap::const_iterator it;; | |
for (it = f->second.begin(); | |
it != f->second.end(); | |
++it) { | |
int index = n.index + i; | |
string kanji = it->first; | |
double kanji_prob = it->second; | |
double model_prob = model.prob(n.prev, kanji); | |
double prob = log2(kanji_prob) | |
+ log2(model_prob) | |
+ n.prob; | |
if ((memo[index].find(kanji) != memo[index].end()) | |
&& (prob < memo[index][kanji])) { | |
continue; | |
} | |
Q.push(Node(index, prob, kanji, n.all+kanji)); | |
} | |
} | |
} | |
} | |
return ret; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment