seikichi · May 9, 2011 03:38
diff --git a/kakan.cpp b/kakan.cpp
 /*
  サークルの講座用に書いた何か．
  `もう/もう 何/なに も/も 怖/コワ く/く な/な い/い 。/。`
  という形式のコーパスを読み込んで仮名漢字変換っぽい何かをします．

  未知語とか何も考えてない上にスムージングのパラメータも適当でこれはひどい．
  「EUCだから1文字2バイトだろ」とか決め打ちでこれはひどい．
  全体的に富豪気味でこれはひどい．
   using namespace std; もぐもぐ!

  gcc 4.4で動作確認．

  % echo "もうなにもこわくない" | nkf -e | ./a.out corpus/all | nkf -w
  => もう何も怖くない
 */

 #include <iostream>
 #include <vector>
 #include <string>
 #include <fstream>
 #include <sstream>
 #include <tr1/unordered_map>
 #include <algorithm>
 #include <queue>

 #include <cmath>

 using namespace std;

 // Generic string -> number table; the inner level of the nested maps below.
 typedef tr1::unordered_map<string, double> StringDoubleMap;
 // unigram[w]: occurrence count of word w, turned into a relative frequency
 // by make_model().
 typedef tr1::unordered_map<string, double> Unigram;
 // bigram[prev][w]: count of w directly after prev, normalized per prev by
 // make_model().
 typedef tr1::unordered_map<string, StringDoubleMap > Bigram;
 // yomi[reading][word]: share of word's occurrences that carried this reading,
 // as filled in by get_yomi().
 typedef tr1::unordered_map<string, StringDoubleMap > YomiProb;

 // Linearly interpolated unigram/bigram language model over surface words.
 //   lambda1 - weight of the unigram estimate
 //   lambda2 - weight of the bigram estimate
 // The two tables start empty; main() hands them to make_model() to be filled.
 struct LM {
     double lambda1;
     double lambda2;
     Unigram unigram;
     Bigram bigram;

     LM(double l1, double l2)
         : lambda1(l1)
         , lambda2(l2) {}

     // Blended probability of `w` given the preceding word `prev`.
     double prob(const string& prev, const string& w) const;
 };

 // Builds the reading -> word table (*yomi) from a "word/reading" corpus stream.
 void get_yomi(istream& in, YomiProb* yomi);
 // Counts and normalizes unigram (*uni) and bigram (*bg) word statistics
 // from a "word/reading" corpus stream.
 void make_model(istream& in, Unigram* uni, Bigram* bg);
 // Converts the kana string `hira` into the best-scoring word sequence under
 // `model` and `yomi`; returns an "!!ERROR..." string when no conversion exists.
 string henkan(const LM& model, const YomiProb& yomi, const string& hira);

 // Usage: kakan corpus
 //
 // Reads the corpus file twice (once for the reading table, once for the
 // language model), then converts every whitespace-separated token read from
 // stdin and prints it as "=> <result>".
 //
 // Returns -1 on a usage error or when the corpus file cannot be opened;
 // 0 otherwise.
 int main(int argc, char **argv) {
     if (argc != 2) {
         cerr << "Usage: kakan corpus" << endl;
         return -1;
     }
     const char *filename = argv[1];
     LM model(0.3, 0.7);
     YomiProb yomi;
     {
         ifstream in(filename);
         // Without this check an unreadable corpus silently yields an empty
         // model and every input is answered with the conversion-error string.
         if (!in) {
             cerr << "kakan: cannot open corpus `" << filename << "`" << endl;
             return -1;
         }
         get_yomi(in, &yomi);
     }
     {
         ifstream in(filename);
         if (!in) {
             cerr << "kakan: cannot open corpus `" << filename << "`" << endl;
             return -1;
         }
         make_model(in, &model.unigram, &model.bigram);
     }
     string raw;
     while (cin >> raw) {
         cout << "=> " << henkan(model, yomi, raw) << endl;
     }
     return 0;
 }

 double LM::prob(const string& prev, const string& w) const {
    double p1 = 0.0, p2 = 0.0;
    Unigram::const_iterator ui = unigram.find(w);
    if (ui != unigram.end()) { p1 = ui->second; }
    Bigram::const_iterator bi = bigram.find(prev);
    if (bi != bigram.end()) {
        Unigram::const_iterator it = bi->second.find(w);
        if (it != bi->second.end()) { p2 = it->second; }
    }
    return lambda1*p1 + lambda2*p2;
 }

 // Splits a corpus token "word/reading" at its FIRST '/' and returns
 // (word, reading).
 // A token without any '/' comes back as (token, token): find() yields npos,
 // substr(0, npos) is the whole token, and npos + 1 wraps around to 0 so the
 // second half is the whole token as well.
 std::pair<std::string, std::string> split(const std::string& s) {
     const std::string::size_type slash = s.find('/');
     const std::string word = s.substr(0, slash);
     const std::string reading = s.substr(slash + 1);
     return std::make_pair(word, reading);
 }

 void get_yomi(istream& in, YomiProb* yomi) {
    YomiProb tmp;
    string line, wordyomi;
    while (getline(in, line)) {
        istringstream iss(line);
        while (iss >> wordyomi) {
            string w, y;
            pair<string, string> p = split(wordyomi);
            w = p.first;
            y = p.second;
            tmp[w][y] += 1.0;
        }
    }
    for (YomiProb::iterator it = tmp.begin();
         it != tmp.end();
         ++it) {
        double sum = 0.0;
        for (StringDoubleMap::iterator jt = it->second.begin();
             jt != it->second.end();
             ++jt) {
            sum += jt->second;
        }
        for (StringDoubleMap::iterator jt = it->second.begin();
             jt != it->second.end();
             ++jt) {
            (*yomi)[jt->first][it->first] = jt->second / sum;
        }
    }
    (*yomi)["BT"]["BT"] = 1.0;
 }


 void make_model(istream& in, Unigram* uni, Bigram* bg) {
    string line, wordyomi;
    while (getline(in, line)) {
        string prevw = "BT", w, y;
        istringstream iss(line + " BT/BT");
        while (iss >> wordyomi) {
            pair<string, string> p = split(wordyomi);
            w = p.first;
            y = p.second;
            (*uni)[w] += 1.0;
            (*bg)[prevw][w] += 1.0;
            prevw = w;
        }
    }

    double usum = 0.0;
    for (Unigram::iterator it = uni->begin();
         it != uni->end();
         ++it) {
        usum += it->second;
    }
    for (Unigram::iterator it = uni->begin();
         it != uni->end();
         ++it) {
        it->second /= usum;
    }

    for (Bigram::iterator it = bg->begin();
         it != bg->end();
         ++it) {
        double sum = 0.0;
        for (Unigram::iterator jt = it->second.begin();
             jt != it->second.end();
             ++jt) {
            sum += jt->second;
        }
        for (Unigram::iterator jt = it->second.begin();
             jt != it->second.end();
             ++jt) {
            jt->second /= sum;
        }
    }
 }

 // One partial hypothesis in henkan()'s search queue.
 struct Node {
     unsigned int index;  // offset into the input reached by this hypothesis
     double prob;         // accumulated log2 score
     std::string prev;    // most recently chosen word
     std::string all;     // concatenation of every word chosen so far

     Node(int index_, double prob_,
          const std::string& prev_, const std::string& all_)
         : index(index_)
         , prob(prob_)
         , prev(prev_)
         , all(all_) {}

     // Compares by score only, so a std::priority_queue<Node> yields the
     // highest-scoring hypothesis first.
     bool operator<(const Node& rhs) const {
         return rhs.prob > prob;
     }
 };
 string henkan(const LM& model,
              const YomiProb& yomi,
              const string& hira) {
    string ret = "!!ERROR: couldn't translate `" + hira + "` !!!";
    const string raw = hira + "BT";
    priority_queue<Node> Q;
    Q.push(Node(0, 0.0, "BT", ""));
    tr1::unordered_map<int, tr1::unordered_map<string, double> > memo;

    while (!Q.empty()) {
        Node n = Q.top(); Q.pop();
        memo[n.index][n.prev] = n.prob;

        if (n.index > 0 && n.prev == "BT") {
            ret = n.all.substr(0, n.all.length()-2);
            break;
        }

        for (int i=2; n.index+i<=raw.length(); i+=2) {
            string rword = raw.substr(n.index, i);
            YomiProb::const_iterator f = yomi.find(rword);
            if (f != yomi.end()) {
                StringDoubleMap::const_iterator it;;
                for (it = f->second.begin();
                     it != f->second.end();
                     ++it) {
                    int index = n.index + i;
                    string kanji = it->first;
                    double kanji_prob = it->second;
                    double model_prob = model.prob(n.prev, kanji);
                    double prob = log2(kanji_prob)
                        + log2(model_prob)
                        + n.prob;
                    if ((memo[index].find(kanji) != memo[index].end())
                        && (prob < memo[index][kanji])) {
                        continue;
                    }
                    Q.push(Node(index, prob, kanji, n.all+kanji));
                }
            }
        }
    }
    return ret;
 }
	/*
	サークルの講座用に書いた何か．
	`もう/もう 何/なに も/も 怖/コワ く/く な/な い/い 。/。`
	という形式のコーパスを読み込んで仮名漢字変換っぽい何かをします．

	未知語とか何も考えてない上にスムージングのパラメータも適当でこれはひどい．
	「EUCだから1文字2バイトだろ」とか決め打ちでこれはひどい．
	全体的に富豪気味でこれはひどい．
	using namespace std; もぐもぐ!

	gcc 4.4で動作確認．

	% echo "もうなにもこわくない" | nkf -e | ./a.out corpus/all | nkf -w
	=> もう何も怖くない
	*/

	#include <iostream>
	#include <vector>
	#include <string>
	#include <fstream>
	#include <sstream>
	#include <tr1/unordered_map>
	#include <algorithm>
	#include <queue>

	#include <cmath>

	using namespace std;

	// Generic string -> number table; the inner level of the nested maps below.
	typedef tr1::unordered_map<string, double> StringDoubleMap;
	// unigram[w]: occurrence count of word w, turned into a relative frequency
	// by make_model().
	typedef tr1::unordered_map<string, double> Unigram;
	// bigram[prev][w]: count of w directly after prev, normalized per prev by
	// make_model().
	typedef tr1::unordered_map<string, StringDoubleMap > Bigram;
	// yomi[reading][word]: share of word's occurrences that carried this reading,
	// as filled in by get_yomi().
	typedef tr1::unordered_map<string, StringDoubleMap > YomiProb;

	// Linearly interpolated unigram/bigram language model over surface words.
	//   lambda1 - weight of the unigram estimate
	//   lambda2 - weight of the bigram estimate
	// The two tables start empty; main() hands them to make_model() to be filled.
	struct LM {
	    double lambda1;
	    double lambda2;
	    Unigram unigram;
	    Bigram bigram;

	    LM(double l1, double l2)
	        : lambda1(l1)
	        , lambda2(l2) {}

	    // Blended probability of `w` given the preceding word `prev`.
	    double prob(const string& prev, const string& w) const;
	};

	// Builds the reading -> word table (*yomi) from a "word/reading" corpus stream.
	void get_yomi(istream& in, YomiProb* yomi);
	// Counts and normalizes unigram (*uni) and bigram (*bg) word statistics
	// from a "word/reading" corpus stream.
	void make_model(istream& in, Unigram* uni, Bigram* bg);
	// Converts the kana string `hira` into the best-scoring word sequence under
	// `model` and `yomi`; returns an "!!ERROR..." string when no conversion exists.
	string henkan(const LM& model, const YomiProb& yomi, const string& hira);

	// Usage: kakan corpus
	//
	// Reads the corpus file twice (once for the reading table, once for the
	// language model), then converts every whitespace-separated token read from
	// stdin and prints it as "=> <result>".
	//
	// Returns -1 on a usage error or when the corpus file cannot be opened;
	// 0 otherwise.
	int main(int argc, char **argv) {
	    if (argc != 2) {
	        cerr << "Usage: kakan corpus" << endl;
	        return -1;
	    }
	    const char *filename = argv[1];
	    LM model(0.3, 0.7);
	    YomiProb yomi;
	    {
	        ifstream in(filename);
	        // Without this check an unreadable corpus silently yields an empty
	        // model and every input is answered with the conversion-error string.
	        if (!in) {
	            cerr << "kakan: cannot open corpus `" << filename << "`" << endl;
	            return -1;
	        }
	        get_yomi(in, &yomi);
	    }
	    {
	        ifstream in(filename);
	        if (!in) {
	            cerr << "kakan: cannot open corpus `" << filename << "`" << endl;
	            return -1;
	        }
	        make_model(in, &model.unigram, &model.bigram);
	    }
	    string raw;
	    while (cin >> raw) {
	        cout << "=> " << henkan(model, yomi, raw) << endl;
	    }
	    return 0;
	}

	double LM::prob(const string& prev, const string& w) const {
	double p1 = 0.0, p2 = 0.0;
	Unigram::const_iterator ui = unigram.find(w);
	if (ui != unigram.end()) { p1 = ui->second; }
	Bigram::const_iterator bi = bigram.find(prev);
	if (bi != bigram.end()) {
	Unigram::const_iterator it = bi->second.find(w);
	if (it != bi->second.end()) { p2 = it->second; }
	}
	return lambda1p1 + lambda2p2;
	}

	// Splits a corpus token "word/reading" at its FIRST '/' and returns
	// (word, reading).
	// A token without any '/' comes back as (token, token): find() yields npos,
	// substr(0, npos) is the whole token, and npos + 1 wraps around to 0 so the
	// second half is the whole token as well.
	std::pair<std::string, std::string> split(const std::string& s) {
	    const std::string::size_type slash = s.find('/');
	    const std::string word = s.substr(0, slash);
	    const std::string reading = s.substr(slash + 1);
	    return std::make_pair(word, reading);
	}

	void get_yomi(istream& in, YomiProb* yomi) {
	YomiProb tmp;
	string line, wordyomi;
	while (getline(in, line)) {
	istringstream iss(line);
	while (iss >> wordyomi) {
	string w, y;
	pair<string, string> p = split(wordyomi);
	w = p.first;
	y = p.second;
	tmp[w][y] += 1.0;
	}
	}
	for (YomiProb::iterator it = tmp.begin();
	it != tmp.end();
	++it) {
	double sum = 0.0;
	for (StringDoubleMap::iterator jt = it->second.begin();
	jt != it->second.end();
	++jt) {
	sum += jt->second;
	}
	for (StringDoubleMap::iterator jt = it->second.begin();
	jt != it->second.end();
	++jt) {
	(*yomi)[jt->first][it->first] = jt->second / sum;
	}
	}
	(*yomi)["BT"]["BT"] = 1.0;
	}


	void make_model(istream& in, Unigram* uni, Bigram* bg) {
	string line, wordyomi;
	while (getline(in, line)) {
	string prevw = "BT", w, y;
	istringstream iss(line + " BT/BT");
	while (iss >> wordyomi) {
	pair<string, string> p = split(wordyomi);
	w = p.first;
	y = p.second;
	(*uni)[w] += 1.0;
	(*bg)[prevw][w] += 1.0;
	prevw = w;
	}
	}

	double usum = 0.0;
	for (Unigram::iterator it = uni->begin();
	it != uni->end();
	++it) {
	usum += it->second;
	}
	for (Unigram::iterator it = uni->begin();
	it != uni->end();
	++it) {
	it->second /= usum;
	}

	for (Bigram::iterator it = bg->begin();
	it != bg->end();
	++it) {
	double sum = 0.0;
	for (Unigram::iterator jt = it->second.begin();
	jt != it->second.end();
	++jt) {
	sum += jt->second;
	}
	for (Unigram::iterator jt = it->second.begin();
	jt != it->second.end();
	++jt) {
	jt->second /= sum;
	}
	}
	}

	// One partial hypothesis in henkan()'s search queue.
	struct Node {
	    unsigned int index;  // offset into the input reached by this hypothesis
	    double prob;         // accumulated log2 score
	    std::string prev;    // most recently chosen word
	    std::string all;     // concatenation of every word chosen so far

	    Node(int index_, double prob_,
	         const std::string& prev_, const std::string& all_)
	        : index(index_)
	        , prob(prob_)
	        , prev(prev_)
	        , all(all_) {}

	    // Compares by score only, so a std::priority_queue<Node> yields the
	    // highest-scoring hypothesis first.
	    bool operator<(const Node& rhs) const {
	        return rhs.prob > prob;
	    }
	};
	string henkan(const LM& model,
	const YomiProb& yomi,
	const string& hira) {
	string ret = "!!ERROR: couldn't translate `" + hira + "` !!!";
	const string raw = hira + "BT";
	priority_queue<Node> Q;
	Q.push(Node(0, 0.0, "BT", ""));
	tr1::unordered_map<int, tr1::unordered_map<string, double> > memo;

	while (!Q.empty()) {
	Node n = Q.top(); Q.pop();
	memo[n.index][n.prev] = n.prob;

	if (n.index > 0 && n.prev == "BT") {
	ret = n.all.substr(0, n.all.length()-2);
	break;
	}

	for (int i=2; n.index+i<=raw.length(); i+=2) {
	string rword = raw.substr(n.index, i);
	YomiProb::const_iterator f = yomi.find(rword);
	if (f != yomi.end()) {
	StringDoubleMap::const_iterator it;;
	for (it = f->second.begin();
	it != f->second.end();
	++it) {
	int index = n.index + i;
	string kanji = it->first;
	double kanji_prob = it->second;
	double model_prob = model.prob(n.prev, kanji);
	double prob = log2(kanji_prob)
	+ log2(model_prob)
	+ n.prob;
	if ((memo[index].find(kanji) != memo[index].end())
	&& (prob < memo[index][kanji])) {
	continue;
	}
	Q.push(Node(index, prob, kanji, n.all+kanji));
	}
	}
	}
	}
	return ret;
	}