Created August 29, 2013 12:47
Parsimonious Language Model in Python
#! /usr/bin/env python

import numpy as np

from heapq import nlargest
from itertools import izip

from sklearn.feature_extraction.text import CountVectorizer

old_settings = np.seterr(all='ignore')


def logsum(x):
    """Computes the sum of x assuming x is in the log domain.

    Returns log(sum(exp(x))) while minimizing the possibility of
    over/underflow.

    Examples
    ========

    >>> import numpy as np
    >>> a = np.arange(10)
    >>> np.log(np.sum(np.exp(a)))
    9.4586297444267107
    >>> logsum(a)
    9.4586297444267107
    """
    # Use the max to normalize, as with the log this is what accumulates
    # the fewest errors
    vmax = x.max(axis=0)
    out = np.log(np.sum(np.exp(x - vmax), axis=0))
    out += vmax
    return out


class ParsimoniousLM(object):
    def __init__(self, documents, weight, min_df=1, max_df=1.0):
        # mixing weight (lambda) of the document model, stored in log space
        self.weight = np.log(weight)
        # analyzer=lambda i: i makes CountVectorizer iterate over each
        # document as-is: pass pre-tokenized documents for a word-level
        # model (plain strings are iterated character by character)
        self.vectorizer = CountVectorizer(min_df=min_df, max_df=max_df,
                                          analyzer=lambda i: i)
        # corpus frequency of every term in the vocabulary
        cf = np.array(self.vectorizer.fit_transform(documents).sum(axis=0))[0]
        # log probability of each term under the background (corpus)
        # model, pre-multiplied by (1 - lambda)
        self.pc = (np.log(cf) - np.log(np.sum(cf))) + np.log(1 - weight)

    def topK(self, k, document, iterations=50, eps=1e-5):
        ptf = self.lm(document, iterations, eps)
        return nlargest(k, izip(self.vectorizer.get_feature_names(), ptf),
                        lambda tp: tp[1])

    def lm(self, document, iterations, eps):
        tf = self.vectorizer.transform([document]).toarray()[0]
        # initialize with a uniform distribution over the terms that
        # occur in the document (log domain; absent terms become -inf)
        ptf = np.log(tf > 0) - np.log((tf > 0).sum())
        ptf = self.EM(tf, ptf, iterations, eps)
        return ptf

    def EM(self, tf, ptf, iterations, eps):
        tf = np.log(tf)
        for i in xrange(1, iterations + 1):
            ptf += self.weight
            # E-step: expected term counts under the mixture of the
            # background model and the current document model
            E = tf + ptf - np.logaddexp(self.pc, ptf)
            # M-step: renormalize the expected counts
            M = E - logsum(E)  # np.logaddexp.reduce(E)
            diff = M - ptf
            ptf = M
            if (np.abs(diff) < eps).all():
                break
        return ptf

    def fit(self, texts, labels=None, iterations=50, eps=1e-5):
        self.fitted_ = []
        if labels is None:
            labels = range(len(texts))
        for label, text in izip(labels, texts):
            lm = self.lm(text, iterations, eps)
            self.fitted_.append((label, lm))

    def fit_transform(self, texts, labels=None, iterations=50, eps=1e-5):
        self.fit(texts, labels, iterations, eps)
        return self.fitted_

    def cross_entropy(self, qlm, rlm):
        return -np.sum(np.exp(qlm) * np.logaddexp(self.pc, rlm + self.weight))

    def predict_proba(self, query):
        if not hasattr(self, 'fitted_'):
            raise ValueError("No Language Model fitted.")
        for i in range(len(self.fitted_)):
            score = self.cross_entropy(query, self.fitted_[i][1])
            yield self.fitted_[i][0], score


def demo():
    documents = ['er loopt een man op straat', 'de man is vies',
                 'de man heeft een gek hoofd', 'de hele straat kijkt naar de man']
    request = 'de straat is vies'
    # initialize a parsimonious language model
    plm = ParsimoniousLM(documents, 0.1)
    # compute a LM for each document in the document collection
    plm.fit(documents)
    # compute a LM for the test or request document
    qlm = plm.lm(request, 50, 1e-5)
    # compute the cross-entropy between the LM of the test document and
    # all training document LMs, sorted by increasing entropy
    print [(documents[i], score) for i, score in
           sorted(plm.predict_proba(qlm), key=lambda i: i[1])]


if __name__ == '__main__':
    demo()
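For reference, each EM iteration above corresponds, outside the log domain, to the parsimonious estimation step described by Hiemstra, Robertson and Zaragoza (SIGIR 2004), with mixing weight lambda (`weight`), background model P(t|C) (`pc`), and document term counts tf(t, D):

    % E-step: expected counts; mass on terms the background explains is shrunk
    e_t = \mathrm{tf}(t, D) \cdot \frac{\lambda\, P(t \mid D)}{(1 - \lambda)\, P(t \mid C) + \lambda\, P(t \mid D)}

    % M-step: renormalize to obtain the new document model
    P(t \mid D) = \frac{e_t}{\sum_{t'} e_{t'}}

In the code, `self.weight` holds log(lambda) and `self.pc` holds log((1 - lambda) P(t|C)), so the E-step becomes `tf + ptf - np.logaddexp(self.pc, ptf)` and the M-step `E - logsum(E)`, both computed in log space to avoid underflow.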
Hi! How can one get the probabilities assigned to each token by the parsimonious language model, i.e. the trained language model?
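One way, using only what the gist already exposes: lm() returns the parsimonious model as log-probabilities over the vectorizer's vocabulary, so np.exp recovers the probabilities, and topK() pairs the k largest log-probabilities with their terms. A minimal sketch (Python 2, like the gist; it assumes the file above is saved under the hypothetical name plm.py, and pre-tokenizes the documents so the model is over words rather than characters):

#! /usr/bin/env python
# Sketch only: assumes the gist above is saved as plm.py (a hypothetical
# module name) and runs under Python 2, as the gist itself does.
import numpy as np

from plm import ParsimoniousLM

documents = [doc.split() for doc in
             ['er loopt een man op straat', 'de man is vies',
              'de man heeft een gek hoofd', 'de hele straat kijkt naar de man']]
plm = ParsimoniousLM(documents, 0.1)

# log-probability of every vocabulary term under the document model;
# terms absent from the document come out as -inf (probability zero)
log_ptf = plm.lm('de straat is vies'.split(), 50, 1e-5)
for term, prob in zip(plm.vectorizer.get_feature_names(), np.exp(log_ptf)):
    if prob > 0:
        print term, prob

# topK does the same pairing and returns (term, log-probability) pairs
# for the k highest-scoring terms
print plm.topK(3, 'de straat is vies'.split())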