Last active
May 2, 2016 23:03
-
-
Save mmatkinson/ae64bfae230522d8b9de03ce4d69ec74 to your computer and use it in GitHub Desktop.
Helper class for using sklearn vectorizers with gensim lda.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# For gensim | |
from itertools import groupby

import gensim
import numpy as np
class VectorizedCorpus(object):
    """Helper for using scikit-learn vectorizers with gensim's LDA model.

    Fits the given vectorizer on ``doc_list`` and handles the conversions
    between gensim corpus / bag-of-words representations and the sparse
    matrix produced by the vectorizer. Iterating an instance yields one
    gensim-style bag-of-words per fitted document, so the instance can be
    passed directly as the ``corpus`` of a gensim model.

    Attributes:
        vec: the vectorizer, fitted on ``doc_list`` during construction.
        doc_list: the documents the vectorizer was fitted on.
        dictionary: gensim ``Dictionary`` built from the fitted vocabulary.
        idvec2word: mapping from vectorizer column index to term.
    """

    def __init__(self, vec, doc_list):
        """Fit ``vec`` on ``doc_list`` and build the lookup structures.

        Args:
            vec: an unfitted vectorizer exposing ``fit``, ``transform`` and,
                once fitted, ``vocabulary_`` (term -> column index).
            doc_list: list of raw documents to fit on.
        """
        self.vec = vec
        self.doc_list = doc_list
        self.vec.fit(self.doc_list)
        # Dictionary for use in LDA model instantiation (id2word).
        # NOTE(review): the corpus produced by to_corpus() uses the
        # vectorizer's column indices as term ids, while this Dictionary
        # assigns its own ids. The two presumably coincide because both
        # order terms alphabetically -- confirm before relying on it.
        self.dictionary = gensim.corpora.dictionary.Dictionary(
            [list(self.vec.vocabulary_)]
        )
        # Inverse of vocabulary_: column index -> term.
        self.idvec2word = {
            index: word for word, index in self.vec.vocabulary_.items()
        }

    def __repr__(self):
        return "<VectorizedCorpus: \n vec:({}) \n docs:({}) \n dict:({})>".format(
            self.vec, len(self), len(self.dictionary)
        )

    def __len__(self):
        """Return the number of documents the vectorizer was fitted on."""
        return len(self.doc_list)

    def __iter__(self):
        """Iterate over the fitted documents in gensim corpus form."""
        return self.to_corpus(self.doc_list)

    def to_corpus(self, doc_list=None):
        """Transform a list of documents into a gensim corpus.

        Args:
            doc_list: documents to transform; defaults to the documents the
                instance was built with.

        Yields:
            One gensim bag-of-words per document.
        """
        if doc_list is None:
            doc_list = self.doc_list
        matrix = self.vec.transform(doc_list)
        # documents_columns=False: each row of the matrix is one document.
        for bow in gensim.matutils.Sparse2Corpus(matrix, documents_columns=False):
            yield bow

    def to_bow_dict(self, doc):
        """Convert document(s) to gensim bag-of-words via ``self.dictionary``.

        This is the representation needed to map an unseen document to its
        LDA topic distribution, e.g.::

            LDAModel[self.to_bow_dict(["list of", "sentences or paragraphs"])]

        Args:
            doc: a single string or a list/array of strings.

        Returns:
            A list with one ``dictionary.doc2bow`` result per input document.
        """
        return [self.dictionary.doc2bow(words) for words in self.to_bow(doc)]

    def to_bow(self, doc):
        """Transform document(s) into lists of the vocabulary words they use.

        Utilizes the stored vectorizer (``self.vec``).

        Args:
            doc: a single string or a list/array of strings.

        Returns:
            A list with exactly one entry per input document, in input
            order. Each entry lists the vocabulary terms found in that
            document; a document containing no vocabulary term yields an
            empty list, so results stay aligned with the input.
        """
        matrix = self._transform(doc)
        words_per_doc = [[] for _ in range(matrix.shape[0])]
        # nonzero() gives parallel (row, column) index arrays of the
        # entries that are set, i.e. (document, term) pairs.
        for row, col in zip(*matrix.nonzero()):
            words_per_doc[row].append(self.idvec2word[col])
        return words_per_doc

    def _transform(self, doc):
        """Run the vectorizer on one document or a collection of documents.

        Args:
            doc: a single string (wrapped into a one-element list) or a
                list/tuple/numpy array of strings.

        Returns:
            Whatever ``self.vec.transform`` returns for the documents.

        Raises:
            TypeError: if ``doc`` is neither a string nor a supported
                collection.
        """
        if isinstance(doc, str):
            return self.vec.transform([doc])
        if isinstance(doc, (list, tuple, np.ndarray)):
            return self.vec.transform(doc)
        raise TypeError(
            "doc must be a str or a list/tuple/ndarray of str, got {}".format(
                type(doc).__name__
            )
        )

    def from_bow(self, doc):
        """Inverse of ``to_bow_dict``: map bag-of-words documents to words.

        Args:
            doc: a list/tuple/numpy array of bag-of-words documents, each a
                sequence of ``(term_id, count)`` pairs.

        Returns:
            A list with one list of terms per input document.

        Raises:
            TypeError: if ``doc`` is not a supported collection. A plain
                string is rejected because it cannot be a bag-of-words.
        """
        if isinstance(doc, (list, tuple, np.ndarray)):
            return [self._from_bow(single_doc) for single_doc in doc]
        raise TypeError(
            "doc must be a list/tuple/ndarray of bag-of-words documents, "
            "got {}".format(type(doc).__name__)
        )

    def _from_bow(self, single_doc):
        """Look up the term for each ``(term_id, count)`` pair of one document."""
        return [self.dictionary[term[0]] for term in single_doc]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from sklearn.feature_extraction.text import CountVectorizer | |
import gensim | |
# Example usage of VectorizedCorpus with a scikit-learn CountVectorizer.
vectorizer = CountVectorizer(stop_words='english',
                             min_df=4,
                             binary=True)

# Placeholder: replace with your own list of raw text documents (strings).
documents = [...]
VecCorp = VectorizedCorpus(vec=vectorizer, doc_list=documents)

VecCorp.dictionary  # gensim dictionary
VecCorp.to_bow([" this is a sentence "])  # Transform a sentence to a list of words in it
VecCorp.to_bow_dict([" this is a sentence "])  # Transform a sentence to gensim bow format (dictionary index of each word)
VecCorp.from_bow(VecCorp.to_bow_dict([" this is a sentence "]))  # inverse transform of to_bow_dict

ntopics = 5
lda = gensim.models.ldamodel.LdaModel(corpus=VecCorp, num_topics=ntopics, id2word=VecCorp.dictionary)
lda[VecCorp.to_bow_dict(["This is a new unseen sentence"])[0]]  # Assign topic mix to a new document
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment