Last active
November 27, 2021 14:47
-
-
Save avriiil/ef88b94f32dff78af4ef3253c93b6436 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Fit a GSDMM (Gibbs Sampling Dirichlet Multinomial Mixture) short-text
# clustering model on the tweets stored in `df.tweet_text`.
#
# NOTE(review): `df` is not defined in this snippet. It is assumed to be a
# pandas DataFrame whose `tweet_text` column holds one list of string tokens
# per row (gensim's Dictionary needs tokenized documents) -- confirm upstream.

# importing libraries
import pandas as pd
import numpy as np
import gensim
from gsdmm import MovieGroupProcess

# cast tweets to numpy array
docs = df.tweet_text.to_numpy()

# create dictionary of all words in all documents
dictionary = gensim.corpora.Dictionary(docs)

# filter extreme cases out of dictionary: drop tokens appearing in fewer than
# 15 documents or in more than 50% of documents, then cap at 100000 tokens
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

# create variable containing length of dictionary/vocab (after filtering)
vocab_length = len(dictionary)

# create BOW dictionary (doc2bow silently ignores tokens that were filtered
# out of the dictionary)
bow_corpus = [dictionary.doc2bow(doc) for doc in docs]

# Restrict every document to the tokens that survived filtering. The model is
# told the vocabulary has `vocab_length` entries, so the documents it sees
# must not contain tokens outside that vocabulary. Documents that become
# empty are kept so the result stays aligned one-to-one with `docs`.
kept_tokens = dictionary.token2id
filtered_docs = [
    [token for token in doc if token in kept_tokens]
    for doc in docs
]

# initialize GSDMM (K is the number of clusters the sampler starts from)
gsdmm = MovieGroupProcess(K=15, alpha=0.1, beta=0.3, n_iters=15)

# fit GSDMM model; `y` receives the fit result (presumably one cluster label
# per document -- verify against the gsdmm package)
y = gsdmm.fit(filtered_docs, vocab_length)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment