Created
March 16, 2019 16:30
-
-
Save gaurav-singh1998/fc9650bedae39406f6af52f9bbbacc9a to your computer and use it in GitHub Desktop.
Gist to calculate the co-occurrence matrix
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def co_occurence_matrix(list_of_words, preprocessed_reviews, window_size=5):
    """Build a word co-occurrence matrix over a collection of reviews.

    Each review is split on whitespace. Two tokens co-occur when they appear
    in the same review at most ``window_size`` positions apart (a token is
    never paired with itself, but two separate occurrences of the same word
    are counted, which populates the diagonal). Only tokens present in
    ``list_of_words`` are counted; every other token still occupies a
    position and therefore contributes to the distance between its
    neighbours.

    Args:
        list_of_words: List of vocabulary words. Row/column ``k`` of the
            result corresponds to ``list_of_words[k]``. Words are matched
            against review tokens by their ``str()`` form. If a word is
            repeated, its last index is the one that receives the counts.
        preprocessed_reviews: List of review strings, already cleaned, whose
            tokens are separated by whitespace.
        window_size: Maximum distance, in tokens, on either side of a word
            within which another word counts as co-occurring. Values below 1
            produce an all-zero matrix.

    Returns:
        cooc: A ``(len(list_of_words), len(list_of_words))`` ``numpy.float64``
            array where ``cooc[a, b]`` is the number of times word ``b``
            appears within the window around an occurrence of word ``a``.
            The matrix is symmetric.

    Raises:
        TypeError: If ``list_of_words`` or ``preprocessed_reviews`` is not a
            list, or ``window_size`` is not an integer.
    """
    import numpy as np

    # tqdm only provides a progress bar; fall back to plain iteration so the
    # function still works where tqdm is not installed.
    try:
        from tqdm import tqdm
    except ImportError:
        def tqdm(iterable):
            return iterable

    if not isinstance(list_of_words, list):
        raise TypeError("Please provide a list argument.")
    if not isinstance(preprocessed_reviews, list):
        raise TypeError("Please provide a list argument.")
    if not isinstance(window_size, int):
        raise TypeError("Please provide a integer argument.")

    vocab_size = len(list_of_words)
    cooc = np.zeros((vocab_size, vocab_size), np.float64)
    # Keyed by str(word) because review tokens are always strings.
    word_to_index = {str(word): index for index, word in enumerate(list_of_words)}

    for review in tqdm(preprocessed_reviews):
        # Translate tokens to vocabulary indices once per review; tokens
        # outside the vocabulary become None but keep their position.
        token_indices = [word_to_index.get(token) for token in review.split()]
        for position, row in enumerate(token_indices):
            if row is None:
                continue
            start = max(0, position - window_size)
            stop = position + window_size + 1
            for neighbour, col in enumerate(token_indices[start:stop], start):
                if col is not None and neighbour != position:
                    cooc[row, col] += 1
    return cooc
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment