Created
January 6, 2020 06:17
-
-
Save santhalakshminarayana/4c4b91af7b01328a1247addd99585442 to your computer and use it in GitHub Desktop.
Quotes Similarity Score - Medium
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Sparse, symmetric co-occurrence table keyed by (word_id, word_id) tuples.
# Whenever two different words occur at most `window` positions apart in a
# quote, both orderings of the pair are incremented by 1 / distance, so
# adjacent words contribute 1, words two apart contribute 1/2, and so on.
co_occ_matrix = defaultdict(int)
window = 5  # not greater than 5

for sent in quotes:
    words = sent.split(' ')
    # Pair each word with up to `window` words that precede it. Clamping the
    # start of the context at 0 handles the opening words of the quote and
    # keeps quotes shorter than `window` from indexing past the end of
    # `words`. Starting at 1 skips the first word, which has no predecessors.
    for i in range(1, len(words)):
        ind_1 = word_to_int[words[i]]
        for j in range(max(0, i - window), i):
            ind_2 = word_to_int[words[j]]
            if ind_1 == ind_2:
                # A word repeated inside its own context is not counted.
                continue
            weight = 1 / (i - j)
            # Update both orderings so the table stays symmetric.
            co_occ_matrix[(ind_1, ind_2)] += weight
            co_occ_matrix[(ind_2, ind_1)] += weight
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment