Created
September 10, 2019 13:01
-
-
Save RobinvanderVliet/4f9d5a3142badc4252d11a2b60fdfca1 to your computer and use it in GitHub Desktop.
Next word suggestor
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Download the Tatoeba sentence export and unpack it into the current directory.
#
# Abort on the first failing command: without this, a failed download would
# still be handed to tar, the archive would be removed, and "[!] Done!" would
# be printed even though nothing usable was produced.
set -e

echo "[!] Downloading sentences from Tatoeba."
wget -O "sentences.tar.bz2" "https://downloads.tatoeba.org/exports/sentences.tar.bz2"

echo "[!] Extracting sentences from Tatoeba."
tar -xvjf "sentences.tar.bz2"

# The archive is only removed once extraction has succeeded (see set -e above).
rm "sentences.tar.bz2"

echo "[!] Done!"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
def cleanSentence(sentence):
    """Normalize *sentence* and return its words in reverse order.

    Punctuation listed in the character class is replaced by spaces, the
    text is lower-cased and trimmed, and runs of spaces are collapsed to a
    single space before splitting. The resulting word list is reversed, so
    the last word of the sentence comes first. An empty or
    punctuation-only sentence yields ``[""]``.
    """
    withoutPunctuation = re.sub("[!¡?¿@\",:;().]", " ", sentence)
    trimmed = withoutPunctuation.lower().strip()
    collapsed = re.sub(" +", " ", trimmed)
    words = collapsed.split(" ")
    words.reverse()
    return words
def loadSentences(path, language):
    """Return the cleaned sentences from *path* whose language is *language*.

    Every row is split on tabs; the second column is compared with
    *language* and the third column is passed through ``cleanSentence``.
    Rows with fewer than three columns are skipped instead of raising
    ``IndexError``.
    """
    sentences = []
    # NOTE(review): the export is presumably UTF-8 -- confirm. The encoding is
    # given explicitly so the result does not depend on the platform locale.
    with open(path, "r", encoding="utf-8") as source:
        # Iterating the file splits on real line endings only, so unusual
        # separators inside a sentence do not break a row apart.
        for row in source:
            fields = row.rstrip("\n").split("\t")
            if len(fields) >= 3 and fields[1] == language:
                sentences.append(cleanSentence(fields[2]))
    return sentences


def matchLine(query, line):
    """Match the typed words against one stored sentence.

    Both *query* and *line* are word lists in reversed order (last word of
    the sentence first), which is the shape ``cleanSentence`` returns.

    Returns a ``(wordsFound, nextWord)`` tuple: ``wordsFound`` is how many
    trailing words of the typed text were matched consecutively in *line*,
    and ``nextWord`` is the word that follows the match in the stored
    sentence. When nothing usable matches the result is ``(0, None)``.
    """
    if not query:
        return 0, None

    wordsFound = 0
    nextWord = None
    for j, word in enumerate(line):
        if query[wordsFound] == word:
            if wordsFound == 0:
                if j == 0:
                    # The stored sentence ends with this word, so nothing
                    # follows it. Indexing j - 1 here would wrap around to
                    # the first word of the sentence; keep scanning for a
                    # later occurrence instead.
                    continue
                nextWord = line[j - 1]
            wordsFound += 1
            if wordsFound == len(query):
                break
        elif wordsFound > 0:
            # The run of consecutive matches is over.
            break
    return wordsFound, nextWord


def suggestNextWords(query, lines):
    """Return ``(word, count)`` pairs for the likeliest next words.

    Only the stored sentences that match the longest run of typed words are
    counted. The pairs are sorted by ascending count, so the most frequent
    suggestion comes last. An empty list is returned when no stored
    sentence matches at all.
    """
    matches = [matchLine(query, line) for line in lines]
    precision = max((found for found, _ in matches), default=0)
    if precision == 0:
        return []

    wordAmounts = {}
    for found, word in matches:
        if found == precision:
            wordAmounts[word] = wordAmounts.get(word, 0) + 1
    return sorted(wordAmounts.items(), key=lambda kv: kv[1])


def main():
    """Load the sentences for one language and suggest next words interactively."""
    language = input("Enter a three-letter language code: ").strip()
    lines = loadSentences("sentences.csv", language)
    while True:
        try:
            typed = input("Enter a sentence: ")
        except EOFError:
            # End of input: leave the prompt loop without a traceback.
            break
        print(suggestNextWords(cleanSentence(typed), lines))


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment