Created
April 9, 2016 23:45
-
-
Save nikolajbaer/8ed567371851b9f4dd66bc277976a38c to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
from operator import itemgetter | |
from collections import Counter | |
# Honorific abbreviations whose trailing period must not be read as a
# sentence terminator. The leading \b keeps ordinary words that merely end in
# "ms" or "mr" (e.g. "problems.") from losing their real terminator.
_HONORIFIC_RE = re.compile(r"\b(mrs?|ms)\.")
# A sentence is a run of non-terminator characters followed by one terminator.
_SENTENCE_RE = re.compile(r"[^!.?]+[!.?]")
# Anything that is not a lowercase ASCII letter or a space is discarded.
_NON_WORD_CHARS_RE = re.compile(r"[^a-z ]")


def clean_data(txt):
    """Split raw text into lowercase sentences made of letters and spaces.

    The text is lowercased, the period after "mr.", "mrs." and "ms." is
    removed so those abbreviations do not end a sentence, and the result is
    split on ``.``, ``!`` and ``?``. Line breaks inside a sentence become
    spaces and every remaining character other than ``a-z`` and space is
    removed.

    Text after the last terminator is not returned, and a sentence keeps any
    leading space left over from the split.

    Args:
        txt: The raw text to process.

    Returns:
        A list of cleaned sentence strings, in order of appearance.
    """
    txt = txt.lower()
    txt = _HONORIFIC_RE.sub(r"\1", txt)
    sentences = []
    for raw in _SENTENCE_RE.findall(txt):
        flattened = raw.replace('\n', ' ').replace('\r', ' ')
        sentences.append(_NON_WORD_CHARS_RE.sub('', flattened))
    return sentences
def build_db(rows):
    """Count, for every word, which words directly follow it.

    Each row is split on whitespace. Every word seen becomes a key of the
    result, including words that only ever appear last in a row, which map
    to an empty dict.

    Args:
        rows: An iterable of sentence strings.

    Returns:
        A dict mapping each word to a dict of ``{next_word: count}``.
    """
    db = {}
    for row in rows:
        words = row.split()
        # Register every word first so a row's final word still gets an entry.
        for word in words:
            db.setdefault(word, {})
        # Pair each word with its immediate successor within the same row.
        for current, following in zip(words, words[1:]):
            followers = db[current]
            followers[following] = followers.get(following, 0) + 1
    return db
if __name__ == "__main__":
    # Build the successor table once from the training text; the context
    # manager closes the file as soon as it has been read.
    with open("test.txt") as source:
        rows = clean_data(source.read())
    db = build_db(rows)

    # Python 2 reads a line with raw_input(); Python 3 renamed it to input().
    try:
        read_line = raw_input
    except NameError:
        read_line = input

    while True:
        print("Word:")
        try:
            word = read_line().lower()
        except EOFError:
            # End of input (Ctrl-D or an exhausted pipe): leave the loop.
            break
        if word not in db:
            print("(word not found)")
            # Skip the lookup: indexing db with an unknown word raises KeyError.
            continue
        c = Counter(db[word])
        # Show up to three of the most frequent successors of the word.
        print(' '.join(follower for follower, _ in c.most_common(3)))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
A foolish little hack that probabilistically predicts the next word based on the word you enter. It came out of a conversation at the pythonSD Saturday meetup.