nikolajbaer · April 9, 2016 23:45 · nikolajbaer · Apr 9, 2016
diff --git a/predict_next.py b/predict_next.py
 import re
 from operator import itemgetter
 from collections import Counter

 def clean_data(txt):
     """Split raw text into lower-cased sentences made of a-z and spaces only.

     The text is lower-cased, the period of the titles "mr.", "mrs." and
     "ms." is dropped so it is not taken for a sentence terminator, and the
     text is cut into sentences that end in '.', '!' or '?'.  Newlines and
     carriage returns become spaces; every other character outside a-z and
     space is removed.

     Text after the last terminator is not returned, because a sentence is
     only recognised when it ends in '.', '!' or '?'.

     Returns a list of sentence strings (leading spaces are kept).
     """
     txt = txt.lower()
     # Anchor the titles at a word boundary: a plain replace('ms.', 'ms') also
     # strips the period from words such as "items." and thereby glues two
     # sentences together.
     txt = re.sub(r'\b(mrs|mr|ms)\.', r'\1', txt)
     sentences = re.findall(r"[^!\.\?]+[!\.\?]", txt)
     sentences = [s.replace('\n', ' ').replace('\r', ' ') for s in sentences]
     # Drop punctuation, digits and anything else that is not a-z or a space.
     return [re.sub(r'[^a-z ]', '', s) for s in sentences]

 def build_db(rows):
     """Build a word -> {following word: count} table from sentence strings.

     Each row is split on whitespace.  Every word gets an entry in the
     result, even when nothing follows it; each adjacent pair within a row
     adds one to the count stored under the first word for the second.

     Returns a dict of dicts of ints.
     """
     table = {}
     for sentence in rows:
         tokens = sentence.split()
         total = len(tokens)
         for position, token in enumerate(tokens):
             followers = table.setdefault(token, {})
             # The last token of a row has no successor to record.
             if position + 1 < total:
                 successor = tokens[position + 1]
                 followers[successor] = followers.get(successor, 0) + 1
     return table

 if __name__=="__main__":
     # Load the corpus once; the with-block closes the file handle, which the
     # bare open(...).read() left to the garbage collector.
     with open("test.txt") as corpus:
         rows = clean_data(corpus.read())
     db = build_db(rows)

     # Prompt for a word and print up to three of its most frequent followers.
     # Single-argument print(...) prints the same text under the Python 2
     # print statement, so the output is unchanged.
     while True:
         print("Word:")
         try:
             # Strip surrounding whitespace so "the " still matches "the".
             word = raw_input().strip().lower()
         except EOFError:
             # End of input (Ctrl-D / closed pipe): leave the loop quietly.
             break
         if word not in db:
             print("(word not found)")
             # Without this the lookup below raises KeyError for the very
             # word that was just reported as missing.
             continue
         c = Counter(db[word])
         print(' '.join([i[0] for i in c.most_common(3)]))
	import re
	from operator import itemgetter
	from collections import Counter

	def clean_data(txt):
	    """Return the lower-cased sentences of txt, reduced to a-z and spaces.

	    Steps: lower-case the text; drop the period of the titles "mr.",
	    "mrs." and "ms." so it does not end a sentence; cut the text into
	    pieces ending in '.', '!' or '?'; turn newlines and carriage returns
	    into spaces; delete every remaining character that is not a-z or a
	    space.

	    Text after the last terminator is not part of any sentence and is
	    therefore not returned.  Returns a list of strings.
	    """
	    lowered = txt.lower()
	    # Match the titles only as whole words.  Replacing the bare substring
	    # 'ms.' would also eat the period of "alarms." and merge sentences.
	    lowered = re.sub(r'\b(mrs|mr|ms)\.', r'\1', lowered)
	    cleaned = []
	    for sentence in re.findall(r"[^!\.\?]+[!\.\?]", lowered):
	        sentence = sentence.replace('\n', ' ').replace('\r', ' ')
	        cleaned.append(re.sub(r'[^a-z ]', '', sentence))
	    return cleaned

	def build_db(rows):
	    """Count, for every word, which words directly follow it in the rows.

	    Each row is split on whitespace.  Every word becomes a key of the
	    result (with an empty dict when nothing ever follows it), and each
	    adjacent pair within a row adds one to result[first][second].

	    Returns a dict mapping word -> {next word: count}.
	    """
	    db = {}
	    for r in rows:
	        words = r.split()
	        # Register every word first so the last word of a row is present
	        # even though it has no follower.
	        for w in words:
	            db.setdefault(w, {})
	        # Pair each word with its successor; the final word pairs with
	        # nothing, and an empty row yields no pairs.
	        for w, nxt in zip(words, words[1:]):
	            db[w][nxt] = db[w].get(nxt, 0) + 1
	    return db

	if __name__=="__main__":
	    # Read the corpus and close the file as soon as it has been consumed.
	    with open("test.txt") as corpus:
	        rows = clean_data(corpus.read())
	    db = build_db(rows)

	    # Interactive loop: read a word, print up to three of its most common
	    # followers.  Single-argument print(...) prints the same text under
	    # the Python 2 print statement.
	    while True:
	        print("Word:")
	        try:
	            word = raw_input().strip().lower()
	        except EOFError:
	            # No more input: stop instead of ending in a traceback.
	            break
	        if word not in db:
	            print("(word not found)")
	            # Skip the lookup; db[word] would raise KeyError here.
	            continue
	        c = Counter(db[word])
	        print(' '.join([i[0] for i in c.most_common(3)]))