Skip to content

Instantly share code, notes, and snippets.

@rohitdholakia
Created December 27, 2011 21:22
Show Gist options
  • Save rohitdholakia/1525201 to your computer and use it in GitHub Desktop.
Save rohitdholakia/1525201 to your computer and use it in GitHub Desktop.
A python script to generate a dictionary to be used with NaiveBayes
#This script reads the files in every sub-directory of the folder taken from the openClassroom site (given as the first command-line argument) and builds a word-frequency dictionary, which is then written to the file named by the second command-line argument
# Names of the four dataset sub-folders.
# NOTE(review): this list is not referenced anywhere else in the visible script;
# the directories that get read are discovered with os.walk instead -- confirm
# whether it can be dropped.
folders = ["spam-train","spam-test","nonspam-train","nonspam-test"]
import os,sys
#We need a dictionary to store word occurrences. We create a defaultdict, update the frequencies as the files are read, and then write everything to the output file in one go at the end.
from collections import *
def build_dictionary(root_dir):
    """Count how often each word occurs in the files below *root_dir*.

    Every sub-directory of ``root_dir`` (at any depth) is visited and each
    file in it is read line by line.  Files that sit directly in
    ``root_dir`` are not read, matching the original script, which only
    looked inside the sub-directories.

    Args:
        root_dir: Path of the folder that holds the data sub-directories.

    Returns:
        A ``defaultdict(int)`` mapping each word to its number of
        occurrences across all files read.
    """
    counts = defaultdict(int)
    for dirpath, _dirnames, filenames in os.walk(root_dir):
        # os.walk yields the top folder first, spelled exactly as passed in;
        # only the sub-directories hold data files.
        if dirpath == root_dir:
            continue
        for filename in filenames:
            # Join with dirpath (not the top folder) so nested directories
            # resolve correctly regardless of the current working directory.
            with open(os.path.join(dirpath, filename), 'r') as data:
                for line in data:
                    # split() without an argument splits on any whitespace,
                    # so the trailing newline never ends up inside a word and
                    # runs of spaces do not produce empty "words".
                    for word in line.split():
                        counts[word] += 1
    return counts


def write_dictionary(counts, out_path):
    """Write one ``"<word> <count>"`` line per entry of *counts* to *out_path*.

    Args:
        counts: Mapping of word to occurrence count.
        out_path: Path of the file to create (an existing file is
            overwritten).
    """
    with open(out_path, 'w') as fdict:
        # items() works on both Python 2 and 3; iteritems() is Python 2 only.
        for word, count in counts.items():
            fdict.write(word + " " + str(count) + "\n")


def main(argv):
    """Build the dictionary from ``argv[1]`` and store it in ``argv[2]``.

    Args:
        argv: Command-line argument list in ``sys.argv`` layout
            (program name, data folder, output file).
    """
    if len(argv) < 3:
        sys.exit("usage: python <script> <data_folder> <output_file>")
    write_dictionary(build_dictionary(argv[1]), argv[2])


# Guarded so that importing this file does not touch sys.argv or the disk.
if __name__ == "__main__":
    main(sys.argv)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment