sureshgorakala · February 8, 2017 01:26
diff --git a/NLP_Session_1.py b/NLP_Session_1.py
 import os

 import nltk

 # Reading text into Python.
 path = "~/textCourpus.txt"
 # open() does not expand "~" itself, so resolve the home directory first;
 # the with-block closes the file handle as soon as the lines are read.
 with open(os.path.expanduser(path), 'r') as f:
     lines = [line.replace('\n', '') for line in f]

 # NOTE: the bare expressions in this script only display a value when it is
 # run line by line in an interactive session.
 type(lines)
 len(lines)

 # Sentence tokenizing.
 from nltk.tokenize import sent_tokenize
 lines[4]
 sent_tokenize(lines[4])
 len(sent_tokenize(lines[4]))
 # Download required resources for the tokenizer - english.pickle.
 # NOTE(review): this runs after the sent_tokenize() calls above; confirm
 # that ordering is intended for the session.
 nltk.download()
 # Alternative left disabled by the author: nltk.download('punkt')
 sent_tokenize(lines[4])
 sent_tokenize(lines[4])[0]
 sent_tokenize(lines[4])[1]

 # Tokenizing sentences into words.
 from nltk.tokenize import word_tokenize
 sent = sent_tokenize(lines[4])[1]
 word_tokenize(sent)
 type(word_tokenize(sent))
 # (OR) use the Treebank tokenizer class directly.
 from nltk.tokenize import TreebankWordTokenizer
 tokenizer = TreebankWordTokenizer()
 tokenizer.tokenize(sent)

 # Frequency distribution over the lower-cased tokens of the sentence.
 from nltk.probability import FreqDist
 fdist = FreqDist(word.lower() for word in word_tokenize(sent))
 # Length of each word.
 [len(word) for word in word_tokenize(sent)]

 # Collocations, word-sense disambiguation, co-reference.
 # Stopwords: keep only the tokens that are not in the English stopword set.
 from nltk.corpus import stopwords
 english_stops = set(stopwords.words('english'))
 [word for word in word_tokenize(sent) if word not in english_stops]
 english_stops
 stopwords.fileids()
	import os

	import nltk

	# Reading text into Python.
	path = "~/textCourpus.txt"
	# open() does not expand "~" itself, so resolve the home directory first;
	# the with-block closes the file handle as soon as the lines are read.
	with open(os.path.expanduser(path), 'r') as f:
		lines = [line.replace('\n', '') for line in f]

	# NOTE: the bare expressions in this script only display a value when it is
	# run line by line in an interactive session.
	type(lines)
	len(lines)

	# Sentence tokenizing.
	from nltk.tokenize import sent_tokenize
	lines[4]
	sent_tokenize(lines[4])
	len(sent_tokenize(lines[4]))
	# Download required resources for the tokenizer - english.pickle.
	# NOTE(review): this runs after the sent_tokenize() calls above; confirm
	# that ordering is intended for the session.
	nltk.download()
	# Alternative left disabled by the author: nltk.download('punkt')
	sent_tokenize(lines[4])
	sent_tokenize(lines[4])[0]
	sent_tokenize(lines[4])[1]

	# Tokenizing sentences into words.
	from nltk.tokenize import word_tokenize
	sent = sent_tokenize(lines[4])[1]
	word_tokenize(sent)
	type(word_tokenize(sent))
	# (OR) use the Treebank tokenizer class directly.
	from nltk.tokenize import TreebankWordTokenizer
	tokenizer = TreebankWordTokenizer()
	tokenizer.tokenize(sent)

	# Frequency distribution over the lower-cased tokens of the sentence.
	from nltk.probability import FreqDist
	fdist = FreqDist(word.lower() for word in word_tokenize(sent))
	# Length of each word.
	[len(word) for word in word_tokenize(sent)]

	# Collocations, word-sense disambiguation, co-reference.
	# Stopwords: keep only the tokens that are not in the English stopword set.
	from nltk.corpus import stopwords
	english_stops = set(stopwords.words('english'))
	[word for word in word_tokenize(sent) if word not in english_stops]
	english_stops
	stopwords.fileids()