These are two solutions for a topic extraction task. The sample data is loaded into a variable by the script. I've included running times for both solutions, so we have precise information about the cost of each one in addition to its results. According to Pazienza et al. (2005), two trends in textual information extraction can be identified: one based on linguistic and syntactic information, and another based on statistical analysis of frequency patterns (which usually treats text as a bag of words). Whilst the first solution here is purely statistical, the second incorporates information about syntactic categories into the analysis (hence a hybrid approach).
1 – Set-up used:
*Ubuntu 11.04 Natty AMD64
*Python 2.7.3
*Python re library
*Python nltk 2.0 library and the required NumPy and PyYAML (for NLP tasks)
*Python tweetstream 1.1.1 library (for Twitter manipulation)
*Python simplejson library (for JSON manipulation)
*Python sklearn 0.11 library (for machine learning tasks)
*Python time module (for benchmarking the different solutions)
Installation Instructions:
- Python and Python installation packages: from the command prompt run:
sudo apt-get install python python-pip python-setuptools
- NLTK, NumPy, PyYAML libraries: from the command prompt run:
sudo pip install -U numpy
sudo pip install -U pyyaml nltk
Test the installation: run python, then type import nltk
- Sklearn 0.11: first install the dependencies:
sudo apt-get install python-dev python-numpy python-numpy-dev python-setuptools python-scipy libatlas-dev g++ python-matplotlib python-pip
then from the command prompt run:
sudo pip install -U scikit-learn
Theoretical assumptions:
The data manipulated is a JSON file containing articles with a variety of fields. Since the solutions implemented here are based on machine learning techniques (involving the frequency distribution of tokens in a bag of words), I only used the title and content of each article. The title was duplicated, so the frequency of the words contained in it would be increased (this is a simple way to capture the intuition that words in the title are a bit more important).
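The loading step described above can be sketched as follows. This is a minimal sketch, not the script itself: the field names 'title' and 'content' are assumptions about the layout of data.json.

```python
# Sketch of the loading step. The field names 'title' and 'content' are
# assumptions about the layout of data.json described above.
import json

def build_documents(articles):
    """One string per article: the title twice, then the content."""
    documents = []
    for article in articles:
        # Repeating the title doubles the counts of its words downstream.
        documents.append('%s %s %s' % (article['title'], article['title'],
                                       article['content']))
    return documents

# Typical use, assuming the file sits in the working directory:
# with open('data.json') as f:
#     documents = build_documents(json.load(f))
```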
Running instructions:
Simply run either script (nnmf_no_datatreatment.py or nnmf_noun_extraction.py); each one displays its results and running time. The data.json file should be placed in the same directory.
The problem was broken down into three subtasks: (i) loading and pre-processing the data, (ii) running a TF-IDF algorithm on the data, (iii) running a non-negative matrix factorization of the TF-IDF matrix.
Step (i)
Data Loading:
For both solutions, data loading is identical. The JSON file is loaded, and for each article we generate a string containing the title (duplicated) and the content of the article. Each string is then appended to an initially empty list; the resulting list contains each document as a row (vector).
Preprocessing:
For the solution in nnmf_no_datatreatment.py, no data treatment was performed. The aim was to have a comparison between the pros and cons of pre-processing options.
For the solution in nnmf_noun_extraction.py, we created a filter that tokenizes the text, performs POS tagging on it, and removes all but the noun elements. Each document in the list can then be regarded as a bag-of-nouns. Many authors have proposed that nouns are the linguistic units that best represent the topics of a text (for a recent example, Mihalcea 2004, but there are many others). Personal note: I thought that verbs in the gerund (e.g. ‘bombing’), which are subject to nominalization rules, e.g. ‘the bombing in King’s Cross’, have a similarly nouny nature, but unfortunately the results are not so interesting.
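The core of the filter can be sketched like this. The NLTK pipeline is shown commented out because it requires the NLTK data packages to be downloaded; the filter itself simply keeps tokens whose Penn Treebank tag marks a noun.

```python
# Minimal sketch of the noun filter. The NLTK calls are shown commented out
# because they require the NLTK data packages to be downloaded; the filter
# itself keeps only tokens whose Penn Treebank tag marks a noun.
def filter_nouns(tagged_tokens):
    """Keep words tagged NN, NNS, NNP or NNPS from (word, tag) pairs."""
    return [word for word, tag in tagged_tokens if tag.startswith('NN')]

# In the script this would be fed by NLTK's tokenizer and tagger:
# import nltk
# tagged = nltk.pos_tag(nltk.word_tokenize(document))
# bag_of_nouns = ' '.join(filter_nouns(tagged))

print(filter_nouns([('the', 'DT'), ('bombing', 'NN'), ('in', 'IN'),
                    ("King's", 'NNP'), ('Cross', 'NNP')]))
```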
Step (ii) TF*IDF
Roughly, TF-IDF is the product of two components: (i) the term frequency of a term t in a document d – tf(t,d) – and (ii) the inverse document frequency – idf(t,D) – which penalizes terms that occur in many documents of the corpus D. So if ‘the’ appears many times in a document but also appears in most documents of the corpus, its TF-IDF value will be low. A term that is frequent in a document but not very frequent in the corpus will have a higher score, which means that it is statistically significant (for discussion, see Salton and Buckley 1988).
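As a toy illustration of the formula in its textbook form, tf * log(N/df) (scikit-learn's implementation uses a smoothed variant, so exact numbers differ, but the intuition is the same):

```python
# Toy illustration of tf-idf in its textbook form, tf * log(N/df).
# scikit-learn uses a smoothed variant, so exact numbers differ.
import math

docs = [
    ["the", "bombing", "the", "attack"],
    ["the", "election", "results"],
    ["the", "attack", "aftermath"],
]

def tfidf(term, doc, corpus):
    tf = doc.count(term) / float(len(doc))       # relative frequency in this document
    df = sum(1 for d in corpus if term in d)     # documents containing the term
    idf = math.log(len(corpus) / float(df))      # penalize corpus-wide terms
    return tf * idf

# 'the' occurs in every document, so idf = log(3/3) = 0 and the score
# vanishes; 'bombing' is document-specific, so it gets a positive score.
print(tfidf("the", docs[0], docs))      # 0.0
print(tfidf("bombing", docs[0], docs))
```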
Implementation of this step involved just calls to the sklearn.feature_extraction.text.CountVectorizer module (combined with a TF-IDF transformer) on our data. Good documentation can be found on the library’s website: http://scikit-learn.org/stable/modules/feature_extraction.html .
Note: max_df was set to 0.95, discarding terms that appear in more than 95% of the documents (without this the resulting data is a bit noisy, e.g. lots of pronouns), and binary=False (setting it to True is useful for small chunks of text, but it didn’t work well for our data).
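Putting step (ii) together, the vectorization might look like this with today's scikit-learn API (the post used version 0.11, whose class names differ slightly; the documents here are made-up stand-ins for the article strings built in step (i)):

```python
# Sketch of the vectorization step with the current scikit-learn API.
# The documents are made-up stand-ins for the article strings of step (i).
from sklearn.feature_extraction.text import TfidfVectorizer

documents = [
    "bombing attack london transport",
    "election results polling day",
    "attack aftermath london police",
]

# max_df=0.95 drops terms appearing in more than 95% of the documents;
# binary=False keeps real counts rather than 0/1 indicators.
vectorizer = TfidfVectorizer(max_df=0.95, binary=False)
tfidf_matrix = vectorizer.fit_transform(documents)   # sparse (n_docs, n_terms)
print(tfidf_matrix.shape)
```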
Step (iii) Non-Negative Matrix factorization
Non-negative Matrix Factorization (NNMF) can be used as a technique for reducing the complexity of the analysis of a term-document matrix D (as produced by TF-IDF), hence mitigating some problems in information retrieval (see Chang et al. 2002). The original matrix D is approximated by the product of two low-rank matrices: the document-topic matrix W and the topic-word matrix H. The rank of these matrices is given by the number k of topics. The entry W(i,k) represents the importance of topic k to document i, and H(k,j) represents the importance of term j to topic k. My implementation was a re-work of the examples given on the sklearn library’s website (http://scikit-learn.org).
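The factorization step can be sketched as follows with scikit-learn's NMF class (current API; the 0.11 code the post is based on differs slightly). D here is a small stand-in for the tf-idf matrix of step (ii).

```python
# Sketch of step (iii): factorize a tf-idf matrix into W and H.
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer

documents = [
    "bombing attack london transport",
    "election results polling day",
    "attack aftermath london police",
]
vectorizer = TfidfVectorizer(max_df=0.95)
D = vectorizer.fit_transform(documents)

k = 2                                    # number of topics
nmf = NMF(n_components=k, init='nndsvd', random_state=0)
W = nmf.fit_transform(D)                 # (n_docs, k): topic weights per document
H = nmf.components_                      # (k, n_terms): term weights per topic

# Show the top terms of each topic (vocabulary_ maps term -> column index).
terms = sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)
for topic_idx, row in enumerate(H):
    top = [terms[i] for i in row.argsort()[::-1][:3]]
    print("Topic %d: %s" % (topic_idx, ", ".join(top)))
```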
Discussion:
As expected, solution 2, which extracts nouns first, takes much longer (more than a minute) than the solution without noun extraction (less than 3 seconds), but it yields much cleaner data.
As a final note, I’d like to point out that the solutions presented here are not exhaustive. Graph-based hybrid solutions like the TextRank algorithm give very good results. I haven’t presented one because of (i) my unfamiliarity with graph-based information retrieval, and (ii) the availability of a very good implementation which would only have required me to load the data (you can find it here: https://gist.github.com/1646117 , too good to be true really).
Hi,
I believe norm=None also needs to be passed to text.TfidfVectorizer(), otherwise some topics may end up having the same set of words. Normalization needs to occur after nmf.fit, according to this article: http://web.stanford.edu/class/ee378b/papers/xu-lin-gong-nmf.pdf .
from sklearn import preprocessing
# Factorize the unnormalized tf-idf matrix first, then L2-normalize the
# resulting factors row-wise.
W = preprocessing.normalize(nmf.fit_transform(tfidf_matrix), norm='l2', copy=False)
H = preprocessing.normalize(nmf.components_, norm='l2', copy=False)