# document prediction
import PyPDF2
import textract

def read_pdfs(pdf_file_name):
    pdf = PyPDF2.PdfFileReader(open(pdf_file_name, 'rb'))
    num_pages = pdf.numPages
    count = 0
    text = ""
    while count < num_pages:
        pageObj = pdf.getPage(count)
        count += 1
        text += pageObj.extractText()
    # PyPDF2 cannot read scanned (image-only) PDFs, so if it returned no
    # words, fall back to OCR via textract.
    if text == "":
        text = textract.process(pdf_file_name, method='tesseract', language='eng')
    return text
doc2 = read_pdfs('//vfsyd04/shared/cbg/IMO/Production/DIV - Business Banking/07 Projects/Element AI/\
BB Docs/Sample of Ezidox/train_20180630_Comparative Profit and Loss.pdf')
doc3 = read_pdfs('//vfsyd04/shared/cbg/IMO/Production/DIV - Business Banking/07 Projects/Element AI/\
BB Docs/Sample of Ezidox/train_Corporate Family Tree 15112017.pdf')
doc4 = read_pdfs('//vfsyd04/shared/cbg/IMO/Production/DIV - Business Banking/07 Projects/Element AI/\
BB Docs/Sample of Ezidox/train_Financial Statements - FY16 Jamclan Trust.pdf')
doc5 = read_pdfs('//vfsyd04/shared/cbg/IMO/Production/DIV - Business Banking/07 Projects/Element AI/\
BB Docs/Sample of Ezidox/train_Financial Statements - Pre Assessment 15112017.pdf')
doc6 = read_pdfs('//vfsyd04/shared/cbg/IMO/Production/DIV - Business Banking/07 Projects/Element AI/\
BB Docs/Sample of Ezidox/train_Professional_Indemnity_Insurance_09022018035943.pdf')
doc_complete = [doc2, doc3, doc4, doc5, doc6]
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
#nltk.download()  # uncomment on first run to fetch the stopwords and wordnet corpora

stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()
def clean(doc):
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized

doc_clean = [clean(doc).split() for doc in doc_complete]
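# A quick sanity check of clean() (my addition, not in the original gist):
# stopwords and punctuation drop out and plurals lemmatize to the noun form.
print(clean("The cats are running!"))  # -> "cat running"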
# Importing Gensim
from gensim import corpora
import gensim

# Creating the term dictionary of our corpus, where every unique term is assigned an index.
dictionary = corpora.Dictionary(doc_clean)

# Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]

Lda = gensim.models.ldamodel.LdaModel
# Running and training the LDA model on the document term matrix.
ldamodel = Lda(doc_term_matrix, num_topics=3, id2word=dictionary, passes=50)
print(ldamodel.print_topics(num_topics=3, num_words=5))
#ldamodel.print_topics()
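# My addition, not in the original gist: gensim's LdaModel also exposes
# per-document topic mixtures, which shows which topic each training
# document leans toward.
for i, bow in enumerate(doc_term_matrix):
    print(i, ldamodel.get_document_topics(bow))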
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
import numpy as np

doc_target = [0, 1, 0, 0, 2]  # 0 - financial, 1 - family, 2 - insurance
# Adding test data. Note doc6 is reassigned here; this is safe because
# doc_complete already holds the training texts.
doc1 = read_pdfs('//vfsyd04/shared/cbg/IMO/Production/DIV - Business Banking/07 Projects/Element AI/\
BB Docs/Sample of Ezidox/test_20170630_Consolidated Financials.pdf')
doc6 = read_pdfs('//vfsyd04/shared/cbg/IMO/Production/DIV - Business Banking/07 Projects/Element AI/\
BB Docs/Sample of Ezidox/test_Corporate_Family_Tree_09022018035933.pdf')
doc7 = read_pdfs('//vfsyd04/shared/cbg/IMO/Production/DIV - Business Banking/07 Projects/Element AI/\
BB Docs/Sample of Ezidox/test_Financial_Statements__Pre_Assessment_09022018035933.pdf')
doc8 = read_pdfs('//vfsyd04/shared/cbg/IMO/Production/DIV - Business Banking/07 Projects/Element AI/\
BB Docs/Sample of Ezidox/test_Professional Indemnity Insurance.pdf')
doc_test = [doc1, doc6, doc7, doc8]
doc_test_target = [0, 1, 0, 2]
from sklearn.linear_model import SGDClassifier  # a linear SVM via hinge loss

text_clf_svm = Pipeline([('vect', CountVectorizer(stop_words='english')),
                         ('tfidf', TfidfTransformer()),
                         ('clf-svm', SGDClassifier(loss='hinge', penalty='l2',
                                                   alpha=1e-3, n_iter=5,  # n_iter became max_iter in newer scikit-learn
                                                   random_state=42))])
_ = text_clf_svm.fit(doc_complete, doc_target)
predicted_svm = text_clf_svm.predict(doc_complete)
print(np.mean(predicted_svm == doc_target))  # training accuracy, not a held-out score
print(predicted_svm)
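# My addition, not in the original gist: score the same pipeline on the
# held-out test documents rather than the training set.
predicted_svm_test = text_clf_svm.predict(doc_test)
print(np.mean(predicted_svm_test == doc_test_target))
print(predicted_svm_test)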
import nltk
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english", ignore_stopwords=True)

class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: [stemmer.stem(w) for w in analyzer(doc)]

stemmed_count_vect = StemmedCountVectorizer(stop_words='english')
text_mnb_stemmed = Pipeline([('vect', stemmed_count_vect),
                             ('tfidf', TfidfTransformer()),
                             ('mnb', MultinomialNB(fit_prior=False))])
text_mnb_stemmed = text_mnb_stemmed.fit(doc_complete, doc_target)
predicted_mnb_stemmed = text_mnb_stemmed.predict(doc_test)
print(np.mean(predicted_mnb_stemmed == doc_test_target))
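# My addition, not in the original gist: a confusion matrix gives more
# detail than raw accuracy on which document types get confused.
from sklearn import metrics
print(metrics.confusion_matrix(doc_test_target, predicted_mnb_stemmed))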
import pandas as pd
df = pd.DataFrame(data={'doc': doc_complete, 'target': doc_target})
category_id_df = df[['doc', 'target']].drop_duplicates().sort_values('target')
category_to_id = dict(category_id_df.values)
id_to_category = dict(category_id_df[['target', 'doc']].values)
print(df)
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=1, norm='l2', encoding='latin-1',
                        ngram_range=(1, 2), stop_words='english')
features = tfidf.fit_transform(df.doc).toarray()
labels = df.target
print(features.shape)
from sklearn.feature_selection import chi2

N = 2
for Product, category_id in sorted(category_to_id.items()):
    # Product here is the raw document text, so only its first 50 chars are printed.
    features_chi2 = chi2(features, labels == category_id)
    indices = np.argsort(features_chi2[0])
    feature_names = np.array(tfidf.get_feature_names())[indices]
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
    print("\n# '{}':".format(Product[0:50]))
    print("  . Most correlated unigrams:\n. {}".format('\n. '.join(unigrams[-N:])))
    print("  . Most correlated bigrams:\n. {}".format('\n. '.join(bigrams[-N:])))