# document prediction
import PyPDF2
import textract

def read_pdfs(pdf_file_name):
    pdf = PyPDF2.PdfFileReader(open(pdf_file_name, 'rb'))
    num_pages = pdf.numPages
    count = 0
    text = ""
    while count < num_pages:
        pageObj = pdf.getPage(count)
        count += 1
        text += pageObj.extractText()
    # PyPDF2 cannot read scanned (image-only) PDFs, so if it returned no
    # words, fall back to OCR via textract.
    if text == "":
        text = textract.process(pdf_file_name, method='tesseract', language='eng')
    return text
doc2 = read_pdfs('//vfsyd04/shared/cbg/IMO/Production/DIV - Business Banking/07 Projects/Element AI/\
BB Docs/Sample of Ezidox/train_20180630_Comparative Profit and Loss.pdf')
doc3 = read_pdfs('//vfsyd04/shared/cbg/IMO/Production/DIV - Business Banking/07 Projects/Element AI/\
BB Docs/Sample of Ezidox/train_Corporate Family Tree 15112017.pdf')
doc4 = read_pdfs('//vfsyd04/shared/cbg/IMO/Production/DIV - Business Banking/07 Projects/Element AI/\
BB Docs/Sample of Ezidox/train_Financial Statements - FY16 Jamclan Trust.pdf')
doc5 = read_pdfs('//vfsyd04/shared/cbg/IMO/Production/DIV - Business Banking/07 Projects/Element AI/\
BB Docs/Sample of Ezidox/train_Financial Statements - Pre Assessment 15112017.pdf')
doc6 = read_pdfs('//vfsyd04/shared/cbg/IMO/Production/DIV - Business Banking/07 Projects/Element AI/\
BB Docs/Sample of Ezidox/train_Professional_Indemnity_Insurance_09022018035943.pdf')
doc_complete = [doc2, doc3, doc4, doc5, doc6]
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
#nltk.download()  # uncomment on first run to fetch the stopwords and wordnet corpora

stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()
def clean(doc):
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized

doc_clean = [clean(doc).split() for doc in doc_complete]
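# A quick sanity check of clean() (my addition, not in the original gist):
# stopwords and punctuation drop out and plurals lemmatize to the noun form.
print(clean("The cats are running!"))  # -> "cat running"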
# Importing Gensim
from gensim import corpora
import gensim

# Creating the term dictionary of our corpus, where every unique term is assigned an index.
dictionary = corpora.Dictionary(doc_clean)

# Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]

Lda = gensim.models.ldamodel.LdaModel
# Running and training the LDA model on the document term matrix.
ldamodel = Lda(doc_term_matrix, num_topics=3, id2word=dictionary, passes=50)
print(ldamodel.print_topics(num_topics=3, num_words=5))
#ldamodel.print_topics()
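# My addition, not in the original gist: gensim's LdaModel also exposes
# per-document topic mixtures, which shows which topic each training
# document leans toward.
for i, bow in enumerate(doc_term_matrix):
    print(i, ldamodel.get_document_topics(bow))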
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
import numpy as np

doc_target = [0, 1, 0, 0, 2]  # 0 - financial, 1 - family, 2 - insurance
# Adding test data. Note doc6 is reassigned here; this is safe because
# doc_complete already holds the training texts.
doc1 = read_pdfs('//vfsyd04/shared/cbg/IMO/Production/DIV - Business Banking/07 Projects/Element AI/\
BB Docs/Sample of Ezidox/test_20170630_Consolidated Financials.pdf')
doc6 = read_pdfs('//vfsyd04/shared/cbg/IMO/Production/DIV - Business Banking/07 Projects/Element AI/\
BB Docs/Sample of Ezidox/test_Corporate_Family_Tree_09022018035933.pdf')
doc7 = read_pdfs('//vfsyd04/shared/cbg/IMO/Production/DIV - Business Banking/07 Projects/Element AI/\
BB Docs/Sample of Ezidox/test_Financial_Statements__Pre_Assessment_09022018035933.pdf')
doc8 = read_pdfs('//vfsyd04/shared/cbg/IMO/Production/DIV - Business Banking/07 Projects/Element AI/\
BB Docs/Sample of Ezidox/test_Professional Indemnity Insurance.pdf')
doc_test = [doc1, doc6, doc7, doc8]
doc_test_target = [0, 1, 0, 2]
from sklearn.linear_model import SGDClassifier  # a linear SVM via hinge loss

text_clf_svm = Pipeline([('vect', CountVectorizer(stop_words='english')),
                         ('tfidf', TfidfTransformer()),
                         ('clf-svm', SGDClassifier(loss='hinge', penalty='l2',
                                                   alpha=1e-3, n_iter=5,  # n_iter became max_iter in newer scikit-learn
                                                   random_state=42))])
_ = text_clf_svm.fit(doc_complete, doc_target)
predicted_svm = text_clf_svm.predict(doc_complete)
print(np.mean(predicted_svm == doc_target))  # training accuracy, not a held-out score
print(predicted_svm)
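# My addition, not in the original gist: score the same pipeline on the
# held-out test documents rather than the training set.
predicted_svm_test = text_clf_svm.predict(doc_test)
print(np.mean(predicted_svm_test == doc_test_target))
print(predicted_svm_test)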
import nltk
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english", ignore_stopwords=True)

class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: [stemmer.stem(w) for w in analyzer(doc)]

stemmed_count_vect = StemmedCountVectorizer(stop_words='english')
text_mnb_stemmed = Pipeline([('vect', stemmed_count_vect),
                             ('tfidf', TfidfTransformer()),
                             ('mnb', MultinomialNB(fit_prior=False))])
text_mnb_stemmed = text_mnb_stemmed.fit(doc_complete, doc_target)
predicted_mnb_stemmed = text_mnb_stemmed.predict(doc_test)
print(np.mean(predicted_mnb_stemmed == doc_test_target))
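# My addition, not in the original gist: a confusion matrix gives more
# detail than raw accuracy on which document types get confused.
from sklearn import metrics
print(metrics.confusion_matrix(doc_test_target, predicted_mnb_stemmed))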
import pandas as pd
df = pd.DataFrame(data={'doc': doc_complete, 'target': doc_target})
category_id_df = df[['doc', 'target']].drop_duplicates().sort_values('target')
category_to_id = dict(category_id_df.values)
id_to_category = dict(category_id_df[['target', 'doc']].values)
print(df)
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=1, norm='l2', encoding='latin-1',
                        ngram_range=(1, 2), stop_words='english')
features = tfidf.fit_transform(df.doc).toarray()
labels = df.target
print(features.shape)
from sklearn.feature_selection import chi2

N = 2
for Product, category_id in sorted(category_to_id.items()):
    # Product here is the raw document text, so only its first 50 chars are printed.
    features_chi2 = chi2(features, labels == category_id)
    indices = np.argsort(features_chi2[0])
    feature_names = np.array(tfidf.get_feature_names())[indices]
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
    print("\n# '{}':".format(Product[0:50]))
    print("  . Most correlated unigrams:\n. {}".format('\n. '.join(unigrams[-N:])))
    print("  . Most correlated bigrams:\n. {}".format('\n. '.join(bigrams[-N:])))