@matthewlenz
Created August 7, 2023 14:22
Richard Gruss - Text Classification With Python
# Note: This will probably require some tweaking. Dr. Gruss sent this to me but said he hasn't used it in a very long time.
# Video link: https://www.youtube.com/watch?v=EfEW3_RLnGA
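# Setup note (an assumption, not spelled out in the gist): word_tokenize and the
# stopwords list need NLTK's data packages downloaded once, e.g.
#   python -c "import nltk; nltk.download('punkt'); nltk.download('stopwords')"
# The script also expects the BBC News dataset unpacked at BASE_DIR, with one
# folder of .txt files per label in LABELS.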
import os
import random
import string
from nltk import word_tokenize
from collections import defaultdict
from nltk import FreqDist
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
import pickle
stop_words = set(stopwords.words('english'))
stop_words.add('said')
stop_words.add('mr')
BASE_DIR = '/Users/rgruss/workspace/AI/nlp/bbc'
LABELS = ['business', 'entertainment', 'politics', 'sport', 'tech']
def create_data_set():
    with open('data.txt', 'w', encoding='utf8') as outfile:
        for label in LABELS:
            label_dir = '%s/%s' % (BASE_DIR, label)
            for filename in os.listdir(label_dir):
                fullfilename = '%s/%s' % (label_dir, filename)
                print(fullfilename)
                with open(fullfilename, 'rb') as file:
                    text = file.read().decode(errors='replace').replace('\n', '')
                    outfile.write('%s\t%s\t%s\n' % (label, filename, text))

def setup_docs():
    docs = []  # (label, text)
    with open('data.txt', 'r', encoding='utf8') as datafile:
        for row in datafile:
            parts = row.split('\t')
            doc = (parts[0], parts[2].strip())
            docs.append(doc)
    return docs

def clean_text(text):
    # remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # convert to lower case
    text = text.lower()
    return text

def get_tokens(text):
    # get individual words
    tokens = word_tokenize(text)
    # remove common words that carry little signal
    tokens = [t for t in tokens if t not in stop_words]
    return tokens

def print_frequency_dist(docs):
    tokens = defaultdict(list)
    # build one big list of all the words for each category
    for doc in docs:
        doc_label = doc[0]
        doc_text = clean_text(doc[1])
        doc_tokens = get_tokens(doc_text)
        tokens[doc_label].extend(doc_tokens)
    for category_label, category_tokens in tokens.items():
        print(category_label)
        fd = FreqDist(category_tokens)
        print(fd.most_common(20))

def get_splits(docs):
    # shuffle the docs so the train/test split isn't ordered by label
    random.shuffle(docs)
    X_train = []  # training documents
    y_train = []  # corresponding training labels
    X_test = []   # test documents
    y_test = []   # corresponding test labels
    pivot = int(.80 * len(docs))
    for i in range(0, pivot):
        X_train.append(docs[i][1])
        y_train.append(docs[i][0])
    for i in range(pivot, len(docs)):
        X_test.append(docs[i][1])
        y_test.append(docs[i][0])
    return X_train, X_test, y_train, y_test

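# Alternative sketch (an assumption, not part of the original gist): scikit-learn's
# train_test_split can replace get_splits, and stratify= keeps each label's
# proportion the same in both halves, which a plain shuffle does not guarantee.
def get_splits_sklearn(docs):
    from sklearn.model_selection import train_test_split
    texts = [doc[1] for doc in docs]
    labels = [doc[0] for doc in docs]
    # returns X_train, X_test, y_train, y_test in the same order as get_splits
    return train_test_split(texts, labels, test_size=0.2,
                            stratify=labels, random_state=42)
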
def evaluate_classifier(title, classifier, vectorizer, X_test, y_test):
    X_test_vec = vectorizer.transform(X_test)
    y_pred = classifier.predict(X_test_vec)
    # this is a multiclass problem, so the scores need an averaging strategy;
    # without average= sklearn defaults to 'binary' and raises an error here.
    # 'weighted' averages the per-class scores by class frequency.
    precision = metrics.precision_score(y_test, y_pred, average='weighted')
    recall = metrics.recall_score(y_test, y_pred, average='weighted')
    f1 = metrics.f1_score(y_test, y_pred, average='weighted')
    print("%s\t%f\t%f\t%f\n" % (title, precision, recall, f1))

def train_classifier(docs):
    X_train, X_test, y_train, y_test = get_splits(docs)
    # the object that turns text into vectors
    vectorizer = CountVectorizer(stop_words='english',
                                 ngram_range=(1, 3),
                                 min_df=3, analyzer='word')
    # create the document-term matrix
    dtm = vectorizer.fit_transform(X_train)
    # train the Naive Bayes classifier
    naive_bayes_classifier = MultinomialNB().fit(dtm, y_train)
    evaluate_classifier("Naive Bayes\tTRAIN\t", naive_bayes_classifier, vectorizer, X_train, y_train)
    evaluate_classifier("Naive Bayes\tTEST\t", naive_bayes_classifier, vectorizer, X_test, y_test)
    # store the classifier
    clf_filename = 'naive_bayes_classifier.pkl'
    with open(clf_filename, 'wb') as clf_file:
        pickle.dump(naive_bayes_classifier, clf_file)
    # also store the vectorizer so we can transform new data
    vec_filename = 'count_vectorizer.pkl'
    with open(vec_filename, 'wb') as vec_file:
        pickle.dump(vectorizer, vec_file)

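# Design note (a hedged sketch, not in the original): the two pickles could be
# collapsed into one by wrapping the fitted vectorizer and classifier in a
# sklearn Pipeline, so inference only has to load a single artifact:
#   from sklearn.pipeline import make_pipeline
#   with open('nb_pipeline.pkl', 'wb') as f:
#       pickle.dump(make_pipeline(vectorizer, naive_bayes_classifier), f)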
def classify(text):
    # load the trained classifier
    clf_filename = 'naive_bayes_classifier.pkl'
    with open(clf_filename, 'rb') as clf_file:
        nb_clf = pickle.load(clf_file)
    # load the fitted vectorizer and transform the new text
    vec_filename = 'count_vectorizer.pkl'
    with open(vec_filename, 'rb') as vec_file:
        vectorizer = pickle.load(vec_file)
    pred = nb_clf.predict(vectorizer.transform([text]))
    print(pred[0])

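# Hedged extension (not in the original gist): MultinomialNB also exposes
# predict_proba, so each label's probability can be shown, not just the argmax.
def classify_with_confidence(text):
    with open('naive_bayes_classifier.pkl', 'rb') as clf_file:
        nb_clf = pickle.load(clf_file)
    with open('count_vectorizer.pkl', 'rb') as vec_file:
        vectorizer = pickle.load(vec_file)
    probs = nb_clf.predict_proba(vectorizer.transform([text]))[0]
    # print labels from most to least likely
    for label, prob in sorted(zip(nb_clf.classes_, probs), key=lambda pair: -pair[1]):
        print('%s\t%.3f' % (label, prob))
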
if __name__ == '__main__':
    # create_data_set()  # run once to build data.txt from the BBC folders (returns nothing)
    docs = setup_docs()
    # word frequencies
    # print_frequency_dist(docs)
    train_classifier(docs)
    # new_tech_doc = "Google showed off some new camera features on the Pixel 4 today at its annual hardware event, focusing on improvements to its Live HDR and Night Sight mode. The back of the Pixel 4 houses dual cameras in a new subtle square camera bump. There’s a 12.2MP main camera and a 16MP telephoto lens, which is a hybrid of optical and digital zoom. New Pixel 4 features include Live HDR+, with dual exposure controls in the viewfinder, which shows how photos will look in real time. There are HDR sliders to adjust brightness and shadows when you compose. A learning-based white balance feature is applied to all photo modes, so shots come out with true-to-life colors."
    # classify(new_tech_doc)
    # new_entertainment_doc = "Scarlett Johnasson is working much longer than nine to five! With two films coming out this year, another film currently in production, a wedding to plan on the horizon, and a five-year-old daughter to raise, she has a full plate! The 34-year-old actress still managed to hit the red carpet at this year's Elle Women In Hollywood Awards on Monday, and talk with ET's Nischelle Turner about how she manages to find any semblance of a work-life balance as well as reacting to Dolly Parton requesting that she play her in a biopic."
    # classify(new_entertainment_doc)