Created
January 31, 2018 20:17
-
-
Save Juancard/0ea387f4a43104ed26a7d15cb8312a90 to your computer and use it in GitHub Desktop.
Sentiment analysis over imdb database using convolutional neural networks
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf-8 | |
# Sentiment analysis over the IMDB database using convolutional neural networks.
# The ConvNet architecture follows the design described in Yoon Kim's paper:
# "Kim, Y. (2014). Convolutional Neural Networks for Sentence Classification. | |
# Proceedings of the 2014 Conference on Empirical Methods in Natural Language Processing (EMNLP 2014), 1746–1751." | |
# Link: http://arxiv.org/abs/1408.5882 | |
# In[1]: | |
# sequence classification in the IMDB dataset | |
import numpy as np | |
import pandas as pd | |
import h5py | |
# Seed NumPy's global random generator up front, for reproducibility.
seed = 7
np.random.seed(seed)
from keras.datasets import imdb | |
from keras.models import Sequential | |
from keras.models import load_model, Model | |
from keras.layers import Dense | |
from keras.layers import Convolution1D, GlobalMaxPooling1D, MaxPooling1D,Flatten, Dropout, Input | |
from keras.layers.embeddings import Embedding | |
from keras.preprocessing import sequence | |
from keras.preprocessing.text import text_to_word_sequence | |
from keras.callbacks import TensorBoard | |
from keras.layers.merge import concatenate | |
from keras.layers import Merge | |
from keras.utils.vis_utils import plot_model | |
from keras.callbacks import ModelCheckpoint | |
# In[2]: | |
print("Loading imdb dataset")
# Vocabulary size requested from the loader (passed below as num_words).
top_words = 10000
# Loader settings are spelled out explicitly; start_char / oov_char /
# index_from use the same values as INDEX_START / INDEX_UNK / INDEX_FROM
# in the prediction section of this script.
imdb_load_options = dict(
    path="imdb.npz",
    num_words=top_words,
    skip_top=0,
    maxlen=None,
    seed=113,
    start_char=1,
    oov_char=2,
    index_from=3,
)
(x_train, y_train), (x_test, y_test) = imdb.load_data(**imdb_load_options)
# In[3]: | |
print("Encoding sentences")
# Bring every review to one common length before feeding the network.
# NOTE: per the original author, 1600 is below the longest review found in
# both the train and the test split.
max_review_length = 1600
x_train, x_test = (
    sequence.pad_sequences(reviews, maxlen=max_review_length)
    for reviews in (x_train, x_test)
)
# In[4]: | |
# Hyper-parameters of the convolutional model built below.
embedding_vector_length = 128  # output dimension of the Embedding layer
FILTER_SIZES = [3, 4, 5]       # one convolution branch per kernel size
FILTERS = 128                  # number of filters in each convolution
P_DROPOUT = 0.5                # rate handed to the Dropout layer
# In[5]: | |
print("Setting up Convolutional Network")
# Integer word ids go in; one embedding layer is shared by all branches.
input_layer = Input(shape=(max_review_length,), dtype='int32', name='sentence')
embedding = Embedding(
    top_words,
    embedding_vector_length,
    input_length=max_review_length,
    name="embedding",
)(input_layer)

# One convolution + max-pooling branch per kernel size in FILTER_SIZES.
convs = []
for kernel_size in FILTER_SIZES:
    name = str(kernel_size) + "ks"
    conv = Convolution1D(
        filters=FILTERS,
        kernel_size=kernel_size,
        activation='relu',
        name="conv_" + name,
    )(embedding)
    # Pooling window of max_review_length - kernel_size + 1 steps; presumably
    # the full length of the convolution output (assuming Keras' default
    # padding and stride), so each filter collapses to a single value.
    maxPooling = MaxPooling1D(
        pool_size=max_review_length - kernel_size + 1,
        name="maxpool_" + name,
    )(conv)
    convs.append(maxPooling)

# merge the branches, flatten them and apply dropout
merged = concatenate(convs, name="concatenation")
flat = Flatten(name="flatten_layer")(merged)
drop = Dropout(P_DROPOUT, name="dropout_%.2f" % P_DROPOUT)(flat)

# interpretation: a single sigmoid unit, trained with binary cross-entropy
outputs = Dense(1, activation='sigmoid')(drop)
model = Model(inputs=input_layer, outputs=outputs, name="output")

# compile
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()
# In[22]: | |
# Render the architecture diagram. The target folder is created first so that
# writing the image does not depend on "plots/" already existing.
import os

plot_path = 'plots/zhang_architecture.png'
plot_dir = os.path.dirname(plot_path)
if not os.path.isdir(plot_dir):
    os.makedirs(plot_dir)
plot_model(model, show_shapes=True, to_file=plot_path)
# In[10]: | |
# Checkpoint callback: monitors 'val_acc' in 'max' mode and only keeps a new
# file when the monitored value improves (save_best_only). The epoch number
# and the validation accuracy are templated into the file name.
filepath = "za_weights-improvement-{epoch:02d}-{val_acc:.2f}.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='val_acc',
    mode='max',
    save_best_only=True,
    verbose=1,
)
callbacks_list = [checkpoint]
# In[11]: | |
# Train the network.
# NOTE(review): the test split doubles as validation data here, so the
# checkpoint callback selects its best epoch on the test set.
print("Fitting model")
model.fit(
    x_train,
    y_train,
    validation_data=(x_test, y_test),
    epochs=2,
    batch_size=16,
    callbacks=callbacks_list,
    verbose=2,
)
# In[12]: | |
# Persist the trained model; the file name encodes the training settings.
print("Saving the model")
model_path = 'zhang_architecture_2epochs_16batchsize_128embeddings_128filters_3ks_4ks_5ks_0point5dropout'
model.save(model_path)
# ## Prediction over kaggle test data | |
# In[13]: | |
print("Starting predictions")
# Special ids used when encoding raw text for prediction. They carry the same
# values as index_from / oov_char / start_char passed to imdb.load_data above.
INDEX_FROM = 3   # offset added to every word index
INDEX_UNK = 2    # id used for unknown / out-of-vocabulary words
INDEX_START = 1  # id placed at the start of every sequence
def preprocess_imdb(review):
    """Encode a raw review string as a list of IMDB word ids.

    The text is tokenized with ``text_to_word_sequence`` and each token is
    looked up in the module-level ``word_index`` mapping. The result starts
    with ``INDEX_START``; known words are shifted by ``INDEX_FROM``; words
    that are missing from the mapping, or whose shifted id falls outside
    the ``top_words`` vocabulary, are encoded as ``INDEX_UNK``.

    Args:
        review: Review text to encode.

    Returns:
        A list of integer word ids: the start marker followed by one id per
        token of ``review``.
    """
    # INDEX_START as first element in sequence (keras convention)
    words_index_list = [INDEX_START]
    for word in text_to_word_sequence(review):
        rank = word_index.get(word)
        # A word is in-vocabulary when its shifted id is below top_words.
        # Per the Keras imdb loader, this is the cut-off that
        # imdb.load_data(num_words=top_words) applies to the training data
        # (the previous "< top_words - INDEX_FROM - 1" test dropped the last
        # in-vocabulary word).
        if rank is not None and rank + INDEX_FROM < top_words:
            words_index_list.append(rank + INDEX_FROM)
        else:
            words_index_list.append(INDEX_UNK)
    return words_index_list
# In[14]: | |
print("Loadin test dataset to predict")
# DOWNLOAD COLLECTION HERE:
# https://www.kaggle.com/c/word2vec-nlp-tutorial/data
collection_path = "/path/to/imdb_reviews_sentim_analisys/"
# Read the tab-separated test reviews; the first row is the header.
test_file = collection_path + "testData.tsv"
kaggle_test_df = pd.read_csv(
    test_file,
    header=0,
    delimiter="\t",
    quoting=3,  # 3 is the value of csv.QUOTE_NONE
    encoding="utf-8",
)
# In[15]: | |
# print() call form, consistent with the rest of the script.
print("Imdb dataset: Loading map from word to index")
# Word -> integer index dictionary consulted by preprocess_imdb.
word_index = imdb.get_word_index()
# In[16]: | |
# Encode every Kaggle review with preprocess_imdb, then pad the encoded
# sequences to the model's input length.
preprocess_test = kaggle_test_df['review'].apply(preprocess_imdb)
x_test_predict = sequence.pad_sequences(
    preprocess_test,
    maxlen=max_review_length,
)
print("Shape of dataset to predict: %s" % (x_test_predict.shape,))
# In[17]: | |
print("Predicting one review")
neg_rev = "With all the controversy back in 2016 over the #Oscarssowhite shambles, it seems that in 2017 the Academy has made a conscious effort to include as much diversity into the show as they possibly can. Unfortunately, the downside of that is that films like 'Moonlight', which are in reality very average, get recognition they don't deserve and people are fooled into thinking they are better than they actually are. 'Moonlight' is a simple film, in fact it's far too simple. There is almost nothing thought-provoking or interesting that happens for the entire 110 minute run time. Yet somehow it's up for a plethora of awards. Go figure. Mahershala Ali and Naomie Harris have each been nominated for Academy Awards in their respective Supporting categories. Ali is quite brilliant, in fact he's the highlight of the film. He's in nearly the entire first third of the film and I was starting to wonder how this was considered a 'Supporting' role, yet he soon drops away. I wish he had been in it for longer though, because he was quite superb. Harris was also quite good in her role. She has a more spread out performance in the film, reoccurring in each chapter. I wouldn't say she blew me away, but she was certainly solid in her role. I will predict Ali to win his category, and Harris to miss out. 'Moonlight' is one of those films that just kind of drifts along until the credits role. The question I kept asking myself as I watched it was, what is meant to be so extraordinary about these characters? What part of this story justifies making a film out of it? To me it appears that some impressive acting and some false award nominations have tricked people into thinking this film is better than it is. Very disappointing."
# Encode the single review as a batch of one, padded to the model's input length.
neg_rev_enc = sequence.pad_sequences([preprocess_imdb(neg_rev)], maxlen=max_review_length)
prediction = model.predict(neg_rev_enc)
# The batch holds one review and the model ends in a single sigmoid unit, so
# unwrap the one value explicitly instead of formatting/rounding the array.
score = float(prediction[0][0])
print("%.4f - %s" % (score, "Positive" if round(score) == 1 else "Negative"))
# In[18]: | |
# print() call form, consistent with the rest of the script.
print("Predicting all dataset")
# Score every encoded Kaggle review and show the first five raw outputs.
model_predictions = model.predict(x_test_predict, verbose=0)
print(model_predictions[:5])
# In[21]: | |
# Turn the sigmoid scores into 0/1 labels with a vectorised 0.5 threshold.
# This avoids calling the built-in round() on one-element array rows, which
# depends on implicit array-to-scalar conversion. Scores of exactly 0.5 map
# to 1.
mod_pred_round = (np.asarray(model_predictions).ravel() >= 0.5).astype(int).tolist()
print(mod_pred_round[:5])
# In[22]: | |
# Write the test results: one row per Kaggle review with its id and the
# predicted 0/1 sentiment label.
print("Writing prediction results")
output = pd.DataFrame(
    data={"id": kaggle_test_df["id"], "sentiment": mod_pred_round}
)
output.to_csv(
    "imdb_zhang_architecture_2epochs_16batchsize_128embeddings_128filters_3ks_4ks_5ks_0point5dropout.csv",
    index=False,
    quoting=3,  # 3 is the value of csv.QUOTE_NONE
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment