Last active: May 18, 2018 15:31
Save NoelKennedy/93d5b8fb96895a6093b96560affe73d7 to your computer and use it in GitHub Desktop.
Method 2 : Hide each token in the input sentence one-by-one and see how this changes the likelihood of the sentence's class
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import math

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.ticker import PercentFormatter
# x is a vector representing a sentence: a 1-D array of token integers.
def create_occlusion_batch(x):
    """Build one occluded copy of sentence ``x`` per token position.

    Parameters
    ----------
    x : 1-D integer array of token ids.

    Returns
    -------
    A matrix Z of shape (n_tokens, n_tokens) such that Z[i][j] == x[j]
    except on the diagonal, where Z[i][i] == 0 — i.e. token i is blanked
    out with the padding indicator.
    """
    number_tokens_in_sentence = x.shape[0]
    # Vectorized equivalent of copying the sentence once per token and
    # zeroing one position in each copy: repeat, then zero the diagonal.
    z = np.tile(x, (number_tokens_in_sentence, 1))
    np.fill_diagonal(z, 0)
    return z
# Pick the dev-set sentence the analysis will run on.
sentence_id = np.random.randint(0, y_dev.shape[0])
print('sentence_id %s' % sentence_id)

# Minimum change in class likelihood considered worth reporting (0.01 == 1%).
significant_difference = 0.000001

original_sentence = x_dev[sentence_id]
original_sentence_2 = x_dev_2[sentence_id]
original_sentence_label = y_dev[sentence_id]

# One row per token position, with that token blanked out.
z = create_occlusion_batch(original_sentence)
# Repeat the second (disease-phrase) input once per occluded row so the
# two model inputs stay aligned.
z_disease_phrase = np.stack([original_sentence_2 for _ in range(z.shape[0])])

# Model predictions with each token occluded in turn.
y_pred_occlusion = cpu_model.predict([z, z_disease_phrase])

# Effect of each occlusion on the sentence's true class.
true_class = np.argmax(original_sentence_label)
base_prediction = base_predictions[sentence_id][true_class]

tokens_no_null = embeddings_to_tokens(original_sentence, original_sentence_2, retain_null=False)
tokens = embeddings_to_tokens(original_sentence, original_sentence_2, retain_null=True)

# All-caps alphabetic tokens are treated as the diagnosis phrase.
diagnosis = ' '.join([token.lower() for token in tokens_no_null if token.upper() == token and token.isalpha()])

print('true class: %s %s, model prediction %s' % (class_labels[true_class], diagnosis, base_prediction))
print('** original sentence')
print(' '.join(tokens_no_null))
print('**')

important_tokens = list()
impact = list()
for position in range(z.shape[0]):
    # Padding positions carry no token — skip them.
    if original_sentence[position] == 0:
        continue
    occluded_score = y_pred_occlusion[position][true_class]
    # Guard: occluding this token makes no measurable difference.
    if math.isclose(occluded_score, base_prediction, abs_tol=significant_difference):
        continue
    important_tokens.append(tokens[position])
    impact.append(100 * (occluded_score - base_prediction))

# barh draws bottom-up; reverse so the first token appears at the top.
important_tokens.reverse()
impact.reverse()

y_pos = np.arange(len(important_tokens))
matplotlib.rcParams['figure.figsize'] = [7, 10]
plt.barh(y_pos, impact, align='center', alpha=0.5)
plt.yticks(y_pos, important_tokens)
plt.ylabel('Occluded token')
plt.xlabel('Change in % likelihood of true class when token is occluded')
plt.title('True class: %s %s' % (class_labels[true_class], diagnosis))
plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.