Last active: May 18, 2018 15:31
Save NoelKennedy/93d5b8fb96895a6093b96560affe73d7 to your computer and use it in GitHub Desktop.
Method 2 : Hide each token in the input sentence one-by-one and see how this changes the likelihood of the sentence's class
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import math

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.ticker import PercentFormatter
# x is a vector representing a sentence: a 1-D array of token integers.
def create_occlusion_batch(x):
    """Build one occluded copy of sentence ``x`` per token position.

    Parameters
    ----------
    x : 1-D integer array of token ids.

    Returns
    -------
    A matrix Z of shape (n_tokens, n_tokens) such that Z[i][j] == x[j]
    except on the diagonal, where Z[i][i] == 0 — i.e. token i is blanked
    out with the padding indicator.
    """
    number_tokens_in_sentence = x.shape[0]
    # Vectorized equivalent of copying the sentence once per token and
    # zeroing one position in each copy: repeat, then zero the diagonal.
    z = np.tile(x, (number_tokens_in_sentence, 1))
    np.fill_diagonal(z, 0)
    return z
# Pick the dev-set sentence the analysis will run on.
sentence_id = np.random.randint(0, y_dev.shape[0])
print('sentence_id %s' % sentence_id)

# Minimum change in class likelihood considered worth reporting (0.01 == 1%).
significant_difference = 0.000001

original_sentence = x_dev[sentence_id]
original_sentence_2 = x_dev_2[sentence_id]
original_sentence_label = y_dev[sentence_id]

# One row per token position, with that token blanked out.
z = create_occlusion_batch(original_sentence)
# Repeat the second (disease-phrase) input once per occluded row so the
# two model inputs stay aligned.
z_disease_phrase = np.stack([original_sentence_2 for _ in range(z.shape[0])])

# Model predictions with each token occluded in turn.
y_pred_occlusion = cpu_model.predict([z, z_disease_phrase])

# Effect of each occlusion on the sentence's true class.
true_class = np.argmax(original_sentence_label)
base_prediction = base_predictions[sentence_id][true_class]

tokens_no_null = embeddings_to_tokens(original_sentence, original_sentence_2, retain_null=False)
tokens = embeddings_to_tokens(original_sentence, original_sentence_2, retain_null=True)

# All-caps alphabetic tokens are treated as the diagnosis phrase.
diagnosis = ' '.join([token.lower() for token in tokens_no_null if token.upper() == token and token.isalpha()])

print('true class: %s %s, model prediction %s' % (class_labels[true_class], diagnosis, base_prediction))
print('** original sentence')
print(' '.join(tokens_no_null))
print('**')

important_tokens = list()
impact = list()
for position in range(z.shape[0]):
    # Padding positions carry no token — skip them.
    if original_sentence[position] == 0:
        continue
    occluded_score = y_pred_occlusion[position][true_class]
    # Guard: occluding this token makes no measurable difference.
    if math.isclose(occluded_score, base_prediction, abs_tol=significant_difference):
        continue
    important_tokens.append(tokens[position])
    impact.append(100 * (occluded_score - base_prediction))

# barh draws bottom-up; reverse so the first token appears at the top.
important_tokens.reverse()
impact.reverse()

y_pos = np.arange(len(important_tokens))
matplotlib.rcParams['figure.figsize'] = [7, 10]
plt.barh(y_pos, impact, align='center', alpha=0.5)
plt.yticks(y_pos, important_tokens)
plt.ylabel('Occluded token')
plt.xlabel('Change in % likelihood of true class when token is occluded')
plt.title('True class: %s %s' % (class_labels[true_class], diagnosis))
plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.