Method 3: rank input sentences by a filter's activation score

For each convolutional filter, every training sentence is scored by the filter's maximum activation; for the top_n strongest matches, the code prints the n-gram of input tokens under the window that produced that activation (or the whole sentence when the layer isn't the first convolution).
# Converts an array of embedding integers to the list of tokens those integers represent.
# Tokens inside the disease phrase are shown in upper case.
# x_1 = array of embedding integers
# x_2 = array of disease phrase indicators (1 = token is part of the disease phrase)
# Relies on embedding_to_token_map, a module-level dict of embedding id -> token,
# which is assumed to be built elsewhere (e.g. from the word2vec vocabulary).
def embeddings_to_tokens(x_1, x_2, retain_null=True):
    tokens = [embedding_to_token_map[embedding_id] for embedding_id in x_1]
    for i in range(x_2.shape[0]):
        if x_2[i] == 1:
            tokens[i] = tokens[i].upper()
    if retain_null:
        return tokens
    else:
        # drop the padding tokens
        return [token for token in tokens if token != 'NULL']
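# Toy illustration of embeddings_to_tokens (hypothetical map and arrays, not from the
# gist). Assuming embedding_to_token_map = {0: 'NULL', 1: 'dog', 2: 'has', 3: 'arthritis'},
# x_1 = np.array([1, 2, 3, 0]) and x_2 = np.array([0, 0, 1, 0]):
#   embeddings_to_tokens(x_1, x_2)                    -> ['dog', 'has', 'ARTHRITIS', 'NULL']
#   embeddings_to_tokens(x_1, x_2, retain_null=False) -> ['dog', 'has', 'ARTHRITIS']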
# Remember, my problem set-up has two input vectors per training example (x_train and x_train_2).
# word2vec is a gensim word2vec model (kept in the signature for convenience; not used below).
def find_highest_activation_ngrams(x_train, x_train_2, model, word2vec, top_n,
                                   layer_name='conv1d_1',
                                   number_of_filters=128,
                                   print_ngrams=True  # only possible on the first convolutional layer
                                   ):
    from keras import backend as K
    import numpy as np

    # get the symbolic outputs of each "key" layer (we gave them unique names)
    layer_dict = dict([(layer.name, layer) for layer in model.layers])

    # these are the placeholders for the input training example; my CNN has 2 input vectors per example
    input_sentence, input_disease_phrase_marker = model.inputs

    # interrogate the model to get some values used in processing later
    layer_output = layer_dict[layer_name].output
    output_dimension = layer_output.shape[1].value
    max_sequence_length = x_train.shape[1]
    # a valid (unpadded) 1D convolution shrinks the sequence by filter_size - 1
    filter_size = (max_sequence_length - output_dimension) + 1

    for filter_index in range(number_of_filters):
        print('Processing filter %d' % filter_index)
        matched_on_padding = False  # flag when the filter matches padding, which is a bug

        # build a backend function mapping the two inputs to this filter's activations
        features = K.identity(layer_output[:, :, filter_index])
        extract_features = K.function([input_sentence, input_disease_phrase_marker], [features])

        # execute feature extraction over the whole training set
        features_extracted = extract_features([x_train, x_train_2])

        # find the cutoff score: the smallest of the top_n activations
        flattened = np.array(features_extracted).flatten()
        top_n_idx = np.argpartition(flattened, -top_n)[-top_n:]
        top_n_scores = [flattened[i] for i in top_n_idx]
        score_cutoff = min(top_n_scores)

        # now go through the training set and pull out the top_n n-grams which maximise the filter
        count = 0
        output_printing = list()
        for i, sentence_features in enumerate(features_extracted[0]):
            # if the sentence's highest match beats the worst top_n score, record it
            max_idx = np.argmax(sentence_features)
            highest_match = sentence_features[max_idx]
            if highest_match == 0.0:
                continue  # sometimes filters learn nothing, just skip
            if highest_match >= score_cutoff:
                # this sentence is a strong match for the filter
                if print_ngrams:
                    # recover the n-gram of tokens under the filter's receptive field
                    sentence_tokens = embeddings_to_tokens(x_train[i], x_train_2[i], retain_null=True)
                    matched_tokens = sentence_tokens[max_idx:max_idx + filter_size]
                    matched_string = ' '.join(matched_tokens)
                    output_printing.append((matched_string, highest_match))
                    count = count + 1
                    if matched_string == '' or matched_string == ' ':
                        # something's wrong here, debug print
                        print('somehow no matched tokens?')
                        print(sentence_tokens)
                        print('length %s' % len(sentence_tokens))
                        print('max_idx %s' % max_idx)
                        print('index %s' % i)
                        print('filter length %s' % filter_size)
                        matched_on_padding = True
                else:
                    # can't focus on a particular token sequence, so print the whole sentence
                    sentence_tokens = embeddings_to_tokens(x_train[i], x_train_2[i], retain_null=False)
                    output_printing.append((' '.join(sentence_tokens), highest_match))

        # print the matches, strongest activation first
        output_printing = sorted(output_printing, key=lambda x: x[1], reverse=True)
        for matched in output_printing:
            print(matched)
        if matched_on_padding:
            # return the raw activations for this filter to help debug the padding match
            return features_extracted[0]
        print('matches: %d' % count)
        print('score cutoff: %s' % score_cutoff)
        print('Finished filter %d' % filter_index)
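A minimal usage sketch, not part of the original gist: it assumes a pre-4.0 gensim API (wv.index2word), a trained Keras model in cnn_model, the two padded input matrices x_train/x_train_2, and embedding id 0 reserved for the 'NULL' padding token. All of these names are hypothetical stand-ins.

import gensim

# hypothetical path to a word2vec model trained elsewhere
word2vec = gensim.models.Word2Vec.load('word2vec.model')

# build the module-level map that embeddings_to_tokens expects:
# embedding integer -> token string, with id 0 assumed reserved for 'NULL' padding
embedding_to_token_map = {0: 'NULL'}
for idx, token in enumerate(word2vec.wv.index2word):
    embedding_to_token_map[idx + 1] = token

# rank the training sentences for every filter in the first conv layer
find_highest_activation_ngrams(x_train, x_train_2, cnn_model, word2vec,
                               top_n=10,
                               layer_name='conv1d_1',
                               number_of_filters=128,
                               print_ngrams=True)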