groverpr · March 29, 2020 05:47
diff --git a/generate_tsne b/generate_tsne
 # Tokens whose learned embeddings are projected to 2D and plotted below.
 # NOTE(review): the first group reads as positive-sentiment vocabulary and the
 # second as negative-sentiment vocabulary -- confirm how the list was chosen.
 few_words = [
     "great", "excellent", "best", "perfect", "wonderful",
     "well", "fun", "love", "amazing", "also",
     "enjoyed", "favorite", "it", "and", "loved",
     "highly", "bit", "job", "today", "beautiful",
     "you", "definitely", "superb", "brilliant", "world",
     "liked", "still", "enjoy", "life", "very",
     "especially", "see", "fantastic", "both", "shows",
     "good", "may", "terrific", "heart", "classic",
     "will", "enjoyable", "beautifully", "always", "true",
     "perfectly", "surprised", "think", "outstanding", "most",

     "bad", "worst", "awful", "waste", "boring",
     "poor", "terrible", "no", "nothing", "poorly",
     "dull", "horrible", "script", "stupid", "worse",
     "even", "minutes", "instead", "fails", "unfortunately",
     "just", "annoying", "ridiculous", "plot", "money",
     "supposed", "avoid", "mess", "disappointing", "disappointment",
     "lame", "crap", "predictable", "any", "pointless",
     "weak", "badly", "not", "only", "unless",
     "looks", "why", "wasted", "save", "oh",
     "attempt", "problem", "acting", "lacks", "seems",
 ]

 # Pull the embedding weights out of the network as a NumPy array.
 # NOTE(review): list_data()[0] presumably selects the copy on the first
 # device context -- confirm against the framework in use.
 tok_embed = net1.embed.weight.list_data()[0].asnumpy()

 # The pipeline's 'token2index' step maps each token to its row in the
 # embedding matrix; a word missing from tok2idx raises KeyError here.
 tok_trans = transformer.named_steps['token2index']
 word_rows = [tok_trans.tok2idx[word] for word in few_words]
 tok_embed_sub = tok_embed[word_rows]

 # Project the selected embeddings with t-SNE
 # (perplexity and n_iter are meant to be tuned per use case).
 tsne = TSNE(perplexity=40, n_iter=1000)
 Y_char = tsne.fit_transform(tok_embed_sub)

 # Scatter the projected points and label each one with its word.
 fig, ax = plt.subplots(figsize=(10, 10))
 xs, ys = Y_char[:, 0], Y_char[:, 1]
 ax.scatter(x=xs, y=ys, s=4)
 ax.grid()
 for i, txt in enumerate(few_words):
     ax.annotate(txt, (Y_char[i, 0], Y_char[i, 1]), fontsize=10)
 # Bind the return value to `_` so it is not echoed as cell output.
 _ = ax.set_title('t-SNE of Word Tokens')
	# Tokens whose learned embeddings are projected to 2D and plotted below.
	# NOTE(review): the first group reads as positive-sentiment vocabulary and
	# the second as negative-sentiment vocabulary -- confirm how it was chosen.
	few_words = ['great', 'excellent', 'best', 'perfect', 'wonderful', 'well',
	'fun', 'love', 'amazing', 'also', 'enjoyed', 'favorite', 'it',
	'and', 'loved', 'highly', 'bit', 'job', 'today', 'beautiful',
	'you', 'definitely', 'superb', 'brilliant', 'world', 'liked',
	'still', 'enjoy', 'life', 'very', 'especially', 'see', 'fantastic',
	'both', 'shows', 'good', 'may', 'terrific', 'heart', 'classic',
	'will', 'enjoyable', 'beautifully', 'always', 'true', 'perfectly',
	'surprised', 'think', 'outstanding', 'most',

	'bad', 'worst', 'awful', 'waste', 'boring', 'poor', 'terrible',
	'no', 'nothing', 'poorly', 'dull', 'horrible', 'script', 'stupid',
	'worse', 'even', 'minutes', 'instead', 'fails', 'unfortunately',
	'just', 'annoying', 'ridiculous', 'plot', 'money', 'supposed',
	'avoid', 'mess', 'disappointing', 'disappointment', 'lame', 'crap',
	'predictable', 'any', 'pointless', 'weak', 'badly', 'not', 'only',
	'unless', 'looks', 'why', 'wasted', 'save', 'oh', 'attempt',
	'problem', 'acting', 'lacks', 'seems']

	# Extract the weights of the embedding matrix from the network as a NumPy array.
	# NOTE(review): list_data()[0] presumably selects the copy on the first
	# device context -- confirm against the framework in use.
	tok_embed = net1.embed.weight.list_data()[0].asnumpy()
	# Use the token-to-index map from the transformer pipeline to find the
	# embedding row of each word; a word missing from tok2idx raises KeyError.
	tok_trans = transformer.named_steps['token2index']
	tok_embed_sub = tok_embed[[tok_trans.tok2idx[i] for i in few_words]]

	# t-SNE (tune perplexity and n_iter for your purpose)
	tsne = TSNE(perplexity=40, n_iter=1000,)
	Y_char = tsne.fit_transform(tok_embed_sub)

	# Matplotlib plot of 2D embeddings
	fig, ax = plt.subplots(figsize=(10,10))
	ax.scatter(x=Y_char[:,0], y=Y_char[:,1], s=4)
	ax.grid()
	# Label every point with its word. The loop body must be indented under
	# the `for`; Y_char has one row per entry of few_words, in the same order.
	for i, txt in enumerate(few_words):
		ax.annotate(txt, (Y_char[i,0], Y_char[i,1]), fontsize=10)
	# Set the title once, after all annotations; `_` keeps the return value
	# from being echoed as cell output.
	_ = ax.set_title('t-SNE of Word Tokens')