Skip to content

Instantly share code, notes, and snippets.

@cwidmer
Last active August 29, 2015 13:55
Show Gist options
  • Save cwidmer/8752502 to your computer and use it in GitHub Desktop.
Save cwidmer/8752502 to your computer and use it in GitHub Desktop.
toy example for learning the similarity
#!/usr/bin/env python2.6
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# Written (W) 2013 Christian Widmer
# Copyright (C) 2013 Max-Planck-Society, MSKCC, TU-Berlin
"""
Created on 04.03.2013
@author: Christian Widmer
@summary: Toy experiment for MT-MKL
"""
import numpy as np
import copy
import random
import pylab
import dcd
import scipy.cluster.hierarchy as sch
leaf_vectors = []
def mutate_vector(vec, num_mut):
"""
takes in vector and flips positions
"""
if num_mut > 0:
positions = random.sample(range(len(vec)), num_mut)
for pos in positions:
vec[pos] = vec[pos]*(-1)
return vec
def mutate_edge(vec, depth, degree):
"""
recursive mutation procedure
"""
#mutation_rate = 8
mutation_rate, = random.sample(range(1,15),1)
vec_copy = copy.copy(vec)
vec_copy = mutate_vector(vec_copy, mutation_rate)
if depth == 5:
leaf_vectors.append(vec_copy)
return
else:
# recursion
for d in xrange(degree):
mutate_edge(vec_copy, depth+1, degree)
def compute_sim(vectors):
"""
compute similarity matrix from vectors
"""
num_vec = len(vectors)
sim = np.zeros((num_vec, num_vec))
for i, v_lhs in enumerate(vectors):
for j, v_rhs in enumerate(vectors):
sim[i, j] = np.dot(v_lhs, v_rhs)
return sim
def sample_points(diff_vec, num_points):
"""
sample points from multidimensional isotropic gaussians,
given their difference vector
"""
sigma = 20.0
points_per_class = num_points / 2
num_dim = len(diff_vec)
mean_pos = diff_vec * 0.5
mean_neg = diff_vec * -0.5
xt_pos = np.random.multivariate_normal(mean_pos, sigma*np.eye(num_dim), points_per_class)
xt_neg = np.random.multivariate_normal(mean_neg, sigma*np.eye(num_dim), points_per_class)
xt = np.vstack([xt_pos, xt_neg])
lt = np.array( [1]*points_per_class + [-1]*points_per_class )
ret = {"xt": xt, "lt": lt}
return ret
def generate_data(num_dim, xt_per_task_train, xt_per_task_test):
"""
generate toy data set
"""
# clear list
del leaf_vectors[:]
start_vec = np.ones(num_dim)
mutate_edge(start_vec, 0, 2)
sim = compute_sim(leaf_vectors)
#Y = sch.linkage(sim, method='centroid')
#print Y
#plot_dendrogram(sim, Y)
Y = create_linkage_matrix(32)
plot_dendrogram(sim, Y)
pylab.figure()
Z1 = sch.dendrogram(Y, orientation='right')
pylab.imshow(sim, interpolation="nearest")
pylab.show()
train_data = {}
test_data = {}
for task_num, v in enumerate(leaf_vectors):
train_data[task_num] = sample_points(v, xt_per_task_train)
test_data[task_num] = sample_points(v, xt_per_task_test)
return train_data, test_data, sim
def create_mask_stack_level(num_tasks):
"""
create stack of matrices for
each inner node of binary tree
"""
num_levels = np.log2(num_tasks)
stack = np.zeros((num_levels, num_tasks, num_tasks))
for i in range(num_levels):
block_size = 2 ** i
num_blocks = int(num_tasks / block_size)
for j in range(0, num_tasks, block_size):
stack[i,j:j+block_size, j:j+block_size] = 1.0 / float(block_size)
return stack
def create_mask_stack(num_tasks):
"""
create stack of matrices for
each inner node of binary tree
"""
num_levels = int(np.log2(num_tasks))
stack = []
# don't use leaves
for i in range(1, num_levels):
block_size = 2 ** i
#num_blocks = int(num_tasks / block_size)
for j in range(0, num_tasks, block_size):
mask = np.zeros((num_tasks, num_tasks))
mask[j:j+block_size, j:j+block_size] = 1.0
stack.append(mask)
# add all ones
stack.append(np.ones((num_tasks, num_tasks)))
# make sure we have zeros on main diagonal
for i in xrange(len(stack)):
for j in xrange(num_tasks):
stack[i][j, j] = 0.0
# add eye
stack.append(np.eye(num_tasks, num_tasks))
stack = np.array(stack)
print "stack size", len(stack)
return stack
def create_linkage_matrix(num_tasks):
"""
create stack of matrices for
each inner node of binary tree
"""
num_levels = int(np.log2(num_tasks))
stack = []
cumulative = 0
for i in range(num_levels):
block_size = 2 ** i
num_blocks = int(num_tasks / block_size)
for j in range(0, num_blocks, 2):
row = [cumulative+j, cumulative+j+1, i+1, block_size*2]
print row
stack.append(row)
cumulative += num_blocks
stack = np.array(stack, dtype=np.float64)
return stack
def plot_dendrogram(data, Y):
"""
http://stackoverflow.com/questions/2982929/plotting-results-of-hierarchical-clustering-ontop-of-a-matrix-of-data-in-python
"""
D = copy.copy(data)
D /= D[0,0]
# Compute and plot first dendrogram.
fig = pylab.figure(figsize=(8,8))
ax1 = fig.add_axes([0.09,0.1,0.2,0.6])
ax1.set_axis_off()
Z1 = sch.dendrogram(Y, orientation='right', link_color_func=lambda k: "black")
ax1.set_xticks([])
ax1.set_yticks([])
# Compute and plot second dendrogram.
ax2 = fig.add_axes([0.3,0.71,0.6,0.2])
ax2.set_axis_off()
Z2 = sch.dendrogram(Y, link_color_func=lambda k: "black")
ax2.set_xticks([])
ax2.set_yticks([])
# Plot distance matrix.
axmatrix = fig.add_axes([0.3,0.1,0.6,0.6])
idx1 = Z1['leaves']
idx2 = Z2['leaves']
D = D[idx1,:]
D = D[:,idx2]
im = axmatrix.matshow(D, aspect='auto', origin='lower') #, cmap=pylab.cm.YlGnBu)
axmatrix.set_xticks([])
axmatrix.set_yticks([])
# Plot colorbar.
axcolor = fig.add_axes([0.91,0.1,0.02,0.6])
pylab.colorbar(im, cax=axcolor)
fig.show()
pylab.show()
def hierarchical_mask():
"""
"""
#solver = "dcd"
solver = "dcd_shogun"
epsilon = 1e-3
record_interval = 0
min_interval = 0
cost = 0.00001
# define task similarity matrix
num_tasks = 32
num_dim = 100
num_train = 10
num_test = 1000
train_data, test_data, _ = generate_data(num_dim, num_train, num_test)
task_sim = create_mask_stack(num_tasks)
normalize = True
ridge = 0.01
# MT-MKL
for p_norm in [1.1, 2.0, 3.0]:
trained_solver = dcd.train_mtl_svm(train_data, task_sim, solver, cost, epsilon, True, p_norm, record_interval, min_interval, normalize, ridge)
trained_solver.predict(test_data, text="MTMKL p-norm = %.2f" % p_norm)
# vanilla
trained_solver = dcd.train_mtl_svm(train_data, task_sim, solver, cost, epsilon, False, p_norm, record_interval, min_interval, normalize, ridge)
trained_solver.predict(test_data, text="Vanilla")
# individual
task_sim = np.zeros((1, num_tasks, num_tasks))
task_sim[0] = np.eye(num_tasks)
invididual_solver = dcd.train_mtl_svm(train_data, task_sim, solver, cost, epsilon, False, p_norm, record_interval, min_interval, normalize, ridge)
invididual_solver.predict(test_data, text="Individual")
# union
task_sim = np.zeros((1, num_tasks, num_tasks))
task_sim[0] = np.ones(num_tasks) - np.eye(num_tasks)
invididual_solver = dcd.train_mtl_svm(train_data, task_sim, solver, cost, epsilon, False, p_norm, record_interval, min_interval, normalize, ridge)
invididual_solver.predict(test_data, text="Union")
pylab.grid(True)
pylab.show()
if __name__ == "__main__" or __name__ == "pyreport.main":
hierarchical_mask()
#create_mask_stack(8)
#test_mtl()
#run_experiment()
#generate_data(100, 10, 100)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment