Mocking t-SNE with some data not visible
Built with blockbuilder.org
license: gpl-3.0
<!DOCTYPE html> | |
<head> | |
<meta charset="utf-8"> | |
<script src="https://d3js.org/d3.v3.min.js"></script> | |
<script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/1.8.3/jquery.min.js"></script> | |
<style> | |
body { margin:0;position:fixed;top:0;right:0;bottom:0;left:0; } | |
</style> | |
<script> | |
// create main global object (reuses an existing tsnejs namespace if one
// was already defined, so the script can be included more than once)
var tsnejs = tsnejs || { REVISION: 'ALPHA' };
(function(global) { | |
"use strict"; | |
// utility function: throw when a condition does not hold.
// condition: any value; falsy triggers the throw.
// message: optional description; defaults to "Assertion failed".
var assert = function(condition, message) {
  // Throw a real Error (not a bare string) so stack traces are preserved.
  if (!condition) { throw new Error(message || "Assertion failed"); }
};
// syntax sugar: read opt[field], falling back to defaultval when the
// option object does not define the field as an own property.
var getopt = function(opt, field, defaultval) {
  return opt.hasOwnProperty(field) ? opt[field] : defaultval;
};
// Marsaglia polar method: return 0-mean, unit-standard-deviation random
// numbers. Each round trip produces two independent samples; the second
// is cached and handed out on the next call.
var have_spare = false;
var spare = 0.0;
var gaussRandom = function() {
  if (have_spare) {
    have_spare = false;
    return spare;
  }
  var u, v, r;
  do {
    u = 2 * Math.random() - 1;
    v = 2 * Math.random() - 1;
    r = u * u + v * v;
  } while (r == 0 || r > 1); // reject points outside the unit disc (or at the origin)
  var c = Math.sqrt(-2 * Math.log(r) / r);
  spare = v * c; // cache this for next function call for efficiency
  have_spare = true;
  return u * c;
};
// return a normal random number with mean mu and standard deviation std
var randn = function(mu, std) { return mu + gaussRandom() * std; };
// utility that creates a contiguous vector of n zeros.
// Returns [] when n is undefined or NaN.
var zeros = function(n) {
  if (typeof(n) === 'undefined' || isNaN(n)) { return []; }
  if (typeof ArrayBuffer !== 'undefined') {
    return new Float64Array(n); // typed arrays are faster
  }
  // fallback for environments lacking typed-array support
  var arr = new Array(n);
  for (var i = 0; i < n; i++) { arr[i] = 0; }
  return arr;
};
// utility that returns an n-by-d 2d array filled with small random
// numbers (randn(0, 1e-4)), or with the constant s if it is provided.
var randn2d = function(n, d, s) {
  var fillConstant = typeof s !== 'undefined';
  var x = [];
  for (var i = 0; i < n; i++) {
    var row = [];
    for (var j = 0; j < d; j++) {
      row.push(fillConstant ? s : randn(0.0, 1e-4));
    }
    x.push(row);
  }
  return x;
};
// compute the SQUARED L2 (Euclidean) distance between two equal-length
// vectors (no square root is taken).
var L2 = function(x1, x2) {
  var d = 0;
  for (var i = 0, D = x1.length; i < D; i++) {
    var diff = x1[i] - x2[i];
    d += diff * diff;
  }
  return d;
};
// compute all pairwise squared distances between rows of X, returned as
// a flat, symmetric n*n matrix (the diagonal stays zero).
var xtod = function(X) {
  var n = X.length;
  var out = zeros(n * n); // allocate contiguous result buffer
  for (var a = 0; a < n; a++) {
    for (var b = a + 1; b < n; b++) {
      var dab = L2(X[a], X[b]);
      out[a * n + b] = dab; // mirror into both triangles
      out[b * n + a] = dab;
    }
  }
  return out;
};
// compute the symmetrized affinities (p_{i|j} + p_{j|i})/(2n) from a
// flattened N*N matrix D of pairwise squared distances. For each point a
// binary search over the Gaussian kernel precision (beta) is performed so
// that the entropy of its conditional distribution matches log(perplexity)
// within tol.
var d2p = function(D, perplexity, tol) {
  var Nf = Math.sqrt(D.length); // this better be an integer
  var N = Math.floor(Nf);
  assert(N === Nf, "D should have square number of elements.");
  var Htarget = Math.log(perplexity); // target entropy of distribution
  var P = zeros(N * N); // temporary probability matrix
  var prow = zeros(N); // a temporary storage compartment, reused per row
  for (var i = 0; i < N; i++) {
    var betamin = -Infinity;
    var betamax = Infinity;
    var beta = 1; // initial value of precision
    var done = false;
    var maxtries = 50;
    // perform binary search to find a suitable precision beta
    // so that the entropy of the distribution is appropriate
    var num = 0;
    while (!done) {
      //debugger;
      // compute entropy and kernel row with beta precision
      var psum = 0.0;
      for (var j = 0; j < N; j++) {
        var pj = Math.exp(- D[i * N + j] * beta);
        if (i === j) { pj = 0; } // we dont care about diagonals
        prow[j] = pj;
        psum += pj;
      }
      // normalize p and compute entropy
      var Hhere = 0.0;
      for (var j = 0; j < N; j++) {
        var pj = prow[j] / psum;
        prow[j] = pj;
        // skip vanishing terms so we never take log of (near) zero
        if (pj > 1e-7) Hhere -= pj * Math.log(pj);
      }
      // adjust beta based on result
      if (Hhere > Htarget) {
        // entropy was too high (distribution too diffuse)
        // so we need to increase the precision for more peaky distribution
        betamin = beta; // move up the bounds
        if (betamax === Infinity) { beta = beta * 2; }
        else { beta = (beta + betamax) / 2; }
      } else {
        // converse case. make distribution less peaky
        betamax = beta;
        if (betamin === -Infinity) { beta = beta / 2; }
        else { beta = (beta + betamin) / 2; }
      }
      // stopping conditions: too many tries or got a good precision
      num++;
      if (Math.abs(Hhere - Htarget) < tol) { done = true; }
      if (num >= maxtries) { done = true; }
    }
    // console.log('data point ' + i + ' gets precision ' + beta + ' after ' + num + ' binary search steps.');
    // copy over the final prow to P at row i
    for (var j = 0; j < N; j++) { P[i * N + j] = prow[j]; }
  } // end loop over examples i
  // symmetrize P and normalize it to sum to 1 over all ij
  var Pout = zeros(N * N);
  var N2 = N * 2;
  for (var i = 0; i < N; i++) {
    for (var j = 0; j < N; j++) {
      // clamp away from zero so later log(P) terms stay finite
      Pout[i * N + j] = Math.max((P[i * N + j] + P[j * N + i]) / N2, 1e-100);
    }
  }
  return Pout;
};
// helper function: -1, 0, or +1 depending on the sign of x
// (NOT Math.sign: this returns 0 for NaN, matching the original)
function sign(x) {
  if (x > 0) { return 1; }
  if (x < 0) { return -1; }
  return 0;
}
// tSNE optimizer. Recognized options (all optional):
//   perplexity - effective number of nearest neighbors (default 30)
//   dim        - dimensionality of the embedding (default 2)
//   epsilon    - learning rate (default 10)
var tSNE = function(opt) {
  opt = opt || {};
  this.perplexity = getopt(opt, "perplexity", 30); // effective number of nearest neighbors
  this.dim = getopt(opt, "dim", 2); // by default 2-D tSNE
  this.epsilon = getopt(opt, "epsilon", 10); // learning rate
  this.iter = 0; // iteration counter, advanced by step()
};
// tSNE public API. Methods operate on state set by the constructor
// (perplexity, dim, epsilon, iter) and the init methods (P, N, Y, gains,
// ystep).
tSNE.prototype = {
  // this function takes a set of high-dimensional points
  // and creates matrix P from them using gaussian kernel
  // X: array of N points, each an array of D numbers
  initDataRaw: function(X) {
    var N = X.length;
    var D = X[0].length;
    assert(N > 0, " X is empty? You must have some data!");
    assert(D > 0, " X[0] is empty? Where is the data?");
    var dists = xtod(X); // convert X to pairwise (squared) distances
    this.P = d2p(dists, this.perplexity, 1e-4); // attach to object
    this.N = N; // back up the size of the dataset
    this.initSolution(); // refresh this
  },
  // this function takes a given distance matrix and creates
  // matrix P from them.
  // D is assumed to be provided as a list of lists, and should be symmetric
  initDataDist: function(D) {
    var N = D.length;
    assert(N > 0, " X is empty? You must have some data!");
    // convert D to a (fast) typed array version; only the upper triangle
    // of D is read and then mirrored (symmetry of D is assumed)
    var dists = zeros(N * N); // allocate contiguous array
    for (var i = 0; i < N; i++) {
      for (var j = i + 1; j < N; j++) {
        var d = D[i][j];
        dists[i * N + j] = d;
        dists[j * N + i] = d;
      }
    }
    this.P = d2p(dists, this.perplexity, 1e-4);
    this.N = N;
    this.initSolution(); // refresh this
  },
  // (re)initializes the solution to random and resets the optimizer state
  initSolution: function() {
    // generate random solution to t-SNE
    this.Y = randn2d(this.N, this.dim); // the solution
    this.gains = randn2d(this.N, this.dim, 1.0); // step gains to accelerate progress in unchanging directions
    this.ystep = randn2d(this.N, this.dim, 0.0); // momentum accumulator
    this.iter = 0;
  },
  // return pointer to current solution (not a copy; callers see updates)
  getSolution: function() {
    return this.Y;
  },
  // perform a single step of optimization to improve the embedding;
  // returns the current cost. Uses per-coordinate adaptive gains plus
  // momentum (0.5 for the first 250 iterations, 0.8 afterwards).
  step: function() {
    this.iter += 1;
    var N = this.N;
    var cg = this.costGrad(this.Y); // evaluate gradient
    var cost = cg.cost;
    var grad = cg.grad;
    // perform gradient step
    var ymean = zeros(this.dim);
    for (var i = 0; i < N; i++) {
      for (var d = 0; d < this.dim; d++) {
        var gid = grad[i][d];
        var sid = this.ystep[i][d];
        var gainid = this.gains[i][d];
        // compute gain update: shrink the gain when gradient and momentum
        // agree in sign, grow it when they disagree
        var newgain = sign(gid) === sign(sid) ? gainid * 0.8 : gainid + 0.2;
        if (newgain < 0.01) newgain = 0.01; // clamp
        this.gains[i][d] = newgain; // store for next turn
        // compute momentum step direction
        var momval = this.iter < 250 ? 0.5 : 0.8;
        var newsid = momval * sid - this.epsilon * newgain * grad[i][d];
        this.ystep[i][d] = newsid; // remember the step we took
        // step!
        this.Y[i][d] += newsid;
        ymean[d] += this.Y[i][d]; // accumulate mean so that we can center later
      }
    }
    // reproject Y to be zero mean
    for (var i = 0; i < N; i++) {
      for (var d = 0; d < this.dim; d++) {
        this.Y[i][d] -= ymean[d] / N;
      }
    }
    //if(this.iter%100===0) console.log('iter ' + this.iter + ', cost: ' + cost);
    return cost; // return current cost
  },
  // for debugging: numerically verify the analytic gradient via central
  // differences, logging both values for every coordinate
  debugGrad: function() {
    var N = this.N;
    var cg = this.costGrad(this.Y); // evaluate gradient
    var cost = cg.cost;
    var grad = cg.grad;
    var e = 1e-5;
    for (var i = 0; i < N; i++) {
      for (var d = 0; d < this.dim; d++) {
        var yold = this.Y[i][d];
        this.Y[i][d] = yold + e;
        var cg0 = this.costGrad(this.Y);
        this.Y[i][d] = yold - e;
        var cg1 = this.costGrad(this.Y);
        var analytic = grad[i][d];
        var numerical = (cg0.cost - cg1.cost) / (2 * e);
        console.log(i + ',' + d + ': gradcheck analytic: ' + analytic + ' vs. numerical: ' + numerical);
        this.Y[i][d] = yold; // restore the perturbed coordinate
      }
    }
  },
  // return cost and gradient, given an arrangement Y (N rows of dim numbers)
  costGrad: function(Y) {
    var N = this.N;
    var dim = this.dim; // dim of output space
    var P = this.P;
    // multiply P by 4 during the first 100 iterations; trick that helps
    // with local optima (the paper's "early exaggeration")
    var pmul = this.iter < 100 ? 4 : 1;
    // compute current Q distribution, unnormalized first
    var Qu = zeros(N * N);
    var qsum = 0.0;
    for (var i = 0; i < N; i++) {
      for (var j = i + 1; j < N; j++) {
        var dsum = 0.0;
        for (var d = 0; d < dim; d++) {
          var dhere = Y[i][d] - Y[j][d];
          dsum += dhere * dhere;
        }
        var qu = 1.0 / (1.0 + dsum); // Student t-distribution
        Qu[i * N + j] = qu;
        Qu[j * N + i] = qu;
        qsum += 2 * qu; // each off-diagonal pair contributes twice to the sum
      }
    }
    // normalize Q distribution to sum to 1, clamped away from zero so the
    // log below stays finite
    var NN = N * N;
    var Q = zeros(NN);
    for (var q = 0; q < NN; q++) { Q[q] = Math.max(Qu[q] / qsum, 1e-100); }
    var cost = 0.0;
    var grad = [];
    for (var i = 0; i < N; i++) {
      var gsum = new Array(dim); // init grad for point i
      for (var d = 0; d < dim; d++) { gsum[d] = 0.0; }
      for (var j = 0; j < N; j++) {
        cost += - P[i * N + j] * Math.log(Q[i * N + j]); // accumulate cost (the non-constant portion at least...)
        var premult = 4 * (pmul * P[i * N + j] - Q[i * N + j]) * Qu[i * N + j];
        for (var d = 0; d < dim; d++) {
          gsum[d] += premult * (Y[i][d] - Y[j][d]);
        }
      }
      grad.push(gsum);
    }
    return { cost: cost, grad: grad };
  }
}
global.tSNE = tSNE; // export tSNE class | |
})(tsnejs); | |
// export the library to window, or to module in nodejs
(function(lib) {
  "use strict";
  var hasModule = typeof module !== "undefined" && typeof module.exports !== "undefined";
  if (hasModule) {
    module.exports = lib; // in nodejs
  } else {
    window.tsnejs = lib; // in ordinary browser attach library to window
  }
})(tsnejs);
</script> | |
</head> | |
<body> | |
<div id="embed"></div> | |
<script> | |
// t-SNE optimizer configuration: learning rate and perplexity
var opt = {epsilon: 10, perplexity: 30};
var T = new tsnejs.tSNE(opt); // create a tSNE instance
var Y; // NOTE(review): unused at top level; updateEmbedding declares its own local Y
var data; // dataset loaded from out2.json; code reads .users and .mat, and adds .privacy
// reposition every user node from the current t-SNE solution, applying
// the current pan (tx, ty) and zoom scale (ss) plus a fixed centering
// offset of (400, 300).
function updateEmbedding() {
  var Y = T.getSolution();
  svg.selectAll('.u')
    .data(data.users)
    .attr("transform", function(d, i) {
      var x = (Y[i][0] * 20 * ss + tx) + 400;
      var y = (Y[i][1] * 20 * ss + ty) + 300;
      return "translate(" + x + "," + y + ")";
    });
}
var svg; // global svg root; assigned in drawEmbedding, read in updateEmbedding
// build the initial SVG scene: one <g class="u"> per user, containing
// either an avatar image or (for privacy-flagged users) a small hollow
// circle, and wire up d3 zoom/pan behavior.
function drawEmbedding() {
  var div = d3.select("#embed");
  // get min and max in each column of Y
  var Y = T.Y;
  svg = div.append("svg") // svg is global
    .attr("width", 800)
    .attr("height", 600);
  var g = svg.selectAll(".b")
    .data(data.users)
    .enter().append("g")
    .attr("class", "u");
  g.filter(function(d, i) { return !data.privacy[i]; }).append("svg:image")
    .attr('x', 0)
    .attr('y', 2)
    .attr('width', 24)
    .attr('height', 24)
    .attr("xlink:href", function(d) { return "http://cs.stanford.edu/people/karpathy/tsnejs/scrape/imgs/" + d.substring(1); })
  g.filter(function(d, i) { return data.privacy[i]; }).append("circle")
    // FIX: SVG circles are positioned via cx/cy, not x/y; the original
    // 'x'/'y' attributes were silently ignored (harmless only because
    // both values were 0, matching the cx/cy defaults).
    .attr('cx', 0)
    .attr('cy', 0)
    .attr('r', 6)
    .style('fill', 'none')
    .style('stroke', '#ccc')
    .style('stroke-width', '1')
  var zoomListener = d3.behavior.zoom()
    .scaleExtent([0.1, 10])
    .center([0, 0])
    .on("zoom", zoomHandler);
  zoomListener(svg);
}
// current pan offset in pixels (written by zoomHandler, read by updateEmbedding)
var tx=0, ty=0;
// current zoom scale (written by zoomHandler, read by updateEmbedding)
var ss=1;
// record the current d3 zoom gesture state into the pan/scale globals
// consumed by updateEmbedding
function zoomHandler() {
  var translate = d3.event.translate;
  tx = translate[0];
  ty = translate[1];
  ss = d3.event.scale;
}
// advance the optimization and redraw the embedding; called on a timer
function step() {
  var stepsPerTick = 1; // number of optimizer iterations per redraw
  for (var k = 0; k < stepsPerTick; k++) {
    T.step();
  }
  updateEmbedding();
}
// on page load, fetch the precomputed distance matrix and start the
// t-SNE animation loop
$(window).load(function() {
  var onData = function(json) {
    data = json;
    console.log(data);
    // randomly flag most users (Math.random() > .2) as "private"
    data.privacy = data.users.map(function(d, i) { return Math.random() > .2; });
    T.initDataDist(data.mat); // init embedding
    drawEmbedding(); // draw initial embedding
    //T.debugGrad();
    setInterval(step, 0); // iterate as fast as the browser allows
    //step();
  };
  $.getJSON("out2.json", onData);
});
</script> | |
</body> |