Skip to content

Instantly share code, notes, and snippets.

@Gooseus
Last active August 29, 2015 14:21
Show Gist options
  • Save Gooseus/f499ae90889694965c89 to your computer and use it in GitHub Desktop.
Save Gooseus/f499ae90889694965c89 to your computer and use it in GitHub Desktop.
Finding Frequent Pairs in Lists of Items
"use strict";
/*****
Node Pair Stream
by Shawn Marincas
Consumes a CSV as a stream of item sets and outputs lines of comma-separated pairs of items that appear on the specified number of lines
- input : Stream input source, either a filename or defaults to STDIN for pipes
- threshold : Number of lines a pairs must appear on before being emitted
******/
var input = input = process.argv[2] ? require("fs").createReadStream(process.argv[2]) : process.stdin,
threshold = process.argv[3] ? parseInt(process.argv[3],10) : 50,
// our line emitter, sweet built-in node module I didn't know about until today
reader = require("readline").createInterface({
input: input,
terminal: false
}),
// holds ours pair counts
counters = {};
// our main algorithm
function accumulate_pairs(set) {
var size = set.length,
i = 0,
j = 0;
if(size>1) {
for(; i<size; i++) {
for(; j<size; j++) {
// skip if the artists are the same
if(set[i]==set[j]) {
continue;
}
// do a quick alphabetical check to avoid counting reversed pairs
var pair = set[i] > set[j] ? set[i]+','+set[j] : set[j]+','+set[i];
// either create our pair, or increment it;
counters[pair] = counters[pair] ? counters[pair]+1 : 1;
// if we've hit our threshold, we can emit it, it's ok
if(counters[pair] == threshold) {
// write the pair to stdout
process.stdout.write(pair+'\n');
}
}
}
}
}
// called for each line emitted by text stream
function consume_line(line) {
var current = line.toString().split(",");
accumulate_pairs(current);
}
// listen to consume emitted lines and exit process with closed stream
reader.on("line", consume_line);
reader.on("close", process.exit);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment