Skip to content

Instantly share code, notes, and snippets.

@IceCreamYou
Created March 20, 2013 09:35
Show Gist options
  • Save IceCreamYou/5203447 to your computer and use it in GitHub Desktop.
Save IceCreamYou/5203447 to your computer and use it in GitHub Desktop.
This script scrapes the Inc 5000 data and turns it into a tab-separated values (TSV) string which can be easily imported into Excel or a database. To run it, go to http://www.inc.com/inc5000/list/2012 and execute the code in your browser's JavaScript console. Be nice! This hits the Inc site 50 times in quick succession. I don't know who technical…
/**
 * Get an array containing the text nodes within a DOM node.
 *
 * Modified from http://stackoverflow.com/a/4399718/843621
 *
 * If `node` is itself a text node, it is the only candidate for the result.
 * Otherwise the children of `node` are always inspected; their descendants
 * are inspected only when `recurse` is truthy.
 *
 * @param node Any DOM node.
 * @param [includeWhitespaceNodes=false] Whether to include whitespace-only nodes.
 * @param [recurse=false] Whether to get all nodes (true) or only the immediate child nodes (false).
 * @return An array containing TextNodes, in document order.
 */
function getTextNodesIn(node, includeWhitespaceNodes, recurse) {
  var textNodes = [], whitespace = /^\s*$/;
  // Only elements below the root are subject to the `recurse` setting.
  var descendIntoChildren = !!recurse;

  function collect(current, descend) {
    // nodeType 3 is a text node
    if (current.nodeType === 3) {
      if (includeWhitespaceNodes || !whitespace.test(current.nodeValue)) {
        textNodes.push(current);
      }
      return;
    }
    if (!descend) {
      return;
    }
    for (var i = 0, len = current.childNodes.length; i < len; ++i) {
      collect(current.childNodes[i], descendIntoChildren);
    }
  }

  // The root's own children are always visited, whatever `recurse` says.
  collect(node, true);
  return textNodes;
}
/**
 * Get the data on the current page in TSV format.
 *
 * Despite the name, the output is tab-separated: one line per table row, with
 * the fields joined by tabs and each line terminated by "\n". The second
 * column of the table expands into two fields (company name and URL).
 *
 * Relies on jQuery ($) and on getTextNodesIn().
 *
 * @return A string containing one TSV line per row of #fulltable (the first
 *   row is skipped).
 */
function getPageCSV() {
  var lines = [];

  // Collapse runs of whitespace: a tab or newline inside a value would
  // otherwise split the field or the row when the TSV is imported.
  function clean(value) {
    return String(value == null ? '' : value).replace(/\s+/g, ' ').trim();
  }

  // Walk through each row
  $('#fulltable tbody tr:not(:first)').each(function() {
    var fields = [];
    // Walk through each column
    $(this).find('td').each(function(i) {
      if (i === 1) {
        // In the second column, get both the company name and URL
        var link = $(this).find('a')[0];
        if (link) {
          fields.push(clean($(link).text()), clean(link.href));
        }
        else {
          // No link: keep the column count stable with an empty URL field
          fields.push(clean($(this).text()), '');
        }
      }
      else if (i === 4) {
        // In the fifth column, get just the category, not the fancy colored block thing
        var textNodes = getTextNodesIn(this);
        fields.push(clean(textNodes.length ? textNodes[0].nodeValue : ''));
      }
      else {
        // Usually just grab the value. Use text rather than HTML so that
        // entities such as &amp; come out as plain characters.
        fields.push(clean($(this).text()));
      }
    });
    // Next line
    lines.push(fields.join("\t") + "\n");
  });
  return lines.join('');
}
// Accumulator for the whole TSV: it starts out as the header row below, and by
// the end of this script it holds every scraped line as well.
var s = [
  "Rank",
  "Company Name",
  "URL",
  "3-year % growth",
  "Revenue (millions)",
  "Industry",
  "# of Employees",
  "City",
  "State"
].join("\t") + "\n";
/**
 * Scrapes all the data.
 *
 * This recursively reads the data from the current page, then loads in the next one.
 * After this function runs, the s variable will hold the entire TSV.
 *
 * If loading the next page fails, scraping stops there: the failure is
 * reported with console.error and whatever was collected so far is printed.
 *
 * @param [max] The maximum number of pages to retrieve. If not specified, gets them all.
 * @param [curr] Used internally to keep track of what page we're on.
 */
function getNextPage(max, curr) {
  // On the first run, assume we're on the first page
  if (typeof curr === 'undefined') curr = 1;
  // Get the data
  s += getPageCSV();
  var $next = $('.next');
  // If there is a next page and we haven't read too many pages, load the next one
  if ($next.length && (typeof max === 'undefined' || curr < max)) {
    // Copy the next page over the current one. There are a couple advantages of load():
    // most importantly, we get just the DOM we care about without running other scripts on the page.
    $('#maincolumn_inner').load($next[0].href + ' #maincolumn_inner', function(response, status, xhr) {
      // The callback also fires when the request fails. In that case the old
      // page is still in place, so recursing would scrape it again and retry forever.
      if (status !== 'success' && status !== 'notmodified') {
        console.error('Failed to load page ' + (curr + 1) + ' (' + status + (xhr ? ', HTTP ' + xhr.status : '') + '); stopping.');
        console.log(s);
        return;
      }
      // Recurse!
      getNextPage(max, curr + 1);
    });
  }
  // When we're done, go back to the first page so we can easily run again if we want.
  else {
    $('#maincolumn_inner').load('http://www.inc.com/inc5000/list/2012 #maincolumn_inner', function() {
      // Print out the result in the console so we can copy it into a document
      console.log(s);
    });
  }
}
// Run the script: start scraping from the page currently shown, with no page
// limit. The accumulated TSV is printed to the console when scraping finishes.
getNextPage();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment