@nicholasglazer
Created May 3, 2016 09:02
var cheerio = require('cheerio');
var Crawler = require('simplecrawler');

var initialTopic = 'SpaceX';
// Wikipedia namespace and special-page prefixes that are useless to crawl
var blacklist = ["#", "/w/", "/static/", "/api/", "/beacon/", "File:",
    "Wikipedia:", "Template:", "MediaWiki:", "Help:", "Special:",
    "Category:", "Portal:", "Main_Page", "Talk:", "User:",
    "User_talk:", "Template_talk:", "Module:"];
var blacklistPattern = new RegExp(blacklist.join("|")); // compile the filter once instead of per link
var articles = []; // module scope, so every event handler below sees the same array

var url = '/wiki/' + initialTopic;
var crawler = new Crawler('en.wikipedia.org', url, 443); // HTTPS runs on port 443, not 80
crawler.initialProtocol = 'https';
crawler.maxDepth = 2; // fetch only the main article and the pages it links to
crawler.scanSubdomains = false;
crawler.stripWWWDomain = true;
crawler.stripQuerystring = true;
crawler.addFetchCondition(function(parsedURL, queueItem) {
  return parsedURL.depth <= crawler.maxDepth; // never queue anything deeper than maxDepth
});
// Override link discovery so only links that pass the blacklist get queued.
crawler.discoverResources = function(buffer, queueItem) {
  var $ = cheerio.load(buffer.toString("utf8"));
  // Cut the page off at the reference section so its links are never discovered
  $('.div-col.columns.column-count.column-count-2').prev().nextAll().remove();
  var resources = $('a[href]').map(function() {
    var link = $(this).attr('href');
    if (!blacklistPattern.test(link)) { // skip Wikipedia's special/meta pages
      return link;
    }
  }).get();
  return crawler.cleanExpandResources(resources, queueItem);
};
crawler.on("fetchcomplete", function(queueItem, responseBuffer, response) {
var fetchedAddress = queueItem.url;
var $ = cheerio.load(responseBuffer.toString("utf8"));
$('.div-col.columns.column-count.column-count-2').nextAll().remove(); //removes reference section from article for ease of reading
var cleanedData = $('#mw-content-text').text() //assigning body of article to variable for testing/logging
console.log(fetchedAddress); //logs the link of article that it is currently crawling
articles.push(cleanedData);
});
crawler.on("crawlstart", function() {
console.log("begin!");
var articles = [];
});
crawler.on("complete", function() { //this event does not fire and the console hangs
console.log("end!");
console.log(articles.length);
});
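
// A minimal sketch, not part of the original gist: persist the collected
// text as JSON once the crawl finishes, assuming plain JSON output is
// acceptable. 'articles.json' is an arbitrary example filename.
var fs = require('fs');
crawler.on("complete", function() {
  fs.writeFileSync('articles.json', JSON.stringify(articles), 'utf8');
});
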
crawler.start(); // kick off the crawl
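
To try it, install the two dependencies and run the script with Node (the filename crawl.js is just an example):

npm install cheerio simplecrawler
node crawl.js

Each article URL is logged as it is fetched, and the total article count prints once the queue drains.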