Created
June 2, 2012 17:23
-
-
Save emirkin/2859254 to your computer and use it in GitHub Desktop.
Scraping restaurants in Javascript
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
 * This example shows how to collect restaurant information and menus on the fly.
 */
// Instantiate the Bobik client from the Bobik SDK available at http://usebobik.com/sdk.
// You're welcome to link directly to the JS file. However, we make no guarantees about keeping the link unchanged.
// Thus, you should do it only when you have quick and immediate access to wherever this url is used (e.g. during development).
// Shared scraping client used by every function below. `Bobik` is expected to
// be provided by the separately loaded Bobik SDK; "YOUR_AUTH_TOKEN" looks like
// a placeholder to be replaced with a real token (confirm against the SDK docs).
var bobik = new Bobik("YOUR_AUTH_TOKEN");
// Finds restaurant directory information (name, website, address, menu_url).
// Upon success, triggers find_menus_async() or find_menus_sync().
/**
 * Scrapes the menupages listing for a neighborhood/cuisine pair, groups the
 * result into restaurant records and starts the menu lookup for them.
 *
 * @param {string} neighborhood - Neighborhood path segment of the listing url, e.g. 'soma'.
 * @param {string} cuisine - Cuisine path segment of the listing url, e.g. 'italian'.
 * @param {boolean} [print_as_they_become_available=true] - When omitted or truthy,
 *   menus are fetched with find_menus_async() (the previous hard-coded behavior);
 *   when falsy, with find_menus_sync().
 */
function find_restaurants(neighborhood, cuisine, print_as_they_become_available) {
  // ES5-style default so existing two-argument callers keep the async behavior.
  var print_incrementally = print_as_they_become_available === undefined
    ? true
    : print_as_they_become_available;

  console.log("Looking for " + cuisine + " restaurants in " + neighborhood + "...");
  var src_url = "http://sanfrancisco.menupages.com/restaurants/all-areas/" + neighborhood + "/" + cuisine;

  bobik.scrape({
    urls: [src_url],
    query_set: "menupages"
  }, function (scraped_data) {
    if (!scraped_data) {
      console.log("Data is unavailable");
      return;
    }

    // Results are bucketed by url. The bucket is a hash of parallel arrays
    // ('Name', 'Address', 'Url'), so it has no `length` of its own: emptiness
    // must be judged by the 'Name' column that drives the grouping.
    var listing = scraped_data[src_url];
    var names = listing && listing['Name'];
    if (!names || names.length === 0) {
      console.log("Did not find any restaurants");
      return;
    }

    var restaurants = group_restaurants(listing);
    console.log("Found " + restaurants.length + " restaurants");

    if (print_incrementally) {
      find_menus_async(restaurants);
    } else {
      find_menus_sync(restaurants);
    }
  });
}
// A helper function that takes a hash of restaurant names, addresses and urls,
// and turns it into an array of grouped restaurant attributes.
// Also, each restaurant is augmented with the menu url.
/**
 * Converts a hash of parallel arrays into an array of restaurant records.
 *
 * @param {Object} restaurants - Hash with 'Name', 'Address' and 'Url' arrays,
 *   where index i of each array describes the same restaurant.
 * @returns {Array<Object>} One record per entry of 'Name', each with the keys
 *   'name', 'address', 'website' and 'menu_url'. Returns an empty array when
 *   the hash or its 'Name' column is missing.
 */
function group_restaurants(restaurants) {
  var columns = restaurants || {};
  var names = columns['Name'] || [];
  var addresses = columns['Address'] || [];
  var urls = columns['Url'] || [];

  // Collected in a separate array instead of re-declaring the parameter.
  var grouped = [];
  for (var i = 0; i < names.length; i++) {
    // Leave the urls undefined when the scraped path is missing rather than
    // building a bogus "...menupages.comundefined" address.
    var has_url = urls[i] !== undefined && urls[i] !== null;
    var website = has_url ? "http://sanfrancisco.menupages.com" + urls[i] : undefined;

    grouped.push({
      'name': names[i],
      'address': addresses[i],
      'website': website,
      'menu_url': has_url ? website + "menu" : undefined
    });
  }
  return grouped;
}
// Finds menus for all restaurants and adds those menus to the corresponding restaurant hashes.
// Upon completion, prints full restaurant information.
// This variant processes restaurants in parallel and prints them out as the information becomes available.
/**
 * Requests the menu of every restaurant with one scrape call per restaurant,
 * without waiting for earlier calls to finish, and prints each restaurant as
 * soon as its menu arrives.
 *
 * @param {Array<Object>} restaurants - Records carrying a 'menu_url' key; each
 *   record gets a 'menu' key assigned once its response arrives.
 */
function find_menus_async(restaurants) {
  console.log("Looking for menus...");

  // forEach gives every iteration its own `restaurant` and `menu_url`
  // bindings. With function-scoped `var`s in a plain loop, all callbacks would
  // share the values of the last iteration and attach every menu to the last
  // restaurant.
  (restaurants || []).forEach(function (restaurant) {
    var menu_url = restaurant['menu_url'];

    bobik.scrape({
      urls: [menu_url], // send only one at a time (and don't wait for it to complete before sending the next)
      query_set: "menu"
    }, function (scraped_data) {
      if (!scraped_data) {
        console.log("Menu is unavailable for " + restaurant['name']);
        return;
      }
      restaurant['menu'] = scraped_data[menu_url];
      // Pass the record as its own argument: string concatenation would
      // print "[object Object]".
      console.log("Found restaurant:", restaurant);
    });
  });
}
// This variant of find_menus displays results only when all are ready.
/**
 * Requests the menus of all restaurants in a single scrape call and prints the
 * whole list once the combined result is delivered.
 *
 * @param {Array<Object>} restaurants - Records carrying a 'menu_url' key; each
 *   record whose url appears in the result gets a 'menu' key assigned.
 */
function find_menus_sync(restaurants) {
  console.log("Looking for menus...");

  // Assemble a list of menu urls and a {url -> restaurant} map.
  // We need this map to match results (since they will be bucketed by url).
  var menu_urls = [];
  var url_to_restaurant = {};
  (restaurants || []).forEach(function (restaurant) {
    var menu_url = restaurant['menu_url'];
    menu_urls.push(menu_url);
    url_to_restaurant[menu_url] = restaurant;
  });

  bobik.scrape({
    urls: menu_urls,
    query_set: "menu"
  }, function (scraped_data) {
    Object.keys(scraped_data || {}).forEach(function (url) {
      // Skip buckets for urls we did not request instead of throwing on an
      // undefined lookup; the own-property check also ignores keys inherited
      // from Object.prototype.
      if (Object.prototype.hasOwnProperty.call(url_to_restaurant, url)) {
        url_to_restaurant[url]['menu'] = scraped_data[url];
      }
    });
    console.log(restaurants);
  });
}
// Go! Uncomment the line below to run the example.
// find_restaurants('soma', 'italian');
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment