Created
July 3, 2012 18:17
-
-
Save emirkin/3041523 to your computer and use it in GitHub Desktop.
Code to Scrape Drugstores
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package com.example; | |
// Download Bobik SDK from http://usebobik.com/sdk | |
import android.util.Log; | |
import bobik.BobikClient; | |
import bobik.BobikHelper; | |
import bobik.Job; | |
import bobik.JobListener; | |
import org.json.JSONException; | |
import org.json.JSONObject; | |
import java.io.UnsupportedEncodingException; | |
import java.util.*; | |
import java.util.regex.Matcher; | |
import java.util.regex.Pattern; | |
/** | |
* Implements search capability for a comparison shopping app | |
*/ | |
public class ComparisonShoppingApp { | |
BobikClient bobik = new BobikClient(YOUR_AUTHENTICATION_TOKEN); | |
public void printDeals(String drug) throws Exception { | |
for (JSONObject option : findBestPrice(drug)) | |
Log.d("Super Shopper", option.toString()); | |
} | |
/** | |
* Searches the web for various buying options for a given drug | |
* and returns results sorted in accordance to the lowest price | |
* @param drug | |
* @return purchasing options, sorted by price | |
*/ | |
public List<JSONObject> findBestPrice(String drug) throws Exception { | |
List<JSONObject> allOptions = findAllOptions(drug); | |
Collections.sort(allOptions, new Comparator<JSONObject>() { | |
@Override | |
public int compare(JSONObject jsonObject1, JSONObject jsonObject2) { | |
try { | |
double price1 = jsonObject1.getDouble("Price"); | |
double price2 = jsonObject2.getDouble("Price"); | |
return (price1 == price2)? 0 : (price1>price2? +1 : -1); | |
} catch (JSONException e) { | |
throw new RuntimeException(e); | |
} | |
} | |
}); | |
return allOptions; | |
} | |
private String[] getSearchUrls(String keyword) { | |
try { | |
String encodedKeyword = java.net.URLEncoder.encode(keyword, "UTF-8"); | |
return new String[]{ | |
"http://www.cvs.com/search/_/N-3mZ2k?pt=product&searchTerm=" + encodedKeyword, | |
"http://www.myotcstore.com/store/Search.aspx?SearchTerms=" + encodedKeyword, | |
"http://www.familymeds.com/search/search-results.aspx?SearchTerm=" + encodedKeyword, | |
"http://www.canadadrugs.com/search.php?keyword=" + encodedKeyword, | |
"http://thebestonlinepharmacy.net/product.php?prod=" + encodedKeyword, | |
"http://www.walgreens.com/search/results.jsp?Ntt=" + encodedKeyword, | |
"http://www.drugstore.com/search/search_results.asp?N=0&Ntx=mode%2Bmatchallpartial&Ntk=All&Ntt=" + encodedKeyword | |
}; | |
} catch (UnsupportedEncodingException e) { | |
e.printStackTrace(); | |
throw new RuntimeException(e); | |
} | |
} | |
/** | |
* Searches on the web for various buying options for a given drug | |
* | |
* @param drug | |
* @return An array of hashes containing some or all of the following elements: | |
* Title - product title | |
* Image - product image | |
* Price - generally a X.XX number, although there can be something as ugly as "$6.99\r\n2/$11.00 or 1/$5.99\r\n \r\nSavings: $1.00 (14%) on 1" | |
* Details - size, weight, and any additional information that could not be categorized easily | |
*/ | |
public List<JSONObject> findAllOptions(String drug) throws Exception { | |
// First, find options in the raw form, then clean them up (transpose, normalize) and return | |
JSONObject request = new JSONObject(); | |
for (String url : getSearchUrls(drug)) | |
request.accumulate("urls", url); | |
for (String query_set : new String[]{"cvs", "MyOTCStore", "drugstore.com", "FamilyMeds", "walgreens", "CanadaDrugs", "thebestonlinepharmacy"}) | |
request.accumulate("query_sets", query_set); | |
request.put("ignore_robots_txt", true); | |
final List<JSONObject> results = new ArrayList<JSONObject>(); | |
Job job = bobik.scrape(request, new JobListenerImpl() { | |
@Override | |
public void onSuccess(JSONObject jsonObject) { | |
// Aggregate results across all search urls | |
Iterator search_urls = jsonObject.keys(); | |
while (search_urls.hasNext()) { | |
String search_url = (String)search_urls.next(); | |
String url_base = getUrlBase(search_url); | |
try { | |
JSONObject results_parallel_arrays_of_attributes = jsonObject.getJSONObject(search_url); | |
if (results_parallel_arrays_of_attributes.getJSONArray("Price").length() == 0) | |
continue; // no priced results from this source | |
List<JSONObject> results_from_this_url = BobikHelper.transpose(results_parallel_arrays_of_attributes); | |
// Perform some remaining cleanup | |
for (JSONObject r : results_from_this_url) { | |
// 1. Make urls absolute | |
for (String link_key : new String[]{"Image", "Link"}) { | |
try { | |
r.put(link_key, url_base + r.get(link_key)); | |
} catch (JSONException e) { | |
// continue to the next result if Image or Link is missing | |
} | |
} | |
// 2. Extract price | |
r.put("Price", cleanPrice(r.getString("Price"))); | |
} | |
results.addAll(results_from_this_url); | |
} catch (JSONException e) { | |
e.printStackTrace(); | |
// continue to the next store if this search url is broken | |
} | |
} | |
} | |
}); | |
// Feel free to remove this call if you'd rather show results as they become available | |
job.waitForCompletion(); | |
return results; | |
} | |
/** | |
* Cleans up a price fragment. | |
* Example: | |
* $6.99\r\n2/$11.00 or 1/$5.99\r\n \r\nSavings: $1.00 (14%) on 1 | |
* will become | |
* 6.99 | |
* @param priceBlurb | |
* @throws IllegalArgumentException if price cannot be determined. If such an exception is thrown, | |
* it's best to either to keep the original text fragment or throw away the result | |
* @return a single simplest price number | |
*/ | |
private static final Pattern price_pattern = Pattern.compile("\\$[0-9]+\\.?[0-9]?[0-9]?"); | |
private double cleanPrice(String priceBlurb) throws IllegalArgumentException { | |
try { | |
Matcher matcher = price_pattern.matcher(priceBlurb); | |
if (matcher.find()) { | |
priceBlurb = matcher.group(0); | |
if (priceBlurb.startsWith("$")) | |
priceBlurb = priceBlurb.substring(1); | |
} | |
return Double.parseDouble(priceBlurb); | |
} catch (Exception e) { | |
e.printStackTrace(); | |
throw new IllegalArgumentException("No price found"); | |
} | |
} | |
private String getUrlBase(String url) { | |
int slashslash = url.indexOf("//") + 2; | |
return url.substring(0, url.indexOf('/', slashslash)); | |
} | |
// Since we don't expect any errors and don't care about progress (in this example), | |
// stub 2 functions with simple loggers | |
private abstract static class JobListenerImpl extends JobListener { | |
@Override | |
public void onProgress(float currentProgress) { | |
Log.d(log_tag, "Current progress for job " + job.id() + " is " + currentProgress * 100 + "%"); | |
} | |
@Override | |
public void onErrors(Collection<String> errors){ | |
for (String s : errors) | |
Log.e(log_tag, "Error for job " + job.id() + ": " + s); | |
} | |
private final String log_tag = "Super Shopper"; | |
}; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment