Last active
December 14, 2015 02:19
-
-
Save cosmocatalano/5013048 to your computer and use it in GitHub Desktop.
Quick-and-dirty web-to-JSON scrape to get info on your most recent beer. Should be readily adaptable.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
from bs4 import BeautifulSoup #you will probably have to install this: http://www.crummy.com/software/BeautifulSoup/ | |
import urllib2 | |
import sys | |
import cgitb | |
import string | |
import json | |
# This takes a URL and turns it into a BeautifulSoup object
def make_soup(url):
    """Fetch *url* and return its body parsed as a BeautifulSoup object.

    The page is downloaded with ``urllib2.urlopen``. If the download fails,
    the message "couldn't connect to source" is printed and the script
    terminates via ``sys.exit()``; the function does not return in that case.

    Only the download is guarded: an error raised while parsing propagates
    to the caller instead of being reported as a connection problem.
    """
    try:
        source = urllib2.urlopen(url).read()
    except Exception:
        # Exception (rather than a bare except) keeps every network-related
        # failure handled while letting KeyboardInterrupt and SystemExit
        # pass through untouched.
        print('couldn\'t connect to source')
        sys.exit()
    return BeautifulSoup(source)
# CGI preamble: plain-text error reports, then the response header.
cgitb.enable(format='txt')  # error reporting on, in text
# The string ends in '\n' and print adds one more, which yields the blank
# line required after the header.
print('Content-Type: text/plain\n')

# Starting the connection.
# Installing the opener makes urllib2.urlopen() -- which make_soup() calls --
# send the User-agent header below. Simply calling opener.open() and
# discarding the response would fetch the page an extra time without the
# header ever reaching the requests whose content is parsed.
opener = urllib2.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
urllib2.install_opener(opener)

# Time-saving variables
username = 'cosmocatalano'  # you'll probably want to change this
u_url = 'http://untappd.com'
u_user = 'http://untappd.com/user/' + username

# Grabbing the data.
# .find rather than .find_all: only the first <div class="details">
# (presumably the latest check-in) is wanted.
search_soup = make_soup(u_user)
result = search_soup.find('div', 'details')

# All the links in that <div class="details">; a very close match for API data.
API_bits = result.find_all('a')

# Timestamp
timestamp = result.find('li', class_='timezoner')

# Getting the main page of the last beer checked in.
# NOTE(review): assumes the fifth link in the div is the check-in link --
# confirm against the current page markup.
beer_page = make_soup(u_url + API_bits[4].get('href'))

# Getting the latest image, if one exists; otherwise fall back to the icon.
try:
    beer_image = beer_page.find('div', class_='photo')
    image_url = beer_image.a.img['src']
except (AttributeError, TypeError, KeyError):
    # find() returned None, a nested tag was missing, or the <img> had no
    # src attribute: there is no usable photo.
    beer_image = beer_page.find('span', class_='icon')
    image_url = beer_image.a.img['src']

# Getting the rating of the beer from the CSS classes on the rating <span>.
rating_span = beer_page.find('span', class_='rating')
rating_classes = rating_span['class']

# A dictionary to map class to score
score_value = {
    'r05': 0.5,
    'r10': 1,
    'r15': 1.5,
    'r20': 2,
    'r25': 2.5,
    'r30': 3,
    'r35': 3.5,
    'r40': 4,
    'r45': 4.5,
    'r50': 5,
}

# Take the first class that names a known rating instead of relying on it
# being the third class listed; None (JSON null) when no class matches.
my_score = next(
    (score_value[css] for css in rating_classes if css in score_value),
    None,
)

# Let's give our links some names, in the order the links appear.
count_to_name = (
    'user',
    'beer',
    'brewer',
    'location',
    'checkin',
    'extra',  # safety for when there are six <a> tags in the details div
)

# This dictionary will eventually become our API object
scrape_obj = {
    'timestamp': timestamp.contents[0],
    'image': image_url,
    'score': my_score,
}

# Pair each link with its name. zip() stops at the shorter sequence, so any
# links beyond the six named ones are ignored rather than raising IndexError.
for name, bit in zip(count_to_name, API_bits):
    scrape_obj[name] = [u_url + bit['href'], bit.contents[0]]

# Turning it into a JSON object for you to use as you see fit.
print(json.dumps(scrape_obj))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment