cURL Fitbit community group leaderboards with pycurl, scrape the pages with BeautifulSoup, and send the results to Redis (written to pull the data for a Fitbit dashboard built on the dashing.io framework)
#!/usr/bin/python
from bs4 import BeautifulSoup
import pycurl
import re
import os
from urllib import urlencode
from io import BytesIO
from StringIO import StringIO
import sys
import redis
import time

class getFitbitData:
    cookieDir = './fbcookie.txt'  # where we're storing our cookies
    # Config for the Redis server we're connecting to
    pool = redis.ConnectionPool(host='0.0.0.0', port=6379, password='ifneeded', db=12)
    redisServer = redis.Redis(connection_pool=pool)
    pipe = redisServer.pipeline()
    # 2D array: the group name used in the Redis keys, and the group's Fitbit URL
    groups = [["XXXX", "https://www.fitbit.com/group/XXXXXX"], ["XXXX", "https://www.fitbit.com/group/XXXXXX"]]
    date = time.strftime("%Y-%m-%d")

    def __init__(self):
        # Fitbit account credentials
        self.password = 'yourpass'
        self.user = 'youruser'
    def getHTML(self, groupURL, page):
        print "authenticate"
        buffer = BytesIO()
        c = pycurl.Curl()
        c.setopt(c.SSL_VERIFYPEER, False)
        c.setopt(c.FOLLOWLOCATION, True)
        c.setopt(c.TIMEOUT, 60)
        c.setopt(c.USERAGENT, 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:8.0) Gecko/20100101 Firefox/8.0')
        c.setopt(c.URL, 'https://www.fitbit.com/login')
        c.setopt(c.WRITEFUNCTION, buffer.write)
        c.perform()
        html = str(buffer.getvalue())
        # Pull the hidden form values out of the login page
        if "_sourcePage" in html:
            rex = re.compile("input type=\"hidden\" name=\"_sourcePage\" value=\"(.*?)\"")
            sourcepage = rex.search(html).groups()[0]
        if "__fp" in html:
            rex = re.compile("input type=\"hidden\" name=\"__fp\" value=\"(.*)\"")
            fp = rex.search(html).groups()[0]
        datastuff = {'login': 'Log In', 'disableThirdPartyLogin': 'false', 'email': self.user, 'password': self.password, 'rememberMe': 'true'}
        # Post the login form and keep the session cookies in the cookie jar
        c.setopt(c.USERAGENT, 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:8.0) Gecko/20100101 Firefox/8.0')
        c.setopt(c.URL, 'https://www.fitbit.com/login')
        c.setopt(c.COOKIEJAR, self.cookieDir)
        c.setopt(c.COOKIEFILE, self.cookieDir)
        c.setopt(c.WRITEFUNCTION, buffer.write)
        c.setopt(c.FOLLOWLOCATION, True)
        c.setopt(c.POST, True)
        c.setopt(c.POSTFIELDS, urlencode(datastuff))
        c.perform()
        buffer.flush()
        # Fetch the requested leaderboard page with the authenticated session
        buffer = BytesIO()
        c.setopt(c.HTTPGET, True)  # switch the handle back to GET after the login POST
        c.setopt(c.USERAGENT, 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:8.0) Gecko/20100101 Firefox/8.0')
        c.setopt(c.URL, groupURL + '/leaders?timeWindow=CURRENT_MONTH&page=' + page)
        c.setopt(c.COOKIEJAR, self.cookieDir)
        c.setopt(c.COOKIEFILE, self.cookieDir)
        c.setopt(c.WRITEFUNCTION, buffer.write)
        c.perform()
        html = str(buffer.getvalue())
        return html
        # c.close()
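
    # The parseHTML selectors below assume leaderboard markup roughly along
    # these lines (illustrative only, reconstructed from the selectors used;
    # not Fitbit's exact HTML):
    #
    #   <div class="leaderboardCell left">
    #     <div class="info">
    #       <a href="/user/XXXXXX">Jane Doe</a>
    #       <li class="stat ellipsis">123,456 steps</li>
    #       <li class="average ellipsis">4,567 steps</li>
    #     </div>
    #   </div>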
    def parseHTML(self, html, group):
        soup = BeautifulSoup(html, "html.parser")
        count = 0
        # Walk each leaderboard cell and pull out the member name, monthly step total and daily average
        for leftCell in soup.find_all("div", {"class": "leaderboardCell left"}):
            for mylink in leftCell.find_all("div", {"class": "info"}):
                for link in mylink.find_all("a"):
                    name = link.get_text()
                for link in mylink.find_all("li", {"class": "stat ellipsis"}):
                    t = link.get_text()
                    t = "".join(t.split())
                    t = t[:-5]  # drop the trailing "steps" label
                    steps = int(t.replace(',', ''))
                for link in mylink.find_all("li", {"class": "average ellipsis"}):
                    a = link.get_text()[:-5]
                    avg = int(a.replace(',', ''))
                print name
                print steps
                # Add the member to the overall and per-group sorted sets, scored by steps
                self.redisServer.zadd("all:steps", name, steps)
                self.redisServer.zadd(group + ":steps", name, steps)
                count += 1
        return count

fit = getFitbitData()
# Run through the groups; if a page has 25 listings there may be more, so go to the next page
for group in fit.groups:
    html = fit.getHTML(group[1], "0")
    listCount = fit.parseHTML(html, group[0])
    page = 1
    while listCount == 25:
        print "run again"
        html = fit.getHTML(group[1], str(page))
        listCount = fit.parseHTML(html, group[0])
        page += 1
    # Calculate the group average: pull all scores back from Redis,
    # total them, and count every member with a step count greater than 0
    s = 0
    d = 0
    z = fit.redisServer.zrange(group[0] + ':steps', 0, -1, withscores=True)
    for x in z:
        if x[1] > 0:
            s = s + x[1]
            d += 1
    # math for the average (guard against dividing by zero when nobody has steps)
    avg = s / d if d else 0
    fit.redisServer.delete(group[0] + ":avg:" + fit.date)
    fit.redisServer.sadd(group[0] + ":avg:" + fit.date, avg)
print "main done"