Created
June 6, 2017 19:16
-
-
Save kevinpaulconnor/de0fec7733e1bdee951120005dcdcceb to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
""" Scrape the WDRV music logs looking for non-Eagles songs with "easy" in the title """
from __future__ import print_function

import datetime
import re

import requests
from bs4 import BeautifulSoup
base_url = 'http://wdrv.com/wdrv-music-logs/'

# Spoofed browser headers sent with every log request.
headers = {
    'Pragma': 'no-cache',
    'Origin': 'http://wdrv.com',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-US,en;q=0.8',
    'Upgrade-Insecure-Requests': '1',
    'Content-Type': 'application/x-www-form-urlencoded',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Cache-Control': 'no-cache',
    'Referer': 'http://wdrv.com/wdrv-music-logs/',
    'Cookie': 'visited=true; __gads=ID=0313412959dd87a8:T=1496763619:S=ALNI_MboLkiRvVQSPS5S7TNz8fJlY-iBXg; _ga=GA1.2.2104241702.1496763620; _gid=GA1.2.664781839.1496763620',
    'Connection': 'keep-alive',
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
}

# The scan requests the logs for START_DATE + 1 day through
# START_DATE + NUM_DAYS inclusive, i.e. 2013-01-01 through 2017-06-06.
START_DATE = datetime.datetime(2012, 12, 31)
NUM_DAYS = 1618

# File that receives the non-Eagles hits followed by the summary.
OUTPUT_PATH = 'wdrv'

# Compiled once; applied to the text of every <tr> in a day's log page.
EASY_CELL_RE = re.compile('<td>.*EASY.*</td>')


def classify_easy_rows(rows):
    """ Sort the "EASY" hits found in a day's table rows into three buckets.

    rows is an iterable of strings, each the markup of one <tr> element.

    Returns a tuple (peaceful, take_it, others):
      peaceful -- number of hits containing 'PEACEFUL EASY FEELING'
      take_it  -- number of remaining hits containing 'TAKE IT EASY'
      others   -- for every hit that is neither, the full list of matches
                  found in that row (so a row's list appears once per
                  unclassified hit in it)

    Classification looks only at the title text inside the matched span;
    the artist is never inspected.
    """
    peaceful = 0
    take_it = 0
    others = []
    for row in rows:
        matches = EASY_CELL_RE.findall(row)
        for item in matches:
            if 'PEACEFUL EASY FEELING' in item:
                peaceful += 1
            elif 'TAKE IT EASY' in item:
                take_it += 1
            else:
                others.append(matches)
    return peaceful, take_it, others


def format_summary(peaceful_dates, take_it_dates):
    """ Build the summary text written at the end of the output file.

    Each section is a count line followed by a parenthesised list of
    dates, where every date is followed by a comma. Both lists get an
    opening and a closing parenthesis.
    """
    sections = (('Peaceful', peaceful_dates), ('Take It', take_it_dates))
    parts = []
    for label, dates in sections:
        parts.append(label + ' count: ' + str(len(dates)) + '\n')
        parts.append('(' + ''.join(day + ',' for day in dates) + ')\n')
    return ''.join(parts)


def main():
    """ Fetch every daily log in the scan range and write the findings.

    Hits that are not one of the two known Eagles titles are written to
    OUTPUT_PATH as they are found (the date on one line, the row's match
    list on the next); the Eagles tallies are written once at the end.
    Days whose request does not return HTTP 200 are skipped.
    """
    peaceful_dates = []
    take_it_dates = []
    with open(OUTPUT_PATH, 'w') as f:
        for offset in range(1, NUM_DAYS + 1):
            date = START_DATE + datetime.timedelta(days=offset)
            print(date)  # progress indicator
            formatted_date = date.strftime('%Y-%m-%d')
            # The log to display is selected by a form field, so each
            # day is requested with a POST carrying that day's file name.
            payload = {'musiclogtoview': date.strftime('%Y%m%d') + '-MusicLog.txt'}
            r = requests.post(base_url, headers=headers, data=payload)
            # Compare by value: "is 200" only tests object identity.
            if r.status_code != 200:
                continue
            soup = BeautifulSoup(r.text, "html.parser")
            rows = [str(node) for node in soup.find_all('tr')]
            peaceful, take_it, others = classify_easy_rows(rows)
            # One date entry per hit, so the list lengths are the counts.
            peaceful_dates.extend([formatted_date] * peaceful)
            take_it_dates.extend([formatted_date] * take_it)
            for match in others:
                f.write(formatted_date + '\n')
                f.write(str(match) + '\n')
        f.write(format_summary(peaceful_dates, take_it_dates))


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment