Skip to content

Instantly share code, notes, and snippets.

@Buffer0x7cd
Last active April 8, 2018 15:01
Show Gist options
  • Save Buffer0x7cd/73b86672fc0293642f14967022b90331 to your computer and use it in GitHub Desktop.
Save Buffer0x7cd/73b86672fc0293642f14967022b90331 to your computer and use it in GitHub Desktop.
''' Create a virtualenv with python3 or run with the default OS installation (adjust the shebang based on your environment).
Install the script dependencies with pip (requests, beautifulsoup4, lxml).
Make the script executable with chmod +x crawler.py.
Run the script with ./crawler.py
'''
#!/usr/bin/env python3
from bs4 import BeautifulSoup
import requests
import os
import shutil
# Download URLs collected by build_screencastlist() and consumed by main().
screencast_list = []

# Site root; hrefs scraped from the catalog page are appended to it.
ROOT_DOMAIN = "https://www.destroyallsoftware.com"
# Prefix a candidate URL must start with to be treated as a screencast link.
SCREENCAST_DOMAIN = ROOT_DOMAIN + "/screencasts/catalog/"
# Appended to every scraped href to form the download URL.
# NOTE(review): "resoluation" looks like a misspelling of "resolution"; it is
# kept unchanged here because the name the server expects cannot be verified
# from this file. The lengths below are derived, so correcting it later is a
# one-line change.
DOWNLOAD_SUFFIX = "/download?resoluation=1080p"
# Catalog page that main() fetches (same as SCREENCAST_DOMAIN without the
# trailing slash).
TARGET_HOST = ROOT_DOMAIN + "/screencasts/catalog"

# Slice bounds used by get_screencast() to cut the screencast name out of a
# download URL: drop the SCREENCAST_DOMAIN prefix and the DOWNLOAD_SUFFIX tail.
FILENAME_START_INDEX = len(SCREENCAST_DOMAIN)
FILENAME_END_INDEX = -len(DOWNLOAD_SUFFIX)
FILE_SUFFIX = ".mp4"

# Length of the URL built from the bare catalog href ("/screencasts/catalog").
# That URL also starts with SCREENCAST_DOMAIN once DOWNLOAD_SUFFIX's leading
# "/" is appended, so build_screencastlist() only accepts strictly longer URLs.
VALID_LINK_LENGTH = len(TARGET_HOST) + len(DOWNLOAD_SUFFIX)
def build_screencastlist(html_doc):
    """Collect screencast download URLs from the catalog page HTML.

    Every ``<a>`` tag with a non-empty ``href`` is turned into a candidate
    URL of the form ``ROOT_DOMAIN + href + DOWNLOAD_SUFFIX``. A candidate is
    appended to the module-level ``screencast_list`` when it starts with
    ``SCREENCAST_DOMAIN``, is longer than ``VALID_LINK_LENGTH`` and is not
    already in the list.

    Args:
        html_doc: HTML text of the catalog page.
    """
    soup = BeautifulSoup(html_doc, "lxml")
    for anchor in soup.find_all('a'):
        href = anchor.get('href')
        if not href:
            continue
        candidate = ROOT_DOMAIN + href + DOWNLOAD_SUFFIX
        # startswith() replaces a hard-coded [:55] slice that had to be kept
        # in sync with the length of SCREENCAST_DOMAIN by hand.
        if not candidate.startswith(SCREENCAST_DOMAIN):
            continue
        # The strict comparison rejects the link to the catalog page itself,
        # whose candidate URL is exactly VALID_LINK_LENGTH characters long.
        if len(candidate) <= VALID_LINK_LENGTH:
            continue
        # A screencast linked more than once on the page is queued only once,
        # so it is not downloaded repeatedly.
        if candidate not in screencast_list:
            screencast_list.append(candidate)
def get_screencast(url):
    """Download one screencast into the current working directory.

    The output file name is the part of ``url`` between
    ``FILENAME_START_INDEX`` and ``FILENAME_END_INDEX`` with ``FILE_SUFFIX``
    appended.

    Args:
        url: Download URL, as built by build_screencastlist().

    Returns:
        The name of the written file when the server answers with HTTP 200;
        otherwise ``None``, after printing the URL and the status code.
    """
    screencast_name = url[FILENAME_START_INDEX:FILENAME_END_INDEX] + FILE_SUFFIX
    # With stream=True the connection stays checked out until the response is
    # closed; the with-block releases it on both the success and error paths.
    with requests.get(url, stream=True) as response:
        if response.status_code != 200:
            print("Error during downloading file {} return code: {}".format(url, response.status_code))
            return None
        # Have the raw stream undo any Content-Encoding (e.g. gzip) so the
        # bytes copied to disk are the decoded payload.
        response.raw.decode_content = True
        with open(screencast_name, 'wb') as f:
            shutil.copyfileobj(response.raw, f)
    return screencast_name
def main():
    """Fetch the catalog page and download every screencast it links to.

    Requests ``TARGET_HOST``; on any status other than 200 it prints an error
    message and the status code and stops. Otherwise it fills
    ``screencast_list`` via build_screencastlist(), then prints and downloads
    each URL in turn, reporting every file that get_screencast() returns a
    name for.
    """
    response = requests.get(TARGET_HOST)
    if response.status_code != 200:
        print("Some error occured during establishing initial connection")
        print(response.status_code)
        return

    print("Connection Succeeded")
    build_screencastlist(response.text)
    for url in screencast_list:
        print(url)
        saved_as = get_screencast(url)
        # get_screencast() returns None when the download failed.
        if saved_as:
            print("Completed downloading screencasr: {}".format(saved_as))
# Start the crawl only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment