Skip to content

Instantly share code, notes, and snippets.

@goweiting
Created January 14, 2017 22:26
Show Gist options
  • Save goweiting/4ce396c7d18d311af7927b258177bd2b to your computer and use it in GitHub Desktop.
Save goweiting/4ce396c7d18d311af7927b258177bd2b to your computer and use it in GitHub Desktop.
Web scraper for downloading documents from a webpage
import os
import urllib
import urllib.request
from urllib.parse import urljoin

from bs4 import BeautifulSoup
# Download every PDF linked from the course page into the current directory.
# NOTE(review): the original used the Python 2 top-level urllib API
# (urllib.urlopen / urllib.urlretrieve); this uses the Python 3
# urllib.request equivalents.
mainLink = 'http://www.cs.cmu.edu/~aarti/Class/10701_Spring14/lecs.html'
page = urllib.request.urlopen(mainLink)

# Parse the fetched HTML with the lxml parser and collect every anchor tag.
soup = BeautifulSoup(page.read(), 'lxml')
links = soup.body.find_all('a')

for link in links:
    hreflink = link.get('href')
    # Change '.pdf' below to scrape a different file extension.
    if hreflink is not None and hreflink.endswith('.pdf'):
        # The filename is usually the last path segment of the href.
        name = hreflink.split('/')[-1]
        print(hreflink, name)
        # Resolve the href against the page URL so both absolute and
        # relative links work — replaces the original's hard-coded
        # base-URL retry fallback.
        fileurl = urljoin(mainLink, hreflink)
        try:
            urllib.request.urlretrieve(fileurl, name)
            print('done')
        except IOError:
            print(name + ' cannot be downloaded')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment