Skip to content

Instantly share code, notes, and snippets.

@fpom
Created March 2, 2016 08:03
Show Gist options
  • Save fpom/ef83e95afa71605c71b7 to your computer and use it in GitHub Desktop.
Save fpom/ef83e95afa71605c71b7 to your computer and use it in GitHub Desktop.
Get links from The David W. Niven Collection of Early Jazz Legends
# Print a shell script (to stdout) that downloads every item of the
# archive.org collection at `url` with wget.
#
# Originally written for Python 2.x; it still runs there unchanged and now
# also runs on Python 3.x.  Typical use:
#
#     mkdir -p dl && python thisscript.py > fetch.sh && sh fetch.sh
#
# (the generated commands write into dl/, which this script does not create).
try:
    import urlparse                      # Python 2
except ImportError:
    import urllib.parse as urlparse      # Python 3: same urljoin/urlparse API

url = "https://archive.org/details/davidwnivenjazz"


def shell_quote(text):
    """Return *text* wrapped in single quotes, safe for a POSIX shell.

    Inside single quotes the shell treats every character literally, so
    the only character needing care is the single quote itself: it is
    emitted as ``'\\''`` (close the quotes, an escaped quote, reopen).
    """
    return "'" + text.replace("'", "'\\''") + "'"


def item_links(hrefs, base=url):
    """Return absolute URLs of the item pages found among *hrefs*.

    hrefs -- iterable of raw ``href`` values (``None`` entries are ignored).
    base  -- collection page URL; used to resolve relative links and to
             recognise links that point back at the collection itself.

    Only ``/details/...`` links are kept, links to the collection itself
    are dropped, and each item is returned once, in first-seen order, so
    that no item page is fetched twice.
    """
    own_path = urlparse.urlparse(base).path
    seen = set()
    links = []
    for href in hrefs:
        if not href or not href.startswith("/details/"):
            continue
        if href.startswith(own_path):
            continue
        link = urlparse.urljoin(base, href)
        if link not in seen:
            seen.add(link)
            links.append(link)
    return links


def wget_command(href, base=url):
    """Return the wget command line for one download link, or ``None``.

    href -- raw ``href`` of a download badge on an item page.
    base -- URL used to resolve *href* when it is relative.

    Links containing ``/formats=`` are skipped (``None`` is returned), as
    is a missing href.  The last path segment of *href* names the output
    file ``dl/<segment>.zip``.  Both shell arguments are quoted with
    shell_quote() because they come from scraped, untrusted HTML.
    """
    if not href or "/formats=" in href:
        return None
    name = href.split("/")[-1]
    return "wget --continue -O %s %s" % (
        shell_quote("dl/%s.zip" % name),
        shell_quote(urlparse.urljoin(base, href)),
    )


def main():
    """Scrape the collection page and print one wget command per download.

    For every item page a ``# <url>`` comment line is printed first,
    followed by the wget commands for that item's download badges.
    """
    # Imported here so the pure helpers above can be imported and tested
    # without the third-party pyquery package or network access.
    from pyquery import PyQuery as pq

    index = pq(url)
    for link in item_links(a.attrib.get("href") for a in index("a[href]")):
        # A single pre-formatted argument prints identically on Python 2 and 3.
        print("# %s" % link)
        page = pq(link)
        for badge in page("a.hover-badge"):
            # The selector does not guarantee an href attribute, hence .get().
            command = wget_command(badge.attrib.get("href"))
            if command is not None:
                print(command)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment