Last active
January 10, 2020 11:24
-
-
Save voznesenskym/a4ff7681aa723bd25b9ea7ee5177827e to your computer and use it in GitHub Desktop.
Bulk Download the Toronto emotional speech set (TESS) https://tspace.library.utoronto.ca/handle/1807/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
# Python Version: 3.4.2
# bs4 version: 4.3.2-2
import pickle
from urllib.parse import urljoin
from urllib.request import urlopen

from bs4 import BeautifulSoup
# Accumulates the absolute URL of every bitstream link found by fetch_links().
links = []


def fetch_links(path):
    """Collect the bitstream links found on one page.

    Fetches ``path``, parses the returned HTML and appends the absolute URL
    of every anchor whose ``href`` contains ``"bitstream"`` to the
    module-level ``links`` list.

    Args:
        path: URL of the page to scan.
    """
    # Close the HTTP response as soon as the body has been read.
    with urlopen(path) as response:
        markup = response.read()
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(markup, "html.parser")
    for anchor in soup.find_all('a'):
        href = anchor.get('href')
        # Anchors without an href attribute yield None; skip them instead of
        # failing on the substring test.
        if not href or "bitstream" not in href:
            continue
        print("fetched:" + path)
        # urljoin resolves site-relative hrefs against the TSpace host and
        # leaves already-absolute hrefs untouched.
        links.append(urljoin("https://tspace.library.utoronto.ca", href))
# Handle IDs 24488 through 24501 are the range the University of Toronto
# used for this dataset.
for handle_id in range(24488, 24502):
    path = "https://tspace.library.utoronto.ca/handle/1807/{}".format(handle_id)
    print("fetching: {}".format(path))
    fetch_links(path)

print("Fetched {} paths".format(len(links)))

# Write the collected links to disk as a pickle.
with open('tspace-links.pickle', 'wb') as out_file:
    pickle.dump(links, out_file)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment