@joshdick (forked from dperelman/download_podcast.py)
Download all items in a podcast.
#!/usr/bin/env python3
import datetime
import feedparser
import time
import os
import sys
from urllib.request import Request, urlopen
# Some hosts (e.g. Patreon) will reject requests with an HTTP 403
# if they appear to come from scripts, so pretend to be Firefox.
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:72.0) Gecko/20100101 Firefox/72.0'
# From http://stackoverflow.com/a/1160227
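# touch() creates the file if needed and sets its access/modification times.
# os.utime() only gained fd/dir_fd support in Python 3.3, hence the two variants.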
if sys.version_info < (3, 3):
    def touch(fname, mode=0o666, dir_fd=None, **kwargs):
        flags = os.O_CREAT | os.O_APPEND
        times = kwargs['times'] if 'times' in kwargs else None
        with os.fdopen(os.open(fname, flags, mode)) as f:
            os.utime(fname, times)
else:
    def touch(fname, mode=0o666, dir_fd=None, **kwargs):
        flags = os.O_CREAT | os.O_APPEND
        with os.fdopen(os.open(fname, flags=flags, mode=mode, dir_fd=dir_fd)) as f:
            os.utime(f.fileno() if os.utime in os.supports_fd else fname,
                     dir_fd=None if os.supports_fd else dir_fd, **kwargs)
# From http://stackoverflow.com/a/7244263
def downloadFile(url, file_name):
    req = Request(url)
    req.add_header('User-Agent', USER_AGENT)
    # Download the file from `url` and save it locally under `file_name`:
    with urlopen(req) as response, open(file_name, 'wb') as out_file:
        data = response.read()  # a `bytes` object
        out_file.write(data)
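# Some feeds publish an incorrect enclosure length; when the size on disk doesn't
# match it, ask the server for the real size via the Content-Length header.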
def getContentLength(url):
    print("Enclosure length mismatch. Checking content-length explicitly...")
    req = Request(url)
    req.add_header('User-Agent', USER_AGENT)
    with urlopen(req) as response:
        size = response.headers.get("content-length")
        return int(size) if size else 0
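# Walk the feed from oldest to newest, download each single-enclosure item that
# isn't already present at the expected size, and stamp each file with the
# episode's publication time.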
def downloadAll(feedURL):
    feed = feedparser.parse(feedURL)
    print("Processing feed %s..." % feed['feed']['title'])
    for post in reversed(feed.entries):
        print("Processing item %s... " % post.title, end="")
        if len(post.enclosures) != 1:
            print("Post has %d enclosures, not 1. Skipping post."
                  % len(post.enclosures))
        else:
            media = post.enclosures[0]
            mediaURL = media.href
            # cut the extension off the end to use in the filename
            mediaExt = mediaURL[mediaURL.rfind('.')+1:]
            quesPos = mediaExt.find('?')
            if quesPos != -1:
                mediaExt = mediaExt[:quesPos]
            mediaSize = int(media.length)  # media.length is of type str
            filename = "%s.%s" % (post.title.replace('/', '_'), mediaExt)
            stat = os.stat(filename) if os.path.isfile(filename) else None
            if stat and stat.st_size > 0 and (
                    stat.st_size == mediaSize
                    or mediaSize == 0
                    # manually verify enclosure length; it might be wrong
                    or getContentLength(mediaURL) == stat.st_size):
                print("File already downloaded. Skipping.")
            else:
                if stat and stat.st_size != mediaSize:
                    print("Incorrect file found. Redownloading... ", end="")
                    sys.stdout.flush()
                else:
                    print("Downloading... ", end="")
                    sys.stdout.flush()
                # Download the file...
                downloadFile(mediaURL, filename)
                print("Done.")
            # From http://stackoverflow.com/a/1697907
            pubTimestamp = time.mktime(post.published_parsed)
            # ... and set its created time to the publication time.
            touch(filename, times=(stat.st_atime if stat else pubTimestamp,
                                   pubTimestamp))
_feedURLfilename = ".podcast_source"
if __name__ == "__main__":
    # Remember the feed URL in a hidden file (named by _feedURLfilename).
    # Always use the command-line URL if given and remember it in that file.
    if len(sys.argv) == 2:
        feedURL = sys.argv[1]
        with open(_feedURLfilename, 'w') as feedURLfile:
            feedURLfile.write(feedURL)
    elif len(sys.argv) == 1 and os.path.isfile(_feedURLfilename):
        with open(_feedURLfilename, 'r') as feedURLfile:
            feedURL = feedURLfile.read()
    else:
        print("USAGE: %s feedURL" % sys.argv[0])
        sys.exit(1)

    downloadAll(feedURL)
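
Typical usage, assuming the script is saved as download_podcast.py and made
executable (the feed URL below is only a placeholder): pass the feed URL on the
first run; it is cached in .podcast_source, so later runs in the same directory
can omit it.

    ./download_podcast.py https://example.com/feed.xml
    ./download_podcast.py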