@joshdick (forked from dperelman/download_podcast.py)
Download all items in a podcast.
#!/usr/bin/env python3
import datetime
import feedparser
import time
import os
import sys
from urllib.request import Request, urlopen
# Some hosts (e.g. Patreon) will reject requests with an HTTP 403
# if they appear to come from scripts, so pretend to be Firefox.
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:72.0) Gecko/20100101 Firefox/72.0'
# From http://stackoverflow.com/a/1160227
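# touch() creates the file if needed and sets its access/modification times.
# os.utime() only gained fd/dir_fd support in Python 3.3, hence the two variants.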
if sys.version_info < (3, 3):
    def touch(fname, mode=0o666, dir_fd=None, **kwargs):
        flags = os.O_CREAT | os.O_APPEND
        times = kwargs['times'] if 'times' in kwargs else None
        with os.fdopen(os.open(fname, flags, mode)) as f:
            os.utime(fname, times)
else:
    def touch(fname, mode=0o666, dir_fd=None, **kwargs):
        flags = os.O_CREAT | os.O_APPEND
        with os.fdopen(os.open(fname, flags=flags, mode=mode, dir_fd=dir_fd)) as f:
            os.utime(f.fileno() if os.utime in os.supports_fd else fname,
                     dir_fd=None if os.supports_fd else dir_fd, **kwargs)
# From http://stackoverflow.com/a/7244263
def downloadFile(url, file_name):
    req = Request(url)
    req.add_header('User-Agent', USER_AGENT)
    # Download the file from `url` and save it locally under `file_name`:
    with urlopen(req) as response, open(file_name, 'wb') as out_file:
        data = response.read()  # a `bytes` object
        out_file.write(data)
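# Some feeds publish an incorrect enclosure length; when the size on disk doesn't
# match it, ask the server for the real size via the Content-Length header.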
def getContentLength(url):
    print("Enclosure length mismatch. Checking content-length explicitly...")
    req = Request(url)
    req.add_header('User-Agent', USER_AGENT)
    with urlopen(req) as response:
        size = response.headers.get("content-length")
        return int(size) if size else 0
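# Walk the feed from oldest to newest, download each single-enclosure item that
# isn't already present at the expected size, and stamp each file with the
# episode's publication time.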
def downloadAll(feedURL):
    feed = feedparser.parse(feedURL)
    print("Processing feed %s..." % feed['feed']['title'])
    for post in reversed(feed.entries):
        print("Processing item %s... " % post.title, end="")
        if len(post.enclosures) != 1:
            print("Post has %d enclosures, not 1. Skipping post."
                  % len(post.enclosures))
        else:
            media = post.enclosures[0]
            mediaURL = media.href
            # cut the extension off the end to use in the filename
            mediaExt = mediaURL[mediaURL.rfind('.')+1:]
            quesPos = mediaExt.find('?')
            if quesPos != -1:
                mediaExt = mediaExt[:quesPos]
            mediaSize = int(media.length)  # media.length is of type str
            filename = "%s.%s" % (post.title.replace('/', '_'), mediaExt)
            stat = os.stat(filename) if os.path.isfile(filename) else None
            if stat and stat.st_size > 0 and (
                    stat.st_size == mediaSize
                    or mediaSize == 0
                    # manually verify enclosure length; it might be wrong
                    or getContentLength(mediaURL) == stat.st_size):
                print("File already downloaded. Skipping.")
            else:
                if stat and stat.st_size != mediaSize:
                    print("Incorrect file found. Redownloading... ", end="")
                    sys.stdout.flush()
                else:
                    print("Downloading... ", end="")
                    sys.stdout.flush()
                # Download the file...
                downloadFile(mediaURL, filename)
                print("Done.")
            # From http://stackoverflow.com/a/1697907
            pubTimestamp = time.mktime(post.published_parsed)
            # ... and set its created time to the publication time.
            touch(filename, times=(stat.st_atime if stat else pubTimestamp,
                                   pubTimestamp))
_feedURLfilename = ".podcast_source"
if __name__ == "__main__":
    # Remember the feed URL in a hidden file (named by _feedURLfilename).
    # Always use the command-line URL if given and remember it in that file.
    if len(sys.argv) == 2:
        feedURL = sys.argv[1]
        with open(_feedURLfilename, 'w') as feedURLfile:
            feedURLfile.write(feedURL)
    elif len(sys.argv) == 1 and os.path.isfile(_feedURLfilename):
        with open(_feedURLfilename, 'r') as feedURLfile:
            feedURL = feedURLfile.read()
    else:
        print("USAGE: %s feedURL" % sys.argv[0])
        sys.exit(1)

    downloadAll(feedURL)
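
Typical usage, assuming the script is saved as download_podcast.py and made
executable (the feed URL below is only a placeholder): pass the feed URL on the
first run; it is cached in .podcast_source, so later runs in the same directory
can omit it.

    ./download_podcast.py https://example.com/feed.xml
    ./download_podcast.py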