Dump media from a Tumblr blog - a Node.js script
"use strict"; | |
/* | |
tumblr-downloader.js - a Node.js script to download all media files posted in | |
a Tumblr blog, most recent first. | |
Usage: | |
node tumblr-downloader.js | |
Options - domain to download from, API key, and so on - are exposed below. | |
It's not good behavior in general, but it's well adapted to how I use this: | |
often, but for very few sites (and so very few combinations of arguments.) | |
Remember to set API_KEY. | |
Setup: | |
Built with ES6 and Node.js v6.2.2. After installing Node.js... | |
- You'll need a Tumblr API key set in API_KEY below. An example of one is | |
available in the official Tumblr API demo. (Tumblr offers the source in good | |
faith; please don't abuse that access.) | |
- Download the various modules required via npm: | |
"npm install underscore cli async request sync-request wu mkdirp" | |
Fine details: | |
- Since media files on Tumblr don't come attached with their original | |
filenames, each download is given a computed filename based on the tumblr | |
title and post description. | |
- Downloads that correspond to filenames already existing in the target | |
folder will be skipped. | |
- A JSON "metadata log" is output on stdout for every file downloaded; | |
progress information on stderr. | |
- The dumper downloads as many media files as it can find. | |
- To quit the dumper at any time, send SIGINT to the application - Control-C | |
in the vast majority of setups. The dumper should remove any half-downloaded | |
files. | |
I put this source in the public domain. | |
*/ | |
const DOMAIN = (() => { throw "set me - to wallpapers.tumblr.com, for example."; })();
const DOWNLOAD_DIRECTORY = (() => { throw "set me - to ~/Photos/Wallpapers, for example."; })();
const API_KEY = (() => { throw "set me to a Tumblr API key - see the documentation."; })();
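// For instance, a configured copy might read as follows (hypothetical values;
// substitute your own blog, directory, and key):
//   const DOMAIN = "wallpapers.tumblr.com";
//   const DOWNLOAD_DIRECTORY = "/Users/me/Photos/Wallpapers";
//   const API_KEY = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx";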
// Setting these will change behavior from that documented above.
// Their use is unsupported.
const NUM_CONCURRENT_DOWNLOADS = 2;
const REDOWNLOAD_EXISTING = "REWRITING";
const NUM_TARGET_DOWNLOADS = Number.POSITIVE_INFINITY;
// Modules you'll have to install.
const _ = require('underscore');
const cli = require('cli');
const async = require('async');
const request = require('request');
const srequest = require('sync-request');
const wu = require('wu');
const mkdirp = require('mkdirp');
// Standard library modules, and a few monkeypatches.
const fs = require('fs');
const https = require('https');
const url = require('url');
const path = require('path');
console.info = console.warn;
// We reserve stdout as a source of structured output data;
// logging then goes, by default, to stderr.
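// Because of that split, one way to capture the metadata log separately from
// the progress chatter is plain shell redirection, e.g.:
//   node tumblr-downloader.js > metadata.log 2> progress.log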
// Begin control flow here.
main();
// Error handling policy - similar to Erlang's fail-fast:
// - unhandled - deletes the working directory:
//     Serious logic errors.
//     Failures in the main thread.
// - handled and suppressed:
//     Failures in worker threads.
function main() {
  // Create the UI state.
  let ui = "a placeholder.";
  // Set up filesystem state.
  mkdirp.sync(DOWNLOAD_DIRECTORY);
  // Build a stream of posts that we've
  // - trimmed to the requested size, and
  // - converted into "download specifications".
  let downloads = wu(posts(DOMAIN, API_KEY))
    .take(NUM_TARGET_DOWNLOADS)
    .concatMap(postToDownloads);
  // ...then, for each download,
  async.eachLimit(downloads, NUM_CONCURRENT_DOWNLOADS,
    (download, cont) => {
      console.info(DOWNLOAD_DIRECTORY, download["reporting"]["fname"]);
      let guarded_IO_download = IO_handlingSIGINT(
        IO_download,
        () => {
          let fpath = path.join(
            DOWNLOAD_DIRECTORY,
            download["reporting"]["fname"]
          );
          console.info("Handled SIGINT; gracefully removing ", fpath);
          if (fs.existsSync(fpath)) {
            fs.unlinkSync(fpath);
          }
        }
      );
      // download the post (rolling back if interrupted);
      guarded_IO_download.apply(this,
        download["downloadInvoc"].concat(ui, _cont =>
          // output metadata on stdout.
          IO_outputMetadata(download, cont)
        )
      );
    },
    // When done with all of our downloads, applaud.
    () => console.info("main: Done.")
  );
}
// These IO functions share a property with all functions implementing
// asynchronous IO in Node.js: they guarantee their last argument,
// "cont", will be called after all asynchronous operations have finished.
process.once('SIGINT', () => {
  process.exit();
});
function IO_handlingSIGINT(body, rollback) {
  // An IO function combinator that installs a rollback callback before
  // entering the body, and removes it in the continuation (regardless
  // of whether the callback is passed an error or not).
  return function IO_wrapper() {
    let args = Array.from(arguments); args.pop();
    let continuation = arguments[arguments.length - 1];
    process.setMaxListeners(1 + process.getMaxListeners());
    process.prependOnceListener('SIGINT', rollback);
    return body(...args, err => {
      process.removeListener('SIGINT', rollback);
      return continuation(err);
    });
  };
}
function IO_download(url, fname, targetDir, ui, cont) {
  console.info('IO_download: beginning download', fname);
  if (!REDOWNLOAD_EXISTING) {
    console.info("IO_download: user has chosen to replace posts already downloaded");
  }
  let fpath = path.join(targetDir, fname);
  if (REDOWNLOAD_EXISTING && fs.existsSync(fpath)) {
    console.info("IO_download: skipping existing download " +
      fpath + " as requested by user.");
    return cont();
  }
  let file = fs.createWriteStream(fpath);
  return https.get(url, res => {
    let totalBytes = parseInt(res.headers['content-length'], 10);
    res.pipe(file);
    res.on('data', function(downloadChunk) {
      // Do something interesting: show a progress bar, perhaps.
    });
    file.on('finish', function() {
      console.info('IO_download: finished downloading', fname, fpath);
      file.close(cont);
    });
    file.on('error', function(err) {
      console.error('IO_download: error downloading', fname, err);
      fs.unlink(fpath, () => cont());
    });
  });
}
function IO_outputMetadata(metadata_record, cont) {
  console.log(JSON.stringify(metadata_record));
  cont();
}
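// Each emitted line is one JSON object shaped like the download specs built
// in postToDownloads below; roughly (illustrative values, not real output):
//   {"reporting":{"title":"some-slug","source_title":"...","source_url":"...",
//    "fname":"some-slug-0.jpg"},"tumblrMetadata":{...},"downloadInvoc":[...]}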
// Operations on posts.
function postToHumanTitle(post, opt_suffix) {
  // Extract, from the post, a title that is, in order of acceptability,
  // (a) human-readable,
  // (b) human-memorisable.
  // Given a substantial amount of metadata, we select from many possibilities:
  if (post['slug'] !== "") {
    return post['slug'];
  }
  // (Check to see whether the source_url has a descriptive name, or
  // whether it simply uses a numerical post ID),
  const urlFname = addr => path.basename(url.parse(addr).pathname);
  if (undefined !== post['source_url'] && "" !== post['source_url']) {
    let srcUrlB = urlFname(post['source_url']);
    if (isNaN(parseInt(srcUrlB, 10))) {
      return srcUrlB;
    }
  }
  // The same for the reblog_key: is it a name, or a number?
  if (isNaN(parseInt(post['reblog_key'], 10))) {
    return post['reblog_key'];
  }
  // If that's failed, use the source_title + reblog_key...
  if (undefined !== post['source_title'] && "" !== post['source_title']) {
    return post['source_title'] + '-' + post['reblog_key'];
  }
  // And if the source_title doesn't exist either, use the reblog_key only.
  return post['reblog_key'];
}
function postToDownloads(post) {
  // TODO: remove some of the duplicated object literal structure
  // here with an object property spread operator.
  let name = postToHumanTitle(post);
  let fname = (name, dotExt) => name + dotExt;
  console.info('postToDownloads: extracted from response post ' + name);
  switch (post['type']) {
    case 'video':
    {
      let dotExt = path.extname(url.parse(post['video_url']).pathname);
      return [{
        reporting: {
          title: name,
          source_title: post['source_title'],
          source_url: post['source_url'],
          fname: fname(name, dotExt)
        },
        tumblrMetadata: post,
        downloadInvoc: [post['video_url'], fname(name, dotExt), DOWNLOAD_DIRECTORY]
      }];
    }
    case 'photo':
    {
      return post['photos'].map(function(photo, idx) {
        let dotExt = path.extname(url.parse(photo['original_size']['url']).pathname);
        return {
          reporting: {
            title: name,
            source_title: post['source_title'],
            source_url: post['source_url'],
            fname: fname(name + '-' + idx, dotExt)
          },
          tumblrMetadata: post,
          downloadInvoc: [photo['original_size']['url'], fname(name + '-' + idx, dotExt), DOWNLOAD_DIRECTORY]
        };
      });
    }
    default:
    {
      console.error('postToDownloads: passed a post I can\'t handle: ' + name + ', of type ' + post['type']);
      return [];
    }
  }
}
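// Worked example of the naming scheme above (illustrative): a photo post with
// slug "sunset-over-tokyo" and two .jpg images yields the filenames
// "sunset-over-tokyo-0.jpg" and "sunset-over-tokyo-1.jpg".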
// Operations on arrays of download specs.
function downloadsLessExisting(downloads) {
  // TODO: I should be reading some metadata report and comparing
  // Tumblr's authoritative post IDs, rather than simply looking at filenames
  // - the mapping between posts and filenames will change, after all.
  // SUBTLETY: this will treat posts with multiple images as effectively one
  // download.
  // Forgive me - the forbidden fruit of synchronous
  // IO lies bitten.
  return downloads.filter(d => !downloadExistsP(d));

  function downloadExistsP(download) {
    let potentialPath = path.join(DOWNLOAD_DIRECTORY, download['reporting']['fname']);
    let existsP = fs.existsSync(potentialPath);
    if (existsP) {
      console.info('downloadExistsP: ' +
        download['reporting']['title'] + ' is already downloaded at ' +
        potentialPath + '. Filtering out.');
    } else {
      console.info('downloadExistsP: ' +
        download['reporting']['title'] + ' is not yet downloaded. Not filtering out.');
    }
    return existsP;
  }
}
// An iterator that continuously returns Tumblr posts, newest-first
// - sequential access to Tumblr posts is the API's sole form of
// access to posts.
function* posts(domain, apiKey) {
  // Define (but don't enter) a loop that downloads post descriptors from
  // within a certain range, parses them, and yields them to the user,
  // one-by-one...
  // (The loop iterates by tail-calling itself.)
  function* download20MorePosts(currBatch) {
    // Ask Tumblr for a fixed number of posts.
    var res = srequest(
      'GET',
      'https://api.tumblr.com/v2/blog/' + domain + '/posts?api_key=' + apiKey + '&offset=' + (currBatch * 20)
    );
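    // (Assumption: the v2 /posts endpoint pages 20 posts per request by
    // default, which is what the offset arithmetic here relies on.)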
    // The request may fail, of course; check for these cases.
    if (res.statusCode == 404) {
      console.error("posts: 404 from Tumblr. Check the blog identifier or API key.");
      throw res.statusCode;
    } else if (res.statusCode != 200) {
      console.error("posts: non-success status code I can't handle: ", res.statusCode);
      throw res.statusCode;
    } else if (res.statusCode == 200) {
      // Or the status code indicates success, and we may continue.
      console.info(
        "posts: now processing posts " +
        currBatch * 20 +
        ' - ' +
        (currBatch + 1) * 20
      );
      // Destructure the raw response;
      let posts = JSON.parse(res.body)['response']['posts'];
      // At which point we've done all we can to process the posts -
      // either we've run out of posts, and so terminate; or we
      // hand them over to the user.
      if (posts.length == 0) {
        return;
      } else {
        yield* posts;
        yield* download20MorePosts(currBatch + 1);
      }
    }
  }
  // ...now actually start the loop.
  yield* download20MorePosts(0);
}