Skip to content

Instantly share code, notes, and snippets.

@jharjono
Created February 14, 2012 06:05
Show Gist options
  • Save jharjono/1824149 to your computer and use it in GitHub Desktop.
tumblr-scraper.rb
# Quick hack - really hacky, untested, and probably breaks a lot
require 'rubygems'
require 'mechanize'
module TumblrScraper
# Wraps a single Tumblr post node and pulls out its identifying pieces.
# NOTE(review): raises if the post node has no child <a> — the image
# subclass deliberately relies on that to skip non-post markup.
class TumblrPost
  attr_accessor :url, :post_id, :tumblr_id

  # @param [Nokogiri::XML::Element] post - a DOM element representing a Tumblr div.post
  # Permalink shape assumed: http://<tumblr_id>.tumblr.com/post/<post_id>/...
  def initialize(post)
    @url = post.search("./a").first.attr("href")
    segments = @url.split("/")
    @post_id = segments[4].to_i           # numeric id, 5th path segment
    @tumblr_id = segments[2].split(".").first # host minus ".tumblr.com"
    puts "processing post_id #{@post_id} #{@url}"
  end
end
# A Tumblr post that may carry an image. Parse failures (no anchor, no
# <img>) are swallowed and leave @img_url nil, making download a no-op.
class TumblrImagePost < TumblrPost
  def initialize(post)
    @img_url = nil
    begin
      super(post)
      @img_url = post.search("img").first.attr('src')
    rescue StandardError
      # Anything without the expected anchor/img structure lands here.
      puts "Not an image file!"
    end
  end

  # Saves the image into destination_dir as <tumblr_id>_<post_id>.jpg.
  # NOTE(review): the actual fetch is stubbed out (wget line commented),
  # so this currently only logs; extension is assumed to be .jpg.
  def download(destination_dir=Dir.pwd)
    return if @img_url.nil?

    out_fname = File.join(destination_dir, "#{@tumblr_id}_#{@post_id}.jpg")
    # %x[wget #{@img_url} -O #{out_fname}]
    puts "Downloaded #{@img_url} as #{out_fname}."
  end
end
# Scraper for Tumblr images
# Walks /page/1/, /page/2/, ... (newest first) downloading image posts
# until it hits a post id at or below the given limit, or runs out of pages.
class ImageScraper
  def initialize(tumblr_id)
    @url = "http://#{tumblr_id}.tumblr.com"
    @agent = Mechanize.new
  end

  # @param [Int] until - post ID that we will stop scraping at - note that post ID for a tumblr user monotonically increase with time
  def scrape(limit, download_dir=Dir.pwd)
    page_num = 1
    loop do
      page = @agent.get(@url + "/page/#{page_num}/")
      posts = page.search(".post").map { |node| TumblrImagePost.new(node) }

      hit_limit = false
      posts.each do |post|
        if post.post_id <= limit
          hit_limit = true
          puts "Limit reached at post #{post.post_id} <= limit #{limit}. Aborting scraper..."
          break
        end
        post.download(download_dir)
      end
      break if hit_limit

      if posts.empty?
        # no more pages left
        puts "Reached end of archive. Aborting scraper..."
        break
      end

      # all image posts in this page downloaded, going backwards in history
      page_num += 1
    end
  end
end
end
# CLI entry point.
# Usage: ruby tumblr-scraper.rb [tumblr_id] [stop_post_id]
# Previously both values were hard-coded placeholders, so the script could
# never scrape a real blog; now they come from ARGV with the old defaults.
if __FILE__ == $0
  tumblr_id = ARGV[0] || "tumblr-id-here"
  stop_post_id = (ARGV[1] || 1).to_i # scrape everything newer than this id
  scraper = TumblrScraper::ImageScraper.new(tumblr_id)
  scraper.scrape(stop_post_id)
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment