Created
February 5, 2011 19:59
-
-
Save june29/812737 to your computer and use it in GitHub Desktop.
tumblr-scraper.rb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require "rubygems"
require "cgi"
require "mechanize"
require "nokogiri"
module Tumblr | |
class Scraper | |
# Creates a scraper bound to one Tumblr account and logs in right away.
#
# @param email [String] value written into the login form's "email" field
# @param password [String] value written into the login form's "password" field
# @param trim_reblog_info [Boolean] stored for #reblog, which checks it
#   before rewriting the reblog form
def initialize(email, password, trim_reblog_info = true)
  @email, @password = email, password
  @trim_reblog_info = trim_reblog_info
  # Cap the agent's page history at a single entry.
  @agent = Mechanize.new.tap { |agent| agent.max_history = 1 }
  login
end
# Signs in by filling and submitting the login form.
#
# The form is taken by position: it is the second form (index 1) on the
# login page. The "email" and "password" fields are looked up by name and
# set from the credentials given to the constructor.
#
# @return the result of submitting the form
def login
  form = @agent.get("http://www.tumblr.com/login").forms[1]
  { "email" => @email, "password" => @password }.each do |field_name, value|
    form.fields.find { |field| field.name == field_name }.value = value
  end
  form.submit
end
# Scrapes one page of the signed-in user's dashboard.
#
# Entries whose class attribute mentions new_post, video or audio are
# skipped. Of the remaining entries, those classed regular, photo, quote,
# link or conversation become Post objects (the first matching type wins);
# any other type is ignored. An entry that raises while being parsed is
# reported on stdout and skipped, so one malformed entry does not abort the
# whole page.
#
# NOTE(review): permalink and the photo body are stored exactly as
# NodeSet#attr returns them (no to_s), unlike the author URL -- confirm
# whether callers expect Strings here.
#
# @param page [Integer] dashboard page number interpolated into the URL
# @return [Array<Post>] the parsed posts, in document order
def dashboard(page = 1)
  doc = Nokogiri::HTML(@agent.get("http://www.tumblr.com/dashboard/#{page}/").body)
  entries = doc.xpath("//li[contains(concat(' ', normalize-space(@class), ' '), ' post ')]")

  posts = []
  # Entries without a post_info link reuse the author of the previous entry
  # (presumably the dashboard omits the link for consecutive posts by the
  # same blog -- verify against the live markup).
  prev_user = nil

  entries.each do |post|
    css_class = post.attributes["class"].to_s
    # 'video' and 'audio' are skipped for now because they are unavailable in mobile Safari.
    next if css_class =~ /new_post|video|audio/

    begin
      id = post.attributes["id"].to_s.sub(/^post/, "")
      permalink = post.xpath(".//a[@title='Permalink']").attr("href")

      author_link = post.xpath("./div[@class='post_info']/a").first
      if author_link
        avatar_style = post.xpath(".//a[@class='post_avatar']").attr("style").to_s
        # String#[] with a capture index yields nil when the style carries no
        # url('...'), so a missing avatar no longer discards the whole entry.
        avatar_url = avatar_style[/url\('(.+)'\)/, 1]
        user = User.new(author_link.text, author_link.attributes["href"].to_s, avatar_url)
      else
        user = prev_user
      end
      prev_user = user

      case css_class
      when /regular/
        title = post.xpath("./div[@class='post_title']").text.strip
        # Remove the direct <div> children so that what remains is the body.
        post.xpath("./div").remove
        body = post.xpath("./*|./text()").plain_html
        posts << Post.new(id, permalink, user, :regular, title, body, nil)
      when /photo/
        body = post.xpath("./div//img[@class='image']").attr("src")
        caption = post.xpath(".//div[@class='caption']/*|.//div[@class='caption']/text()").plain_html
        posts << Post.new(id, permalink, user, :photo, nil, body, caption)
      when /quote/
        body = post.xpath("./span[@class='quote']/*|./span[@class='quote']/text()").plain_html
        caption = post.xpath(".//td[@class='quote_source']/*|.//td[@class='quote_source']/text()").plain_html
        posts << Post.new(id, permalink, user, :quote, nil, body, caption)
      when /link/
        title = post.xpath(".//div[@class='post_title']/*|.//div[@class='post_title']/text()").plain_html
        # Only the title is extracted for link posts; body and caption stay nil.
        posts << Post.new(id, permalink, user, :link, title, nil, nil)
      when /conversation/
        title = post.xpath("./div[@class='post_title']").text.strip
        post.xpath("./div").remove
        body = post.xpath("./*|./text()").plain_html
        # Conversation posts carry no caption.
        posts << Post.new(id, permalink, user, :conversation, title, body, nil)
      end
    rescue => e
      puts "!!!!! #{e} in ID #{id} !!!!!"
    end
  end

  posts
end
# Reblogs the post found at +url+.
#
# Follows the post page's tumblr_controls iframe to find the "/reblog/..."
# link, opens that page and submits its second form (index 1). When the
# scraper was built with trim_reblog_info enabled, the attribution markup in
# the form is stripped before submitting.
#
# @param url [String] address of the post page to reblog
# @return the result of submitting the reblog form
def reblog(url)
  doc = Nokogiri::HTML(@agent.get(url).body)
  url_of_controls = doc.xpath("//iframe[@id='tumblr_controls']").attr("src")
  doc_of_controls = Nokogiri::HTML(@agent.get(url_of_controls).body)
  reblog_path = doc_of_controls.xpath("//a[starts-with(@href, '/reblog/')]").attr("href")
  page_of_reblog = @agent.get("http://www.tumblr.com" + reblog_path)
  form_of_reblog = page_of_reblog.forms[1]

  if @trim_reblog_info
    post_type = Nokogiri::HTML(page_of_reblog.body).xpath("//input[@name='post[type]']").attr("value")
    # NodeSet#attr hands back an attribute node, which never equals a String
    # literal in a `case`; convert it so the branches can actually match.
    trim_reblog_form(form_of_reblog, post_type.to_s)
  end

  form_of_reblog.submit
end

# Follows the blog at +url+ by submitting the second form (index 1) of the
# "following" page with its "follow_this" field set to +url+.
#
# @param url [String] address of the blog to follow
# @return the result of submitting the form
def follow(url)
  form = @agent.get("http://www.tumblr.com/following").forms[1]
  form.fields.find { |field| field.name == "follow_this" }.value = url
  form.submit
end

private

# Rewrites the form field that holds reblog attribution for +post_type+.
# Post types other than link, regular, photo, video and quote are left alone.
#
# @param form [#fields] form whose fields respond to name, value and value=
# @param post_type [String] value of the reblog page's post[type] input
def trim_reblog_form(form, post_type)
  case post_type
  when "link"
    rewrite_field(form, "post[three]") { |text| trim(text) }
  when "regular", "photo", "video"
    rewrite_field(form, "post[two]") { |text| trim(text) }
  when "quote"
    rewrite_field(form, "post[two]") { |text| text.gsub(/ \(via <a.*?<\/a>\)/, "") }
  end
end

# Replaces the value of the field called +name+ with the block's result.
# Does nothing when the field is absent or has no value.
def rewrite_field(form, name)
  field = form.fields.find { |candidate| candidate.name == name }
  return if field.nil? || field.value.nil?

  field.value = yield(field.value)
end

# Removes empty paragraphs and "<p><a ...>name</a>:</p>" attribution
# paragraphs, unwraps blockquotes and strips surrounding whitespace.
#
# Uses non-destructive gsub: gsub! returns nil when nothing was replaced,
# which would break the chain, and it would also modify the caller's string.
#
# @param str [String] HTML taken from a reblog form field
# @return [String] a new, trimmed string
def trim(str)
  without_attribution = str.gsub(/<p><\/p>/, "").gsub(/<p><a[^<]+<\/a>:<\/p>/, "")
  trim_quote(without_attribution).strip
end

# Replaces the span from the first <blockquote> to the last </blockquote>
# with its inner content, repeating on that content so nested quotes are
# unwrapped as well.
#
# @param str [String]
# @return [String] a new string; equal to +str+ when there is no blockquote
def trim_quote(str)
  # Under /m, "." already matches line breaks.
  str.sub(/<blockquote>(.+)<\/blockquote>/m) { trim_quote(Regexp.last_match(1)) }
end
end | |
# Read-only record describing the author of a dashboard post.
class User
  attr_reader :id, :url, :avatar_image_url

  # @param id the author's identifier
  # @param url the author's URL
  # @param avatar_image_url the author's avatar image URL
  def initialize(id, url, avatar_image_url)
    @id, @url, @avatar_image_url = id, url, avatar_image_url
  end

  # @return [Hash] the three attributes under String keys, in the order
  #   "id", "url", "avatar_image_url"
  def to_h
    {
      "id" => id,
      "url" => url,
      "avatar_image_url" => avatar_image_url
    }
  end
end
# Read-only record for one dashboard post.
class Post
  attr_reader :id, :permalink, :user, :post_type, :title, :body, :caption

  # @param id the post identifier
  # @param permalink the post's permalink
  # @param user the author; must respond to to_h for #to_h to work
  # @param post_type [Symbol] kind of post, e.g. :regular or :photo
  # @param title the title, or nil
  # @param body the body, or nil
  # @param caption the caption, or nil
  def initialize(id, permalink, user, post_type, title, body, caption)
    @id, @permalink, @user = id, permalink, user
    @post_type = post_type
    @title, @body, @caption = title, body, caption
  end

  # @return [Hash] every attribute under Symbol keys; the author is
  #   included as the result of its own to_h
  def to_h
    {
      :id => id,
      :permalink => permalink,
      :user => user.to_h,
      :post_type => post_type,
      :title => title,
      :body => body,
      :caption => caption
    }
  end
end
end | |
class Nokogiri::XML::Node
  # HTML elements that never take an end tag; only these are written in the
  # self-closing "<name />" form. Any other empty element gets an explicit
  # end tag, because an HTML parser reads "<div />" as an unclosed "<div>".
  PLAIN_HTML_VOID_ELEMENTS = %w[area base br col embed hr img input link meta param source track wbr].freeze

  # Serializes this node and its descendants as minimal HTML.
  #
  # * Comment nodes produce an empty string.
  # * Text nodes (including CDATA, a Text subclass) produce their content
  #   with every run of two or more whitespace characters collapsed to one
  #   space, HTML-escaped.
  # * Any other node produces a tag with its attributes (values
  #   HTML-escaped) wrapped around its serialized children.
  #
  # Escaping matters because the text and attribute values come from scraped
  # pages: emitted raw, a literal "<" or '"' would corrupt the markup.
  #
  # @return [String]
  def plain_html
    return "" if comment?
    return CGI.escapeHTML(text.gsub(/\s{2,}/, " ")) if is_a?(Nokogiri::XML::Text)

    attrs = attributes.map { |attr_name, attr|
      " #{attr_name}=\"#{CGI.escapeHTML(attr.value)}\""
    }.join("")

    if children.empty?
      if PLAIN_HTML_VOID_ELEMENTS.include?(name)
        "<#{name}#{attrs} />"
      else
        "<#{name}#{attrs}></#{name}>"
      end
    else
      "<#{name}#{attrs}>#{children.plain_html}</#{name}>"
    end
  end
end
class Nokogiri::XML::NodeSet
  # Serializes each node in the set with its own plain_html and concatenates
  # the pieces in set order, with no separator.
  #
  # @return [String] an empty string for an empty set
  def plain_html
    map { |node| node.plain_html }.join("")
  end
end
class Nokogiri::XML::Element
  # Adds no behavior of its own: passes straight through to the next
  # plain_html in the ancestor chain (presumably Nokogiri::XML::Node's,
  # assuming Element inherits from Node).
  #
  # @return [String]
  def plain_html
    super()
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment