Skip to content

Instantly share code, notes, and snippets.

@boctor
Created March 18, 2011 08:29
Show Gist options
  • Save boctor/875774 to your computer and use it in GitHub Desktop.
Save boctor/875774 to your computer and use it in GitHub Desktop.
This Ruby script uses Hpricot to scrape the Freshly Pressed pages on WordPress.com and then stores the results as JSON on S3. Here is the blog post related to this Gist: http://idevrecipes.com/?p=260
#!/usr/bin/ruby -rubygems
require 'open-uri'
require 'hpricot'
require 'aws/s3'
require 'yaml'
require 'json'
# Uploads +data+ to Amazon S3 under +key+.
#
# Settings are read from amazon_s3.yml, located in the same directory as this
# script, and are looked up with symbol keys (:access_key_id,
# :secret_access_key, and the +bucket+ key passed in).
#
# key     - object key to store the data under.
# data    - content to upload.
# bucket  - key in amazon_s3.yml whose value is used as the S3 bucket name.
# options - options passed straight through to AWS::S3::S3Object.store.
#
# Returns whatever AWS::S3::S3Object.store returns.
def save_in_s3(key, data, bucket, options)
  settings_path = File.join(File.dirname(__FILE__), "amazon_s3.yml")
  # File.read opens, reads and closes the file in one call, so no file
  # handle is left open after the settings have been parsed.
  amazon_s3_settings = YAML.load(File.read(settings_path))
  AWS::S3::Base.establish_connection!(
    :access_key_id => amazon_s3_settings[:access_key_id],
    :secret_access_key => amazon_s3_settings[:secret_access_key]
  )
  AWS::S3::S3Object.store(key, data, amazon_s3_settings[bucket], options)
end
# Scrapes pages 1..num_pages of the WordPress.com "editorpicks" listing and
# stores each page's picks as JSON in S3 under freshlypressed/<page>.json.
# Every stored page except the last one ends with a {:next_page => n} entry
# pointing at the following page.
num_pages = 10
# Factor used to scale the vertical background offset when the 223px wide
# image is swapped for the 320px wide version.
scale_increase = 1.43497757847534 # 320.0/223.0

(1..num_pages).each do |page|
  picks = []
  # NOTE(review): fetching a URL through Kernel#open relies on open-uri
  # patching it; newer Rubies appear to require URI.open instead - confirm
  # against the Ruby version this script is run with.
  doc = Hpricot(open("http://wordpress.com/?load=editorpicks&fp=#{page}"))
  doc.search('.pick').each do |pick_element|
    # Sponsored posts are being skipped until I can figure out a way to get a 320px wide version
    # of the images used for Sponsored posts
    next if pick_element.inner_html.include? 'Sponsored Post'

    pick = {}
    pick[:url] = pick_element.at('a')['href']
    pick[:title] = pick_element.at('.posttitle').inner_text
    subtitle = pick_element.at('small')
    pick[:subtitle] = subtitle.inner_text if subtitle

    picture = pick_element.at('.picture')
    if picture
      # Turn the inline style ("prop: value; prop: value") into a hash of
      # property => value. Later declarations override earlier ones.
      style_hash = {}
      picture['style'].split(';').each do |declaration|
        property, value = declaration.split(':', 2)
        # Fragments without a colon (e.g. whitespace after a trailing ';')
        # carry no property/value pair, so they are skipped.
        next unless value
        style_hash[property.strip] = value.strip
      end

      background_image = style_hash["background-image"]
      image_url = background_image.match(/url\('([^']+)'\)/)[1]
      # Request the 320px wide rendition instead of the 223px one.
      pick[:image] = image_url.gsub('w=223', 'w=320')
      # The second component of background-position is the vertical offset
      # in px; scale it to match the wider image.
      pick[:y_offset] = style_hash["background-position"].split[1].sub('px','').to_f * scale_increase
    else
      pick[:image] = pick_element.at('img')['src']
    end
    picks << pick
  end

  # Pages run from 1 to num_pages inclusive, so only the final page
  # (page == num_pages) has no successor to link to.
  picks << {:next_page => page + 1} unless page == num_pages
  save_in_s3("freshlypressed/#{page}.json", picks.to_json, :wordpress_bucket, {:access => 'public-read'})
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment