-
-
Save duonoid/6758038 to your computer and use it in GitHub Desktop.
I played with it a little, attempting to understand their site...
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#
# usage: ruby -Imechanize extract_facilities.rb
require "mechanize"
# Walks the paginated facility search results and prints one tab-separated
# line (name, facility type, address) per facility to standard output.
#
# Each instance is responsible for exactly one result page:
# * without a +page_number+ it submits the initial search form;
# * with a +page_number+ it re-submits the form extracted from
#   +last_page_contents+, asking for that page.
class FacilityScraper
  # Convenience entry point: scrape starting from the first page.
  #
  # @return [nil]
  def self.call
    new.call
  end

  # HTTP agent shared by all instances of this class; built by
  # MechanizeFactory on first use and memoised afterwards.
  def self.mechanize
    @mechanize ||= MechanizeFactory.()
  end

  attr_accessor :page_number, :last_page_contents

  # @param last_page_contents [Object, nil] previously fetched page; its form
  #   is handed to Eats::Facilities::ListPage to request +page_number+
  # @param page_number [Object, nil] pager argument of the page to fetch;
  #   nil means "submit the initial search"
  def initialize(last_page_contents = nil, page_number = nil)
    @last_page_contents = last_page_contents
    @page_number = page_number
  end

  # we may need these later
  # form.add_field!("__EVENTTARGET",'ctl00$ContentPlaceHolder1$grdHealth')
  # form.add_field!("__EVENTARGUMENT", 'Page$2')

  # Prints this page and every page that follows it.
  #
  # Pages are visited in a loop rather than by recursion, so the call stack
  # stays flat no matter how many result pages there are.
  #
  # @return [nil]
  def call
    scraper = self
    scraper = scraper.step while scraper
    nil
  end

  protected

  # Prints the rows of this page and builds the scraper for the next one.
  # Protected (not private) because #call invokes it on sibling instances.
  #
  # @return [FacilityScraper, nil] scraper for the next page, or nil when the
  #   parser reports no next page
  def step
    print_rows
    upcoming = next_page_number
    return unless upcoming

    throttle_limit
    self.class.new(page_contents, upcoming)
  end

  private

  # Writes one line per parsed row; rows whose three fields are all blank
  # are skipped.
  def print_rows
    Eats::Facilities::Parser::Results.(page_contents).each do |row|
      # The label stays ".call" so debug output matches the public entry point.
      $stderr.puts "#{self.class}.call #{row.inspect}" if $DEBUG
      line = "#{row[:name]}\t#{row[:facility_type]}\t#{row[:address]}"
      puts line unless line.strip.empty?
    end
  end

  def next_page_number
    Eats::Facilities::Parser::NextPage.(page_contents)
  end

  def extracted_form
    Eats::Html::FormExtractor.(last_page_contents)
  end

  # Fetches this instance's page on first access and memoises it.
  def page_contents
    @page_contents ||= page_number ? fetch_listed_page : fetch_first_page
  end

  def fetch_listed_page
    $stderr.puts "pulling page: #{page_number}..." if $VERBOSE
    Eats::Facilities::ListPage.(extracted_form, page_number)
  end

  def fetch_first_page
    $stderr.puts "pulling first page..." if $VERBOSE
    Eats::Search::Form.(mechanize)
  end

  # Pause between page requests.
  def throttle_limit
    sleep 3
  end

  def mechanize
    self.class.mechanize
  end
end
# Namespace for the code that talks to, and parses pages from, the site at
# http://eats.washoecounty.us/.
module Eats
  # Page-level helpers.
  module Html
    # Fetches the landing page, e.g. http://eats.washoecounty.us/
    class FirstPage
      BASE_URL = "http://eats.washoecounty.us/".freeze

      # @param mechanize [#get] agent used to perform the request
      # @return [Object] whatever +mechanize.get+ returns for BASE_URL
      def self.call(mechanize)
        mechanize.get(BASE_URL)
      end
    end

    # Pulls the main form out of a fetched page.
    class FormExtractor
      # Handed unchanged to +page_contents.form+.
      # NOTE(review): how the page object matches this value (form name vs.
      # form id) is decided by the page object, not here - confirm.
      FORM_ID = "aspnetForm".freeze

      # @param page_contents [#form] a fetched page
      # @return [Object] result of +page_contents.form(FORM_ID)+
      def self.call(page_contents)
        page_contents.form(FORM_ID)
      end
    end
  end

  # Building and submitting the search form.
  module Search
    # Loads the first page, fills in the search filters and submits the form.
    class Form
      # @param mechanize [#get] agent used to load the first page
      # @return [Object] whatever submitting the form returns
      def self.call(mechanize)
        new(mechanize).call
      end

      attr_reader :mechanize

      def initialize(mechanize)
        @mechanize = mechanize
      end

      def call
        form.submit
      end

      private

      # The first page's form with the Search and City filters applied;
      # built once and memoised.
      def form
        @form ||= Html::FormExtractor.(page_contents).tap do |extracted|
          Filter::Search.(extracted)
          Filter::City.(extracted)
        end
      end

      def page_contents
        @page_contents ||= Html::FirstPage.(mechanize)
      end
    end

    # Small callables that each set one or two fields on a form.
    # A form only needs to respond to +[]=+.
    module Filter
      # Sets the search button field.
      class Search
        SEARCH_BUTTON_ID = "ctl00$ContentPlaceHolder1$btnSearch".freeze
        SEARCH_VALUE = "Search".freeze

        def self.call(form)
          form[SEARCH_BUTTON_ID] = SEARCH_VALUE
        end
      end

      # Sets the city field; defaults to "RENO".
      class City
        CITY_FIELD_ID = "ctl00$ContentPlaceHolder1$txtCity".freeze

        def self.call(form, city = "RENO")
          form[CITY_FIELD_ID] = city
        end
      end

      # Sets the facility-type field; defaults to "Snackbar".
      class FacilityType
        ID = "ctl00$ContentPlaceHolder1$txtFacilityType".freeze

        def self.call(form, facility_type = "Snackbar")
          form[ID] = facility_type
        end
      end

      # Sets the two postback fields that select a results page.
      class PageNumber
        # @param page_number [#to_s] value stored in +__EVENTARGUMENT+
        def self.call(form, page_number)
          form["__EVENTTARGET"] = "ctl00$ContentPlaceHolder1$grdHealth"
          form["__EVENTARGUMENT"] = page_number.to_s
        end
      end
    end
  end

  # Fetching and parsing the facility result pages.
  module Facilities
    # Requests a given results page by re-submitting a form.
    class ListPage
      # @param form [#[]=, #submit] form taken from a previously fetched page
      # @param page_number [#to_s] pager argument of the wanted page
      # @return [Object] whatever +form.submit+ returns
      def self.call(form, page_number)
        Search::Filter::PageNumber.(form, page_number)
        Search::Filter::City.(form)
        form.submit
      end
    end

    module Parser
      # Converts the rows of the results table into hashes.
      class Results
        # Keys assigned to a row's cells, by cell position.
        FACILITY_COLS = %w[ link name score facility_type address
                            inspection_date ].freeze

        # @param page_contents [#search] a fetched results page
        # @return [Array<Hash{Symbol => String}>] one hash per kept row
        def self.call(page_contents)
          new(page_contents).call
        end

        attr_reader :page_contents

        def initialize(page_contents)
          @page_contents = page_contents
        end

        # Rows with align="center" are skipped (presumably the pager row -
        # confirm against the site). A row without any <td> cell yields an
        # empty hash.
        def call
          results.xpath('tr').map do |tr|
            next if tr[:align] == "center"

            row_for(tr)
          end.compact
        end

        private

        # Builds the hash for one table row. Cells beyond the known columns
        # are ignored rather than raising on a missing column name.
        def row_for(tr)
          row = {}
          tr.xpath('td').each_with_index do |td, index|
            column = FACILITY_COLS[index]
            next unless column

            row[column.to_sym] = index.zero? ? link_in(td) : td.text.to_s.strip
          end
          row
        end

        # href of the first <a> inside the cell, or "" when the cell has none.
        def link_in(td)
          anchor = td.at_css("a")
          anchor ? anchor[:href].to_s.strip : ""
        end

        def results
          ResultsTable.(page_contents)
        end
      end

      # Locates the results table on a page.
      class ResultsTable
        RESULTS_TABLE_ID = "table#ctl00_ContentPlaceHolder1_grdHealth".freeze

        def self.call(page_contents)
          page_contents.search(RESULTS_TABLE_ID)
        end
      end

      # Finds the pager argument of the page that follows the current one.
      class NextPage
        # Link in the cell right after the pager cell that contains a <span>
        # (presumably the current-page marker - confirm against the site).
        NEXT_LINK_XPATH =
          '//td/table/tr/td[span]/following-sibling::td[1]/a[@href]'.freeze
        # The whole token (e.g. "Page$2") is returned, not only its digits.
        PAGE_ARGUMENT = /Page\$\d+/

        # @param page_contents [#search] a fetched results page
        # @return [String, nil] e.g. "Page$2"; nil when there is no following
        #   link or its href carries no page token
        def self.call(page_contents)
          new(page_contents).call
        end

        attr_reader :page_contents

        def initialize(page_contents)
          @page_contents = page_contents
        end

        def call
          link = next_page
          return unless link

          # String#[] with a Regexp yields nil when nothing matches.
          link["href"].to_s[PAGE_ARGUMENT]
        end

        private

        # First matching link node, or nil. Taking +first+ matters: the node
        # set returned by +search+ is truthy even when it is empty.
        def next_page
          results.search(NEXT_LINK_XPATH).first
        end

        def results
          ResultsTable.(page_contents)
        end
      end
    end
  end
end
# Builds the Mechanize agent used by the scraper, with its connection and
# user-agent settings applied.
class MechanizeFactory
  # @return [Mechanize] a newly created, configured agent
  def self.call
    new.call
  end

  def call
    setup_agent(Mechanize.new)
  end

  protected

  # Applies the scraper's settings to +mechanize+ and returns it.
  #
  # @param mechanize [Mechanize] agent to configure (mutated in place)
  # @return [Mechanize] the same object
  def setup_agent(mechanize)
    agent = mechanize.agent
    agent.keep_alive = true
    agent.read_timeout = 60
    agent.retry_change_requests = true
    # Low-level HTTP tracing only when Ruby runs in debug mode.
    agent.http.debug_output = $stderr if $DEBUG
    mechanize.user_agent_alias = 'Mac Safari'
    mechanize
  end
end
# Script entry point: announce the run when Ruby is in verbose mode, then
# scrape every result page starting from the first.
STDERR.puts ">> running scraper..." if $VERBOSE
FacilityScraper.()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment