Skip to content

Instantly share code, notes, and snippets.

@alexshapalov
Created July 3, 2024 05:23
Show Gist options
  • Save alexshapalov/870ef80c204da321b7267bcef1d01178 to your computer and use it in GitHub Desktop.
General Scraper
class ScraperDSL
  # Configures and runs a scrape of a single job site. Selectors live in a
  # ScraperStructure; results are persisted through the ScraperJob model.
  attr_accessor :name
  # Plain writers so callers can also assign directly
  # (e.g. scraper.structure = prebuilt_structure).
  attr_writer :home_url, :jobs_url, :structure

  def initialize(name)
    @name = name
    @structure = ScraperStructure.new
  end

  # DSL setter/reader: with an argument it stores the site's base URL,
  # without one it returns it. (Previously the attr_accessor reader was
  # shadowed by the one-argument DSL method, making the value unreadable.)
  def home_url(url = nil)
    @home_url = url unless url.nil?
    @home_url
  end

  # DSL setter/reader for the job-listing index URL (see home_url).
  def jobs_url(url = nil)
    @jobs_url = url unless url.nil?
    @jobs_url
  end

  # With a block: evaluate it against the ScraperStructure so the selector
  # DSL (title, company, ...) applies. Without a block: return the structure.
  # (Previously calling this without a block raised via instance_eval(&nil).)
  def structure(&block)
    @structure.instance_eval(&block) if block
    @structure
  end

  # Full scrape cycle: fetch currently listed jobs, mark anything we have in
  # the DB but no longer see on the site as expired, then reindex.
  def scrape
    known_ids = ScraperJob.where(site_name: @name).pluck(:unique_job_ref_id)
    active_ids = fetch_active_jobs_from_site
    update_inactive_jobs(known_ids, active_ids)
    ScraperJob.reindex
  end

  private

  # Downloads the listing page and processes each job link found via the
  # configured CSS class. Returns the unique ref ids of jobs seen; on a
  # listing-level error, logs and returns whatever was collected so far.
  def fetch_active_jobs_from_site
    active_jobs = []
    response = HTTParty.get(@jobs_url, timeout: 3)
    doc = Nokogiri::HTML(response.body)
    doc.css(@structure.html_job_link_class).each do |link|
      job_url = URI.join(@home_url, link["href"]).to_s
      process_job_link(job_url, link, active_jobs)
    end
    active_jobs
  rescue => e
    log_error(@jobs_url, e.message)
    active_jobs
  end

  # Fetches one job page, creates/refreshes its ScraperJob record, and
  # records its ref id in +active_jobs+. Per-job errors are logged so one
  # bad page does not abort the whole listing.
  def process_job_link(job_url, link, active_jobs)
    job_response = HTTParty.get(job_url, timeout: 3)
    job_doc = Nokogiri::HTML(job_response.body)
    # The href (not the absolute URL) is hashed so the id stays stable even
    # if the site's base URL changes.
    unique_job_ref_id = Digest::SHA256.hexdigest(link["href"])
    job = find_or_initialize_job(unique_job_ref_id)
    if job.new_record? || job.expired?
      update_job_attributes(job, job_doc, unique_job_ref_id)
      job.save
    end
    active_jobs << unique_job_ref_id
  rescue => e
    log_error(job_url, e.message)
  end

  def find_or_initialize_job(unique_job_ref_id)
    ScraperJob.find_or_initialize_by(unique_job_ref_id: unique_job_ref_id)
  end

  # Populates the record from the configured selectors. Missing selectors or
  # unmatched nodes simply yield nil attributes.
  def update_job_attributes(job, job_doc, unique_job_ref_id)
    job.attributes = {
      title: extract_text(job_doc, @structure.html_title),
      company_name: extract_company_name(job_doc),
      body: extract_text(job_doc, @structure.html_description),
      # Truncated to 30 chars — presumably a column limit; confirm in schema.
      city: extract_text(job_doc, @structure.html_city)&.[](0, 30),
      state: extract_text(job_doc, @structure.html_state),
      country: extract_country_name(job_doc),
      compensation: extract_text(job_doc, @structure.html_compensation),
      site_name: @name,
      unique_job_ref_id: unique_job_ref_id,
      expired: false
    }
  end

  # Stripped text of the first node matching +xpath+, or nil when the xpath
  # is unset or matches nothing. Evaluates the xpath once (the old version
  # ran at_xpath twice per call and raised on a nil xpath).
  def extract_text(doc, xpath)
    return if xpath.nil?
    doc.at_xpath(xpath)&.text&.strip
  end

  # Expects headings like "Jobs at Acme". Splits on the standalone word "at"
  # only — the old bare split("at") matched inside words (e.g. "Datadog"
  # became "adog").
  def extract_company_name(doc)
    extract_text(doc, @structure.html_company)&.split(/\s+at\s+/)&.last&.strip
  end

  # Expects "City - Country" style text. NOTE(review): a bare "-" split
  # breaks hyphenated names ("Guinea-Bissau") — confirm the source format.
  def extract_country_name(doc)
    extract_text(doc, @structure.html_country)&.split("-")&.last&.strip
  end

  # Every job we knew about but did not see this run is marked expired.
  def update_inactive_jobs(all_jobs_from_db, active_jobs_from_site)
    inactive_jobs = all_jobs_from_db - active_jobs_from_site
    ScraperJob.where(unique_job_ref_id: inactive_jobs, site_name: @name).update_all(expired: true)
  end

  # Minimal error sink; swap for a real logger in production.
  def log_error(url, message)
    puts "#{url}: #{message}"
  end
end
class ScraperStructure
  # Value object holding the CSS/XPath selectors a ScraperDSL uses to pull
  # fields out of a site's listing and job pages.
  attr_accessor :html_job_link_class, :html_title, :html_company, :html_description,
                :html_city, :html_state, :html_country, :html_compensation

  # Each DSL verb simply stores its selector into the matching ivar, so the
  # setters are generated from a name => ivar table instead of written out
  # one by one.
  {
    job_link_class: :@html_job_link_class,
    title: :@html_title,
    company: :@html_company,
    description: :@html_description,
    city: :@html_city,
    state: :@html_state,
    country: :@html_country,
    compensation: :@html_compensation
  }.each do |dsl_name, ivar|
    define_method(dsl_name) { |selector| instance_variable_set(ivar, selector) }
  end
end
# Build a ScraperDSL for +name+, configure it by instance-evaluating the
# given block (so home_url/jobs_url/structure read as bare DSL calls), then
# immediately run the scrape.
def define_scraper(name, &config)
  ScraperDSL.new(name).tap { |scraper| scraper.instance_eval(&config) }.scrape
end
# Assuming you have a Structure model in your database:
# run a full scrape for every active Structure row, copying its stored
# selector columns into a ScraperStructure via the DSL setters.
Structure.where(active: true).find_each do |site|
  selectors = ScraperStructure.new
  {
    job_link_class: site.html_job_link_class,
    title: site.html_title,
    company: site.html_company,
    description: site.html_description,
    city: site.html_city,
    state: site.html_state,
    country: site.html_country,
    compensation: site.html_compensation
  }.each { |setter, value| selectors.public_send(setter, value) }

  ScraperDSL.new(site.site_name).tap do |scraper|
    scraper.home_url(site.home_url)
    scraper.jobs_url(site.jobs_url)
    scraper.structure = selectors
  end.scrape
end
# or DSL style for one site
# Example: declaratively configure and immediately run a scraper for a single
# site. The outer block is instance_eval'd against a ScraperDSL, the inner
# structure block against its ScraperStructure (selectors defined above).
define_scraper "GreenHouse" do
home_url "https://www.greenhouse.io"
jobs_url "https://www.greenhouse.io/jobs"
structure do
# CSS class used to find job links on the listing page.
job_link_class ".job-link"
# XPath selectors applied to each individual job page.
title "//h1[@class='title']"
company "//div[@class='company']"
description "//div[@class='description']"
city "//span[@class='city']"
state "//span[@class='state']"
country "//span[@class='country']"
compensation "//span[@class='compensation']"
end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment