@slicksammy
Created January 15, 2021 17:35
require 'uri'
require 'nokogiri'
require 'httparty'

class Crawler
  attr_reader :uri

  def initialize(hospital_website)
    @uri = URI.parse(hospital_website)
  end

  def find_docs
    host = uri.host
    scheme = uri.scheme
    to_crawl = [uri]
    crawled = {}
    documents = {}
    start_time = Time.now
    max_crawl_time = 300 # seconds (5 minutes)
    rules = "max crawl time #{max_crawl_time}"

    until to_crawl.empty? || (Time.now - start_time > max_crawl_time)
      crawling = to_crawl.shift
      next if crawled[crawling.path]

      puts "crawling: #{crawling}"
      begin
        response = HTTParty.get(crawling.to_s)
        next unless response.success?
        doc = Nokogiri::HTML(response.body)
      rescue StandardError
        puts "failed to crawl #{crawling}"
        crawled[crawling.path] = true
        next
      end

      doc.css('a').each do |found_link|
        begin
          # Link rules:
          # 1. Only follow links on the same host (treating "www.example.com"
          #    and "example.com" as the same site).
          # 2. Record links ending in .csv or .xlsx as documents instead of crawling them.
          # 3. Skip binary assets (.pdf, images, audio, video).
          href = found_link.attributes["href"]
          next if href.nil?

          found_uri = URI.parse(href.value)
          # Resolve relative links against the site being crawled.
          found_uri.host = host if found_uri.host.nil?
          found_uri.scheme = scheme if found_uri.scheme.nil?

          if found_uri.path.match?(/(\.csv|\.xlsx)$/)
            documents[found_uri.to_s] = { text: found_link.text, page: crawling.to_s }
            next
          end

          # Stay on the original site, ignoring a leading "www." on either host.
          if found_uri.host.start_with?('www.') && host.start_with?('www.')
            next if found_uri.host != host
          elsif found_uri.host.start_with?('www.')
            next if found_uri.host[4..-1] != host
          elsif host.start_with?('www.')
            next if found_uri.host != host[4..-1]
          else # neither starts with "www."
            next if found_uri.host != host
          end

          next if found_uri.path.match?(/(\.pdf|\.png|\.jpg|\.jpeg|\.mp4|\.mp3|\.mov)$/)

          to_crawl << found_uri unless crawled[found_uri.path]
        rescue StandardError => e
          puts e
          puts "found uri #{found_uri}"
          puts "error with href #{found_link}"
        end
      end

      crawled[crawling.path] = true
      puts "crawled #{crawling}"
    end

    { documents: documents, crawled: crawled, rules: rules }
  end
end
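
A minimal usage sketch. The hospital URL below is hypothetical; any absolute http(s) URL should work, since find_docs seeds its same-host checks from uri.host and uri.scheme:

# Hypothetical example site; Crawler expects an absolute URL with a scheme.
crawler = Crawler.new("https://www.example-hospital.org")
result = crawler.find_docs

result[:documents].each do |url, meta|
  puts "#{url} (linked as #{meta[:text].inspect} on #{meta[:page]})"
end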
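
The four-way "www." comparison in find_docs could be collapsed by normalizing both hosts first; a sketch of that idea (not part of the original gist, the helper name is made up):

# Sketch: hosts compare equal after stripping one leading "www.".
# Behaviorally equivalent to the branchy comparison in find_docs.
def same_site?(a, b)
  a.to_s.sub(/\Awww\./, '') == b.to_s.sub(/\Awww\./, '')
end

same_site?('www.example-hospital.org', 'example-hospital.org')    # => true
same_site?('portal.example-hospital.org', 'example-hospital.org') # => false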