require 'uri'
require 'nokogiri'
require 'httparty'

class Crawler
  attr_reader :uri

  def initialize(hospital_website)
    @uri = URI.parse(hospital_website)
  end

  def find_docs
    host = uri.host
    scheme = uri.scheme
    to_crawl = [uri]
    crawled = {}
    documents = {}
    start_time = Time.now
    max_crawl_time = 300 # 5 minutes
    rules = "max crawl time #{max_crawl_time}"

    until to_crawl.count.zero? || (Time.now - start_time > max_crawl_time)
      crawling = to_crawl.shift
      next if crawled[crawling.path]

      puts "crawling: #{crawling}"
      begin
        response = HTTParty.get(crawling.to_s)
        next unless response.success?
        doc = Nokogiri::HTML(response.body)
      rescue StandardError
        puts "failed to crawl #{crawling}"
        crawled[crawling.path] = true
        next
      end

      doc.css('a').each do |found_link|
        begin
          # meta rules
          # 1. only follow links on the same host (treating a leading "www." as equivalent)
          # 2. remove everything after "?" or "#"
          # 3. skip media files: .pdf .png .jpg .jpeg .mp4 .mp3 .mov
          href = found_link.attributes["href"]&.value
          next if href.nil? || href.empty?

          found_uri = URI.parse(href)
          found_uri.host = host if found_uri.host.nil?
          found_uri.scheme = scheme if found_uri.scheme.nil?
          # rule 2: drop the query string and fragment
          found_uri.query = nil
          found_uri.fragment = nil

          if found_uri.path.match?(/(\.csv|\.xlsx)$/)
            documents[found_uri.to_s] = { text: found_link.text, page: crawling.to_s }
            next
          end

          # rule 1: stay on the same host, ignoring a leading "www." on either side
          next if found_uri.host.sub(/\Awww\./, '') != host.sub(/\Awww\./, '')

          # rule 3: skip links to media files
          next if found_uri.path.match?(/(\.pdf|\.png|\.jpg|\.jpeg|\.mp4|\.mp3|\.mov)$/)

          # the queue may hold duplicates; the crawled check at dequeue skips them
          to_crawl << found_uri unless crawled[found_uri.path]
        rescue StandardError => e
          puts e
          puts "found uri #{found_uri}"
          puts "error with href #{found_link}"
        end
      end

      crawled[crawling.path] = true
      puts "crawled #{crawling}"
    end

    { documents: documents, crawled: crawled, rules: rules }
  end
end
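
# A minimal usage sketch, assuming this file is run directly as a script.
# Not part of the original gist; the URL below is a hypothetical placeholder.
if __FILE__ == $PROGRAM_NAME
  crawler = Crawler.new("https://www.example-hospital.org")
  result = crawler.find_docs

  puts "found #{result[:documents].size} documents across #{result[:crawled].size} pages"
  result[:documents].each do |url, meta|
    puts "#{url} (link text: #{meta[:text].strip.inspect}, found on #{meta[:page]})"
  end
end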