require 'uri'
require 'nokogiri'
require 'httparty'

class Crawler
  attr_reader :uri

  def initialize(hospital_website)
    @uri = URI.parse(hospital_website)
  end

  def find_docs
    host = uri.host
    scheme = uri.scheme
    to_crawl = [uri]
    crawled = {}
    documents = {}
    start_time = Time.now
    max_crawl_time = 300 # 5 minutes
    rules = "max crawl time #{max_crawl_time}"

    until to_crawl.count.zero? || (Time.now - start_time > max_crawl_time)
      crawling = to_crawl.shift
      next if crawled[crawling.path]

      puts "crawling: #{crawling}"
      begin
        response = HTTParty.get(crawling.to_s)
        next unless response.success?
        doc = Nokogiri::HTML(response.body)
      rescue StandardError
        puts "failed to crawl #{crawling}"
        crawled[crawling.path] = true
        next
      end

      doc.css('a').each do |found_link|
        begin
          # meta rules
          # 1. only follow links on the same host (treating a leading "www." as equivalent)
          # 2. remove everything after "?" or "#"
          # 3. skip media files: .pdf .png .jpg .jpeg .mp4 .mp3 .mov
          href = found_link.attributes["href"]&.value
          next if href.nil? || href.empty?

          found_uri = URI.parse(href)
          found_uri.host = host if found_uri.host.nil?
          found_uri.scheme = scheme if found_uri.scheme.nil?
          # rule 2: drop the query string and fragment
          found_uri.query = nil
          found_uri.fragment = nil

          if found_uri.path.match?(/(\.csv|\.xlsx)$/)
            documents[found_uri.to_s] = { text: found_link.text, page: crawling.to_s }
            next
          end

          # rule 1: stay on the same host, ignoring a leading "www." on either side
          next if found_uri.host.sub(/\Awww\./, '') != host.sub(/\Awww\./, '')

          # rule 3: skip links to media files
          next if found_uri.path.match?(/(\.pdf|\.png|\.jpg|\.jpeg|\.mp4|\.mp3|\.mov)$/)

          # the queue may hold duplicates; the crawled check at dequeue skips them
          to_crawl << found_uri unless crawled[found_uri.path]
        rescue StandardError => e
          puts e
          puts "found uri #{found_uri}"
          puts "error with href #{found_link}"
        end
      end

      crawled[crawling.path] = true
      puts "crawled #{crawling}"
    end

    { documents: documents, crawled: crawled, rules: rules }
  end
end
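
# A minimal usage sketch, assuming this file is run directly as a script.
# Not part of the original gist; the URL below is a hypothetical placeholder.
if __FILE__ == $PROGRAM_NAME
  crawler = Crawler.new("https://www.example-hospital.org")
  result = crawler.find_docs

  puts "found #{result[:documents].size} documents across #{result[:crawled].size} pages"
  result[:documents].each do |url, meta|
    puts "#{url} (link text: #{meta[:text].strip.inspect}, found on #{meta[:page]})"
  end
end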