Skip to content

Instantly share code, notes, and snippets.

@rhizome
Created July 31, 2012 04:32
Show Gist options
  • Save rhizome/3213628 to your computer and use it in GitHub Desktop.
Save rhizome/3213628 to your computer and use it in GitHub Desktop.
Retrieve a list of all URLs within a site's pages
#!/usr/bin/env ruby
# geturls.rb - crawl a site with Anemone and print every page URL visited,
# together with how many times it was seen. With --links, also print every
# href found in the crawled pages' <a> tags with its occurrence count.
#
# Usage: geturls.rb [options] URL
require 'anemone'
require 'pp'
require 'optparse'
require 'ostruct' # OpenStruct is used below; do not rely on Anemone loading it

# Prints each "key: count" pair of +tallies+, one per line.
#
# @param tallies [Hash{String => Integer}] occurrence counts keyed by URL/href
# @param sortby [Symbol, nil] :url orders by key; anything else (including
#   nil, i.e. option not given) orders by ascending count
def print_tallies(tallies, sortby)
  ordered = tallies.sort_by { |url, count| sortby == :url ? url : count }
  ordered.each { |url, count| puts "#{url}: #{count}" }
end

options = OpenStruct.new
parser = OptionParser.new do |o|
  o.banner = "geturls.rb [options] URL"
  o.on("-s", "--sortby [url|count]", [:url, :count], "Sort by URL or count") do |s|
    options.sortby = s
  end
  o.on("-l", "--[no-]links", "Include links within pages") do |l|
    options.links = l
  end
  o.on_tail("-h", "--help", "Show this message") do
    puts o
    exit
  end
end
parser.parse!

# Whatever remains in ARGV after option parsing is the list of start URLs.
if ARGV.empty?
  warn parser
  exit 1
end
options.url = ARGV

page_counts = Hash.new(0) # crawled page URL => times visited
link_counts = Hash.new(0) # href attribute value => times it appeared in an <a>

Anemone.crawl(options.url) do |anem|
  anem.on_every_page do |page|
    page_counts[page.url.to_s] += 1
    next unless options.links

    # page.doc is tested for nil directly; the original's `present?` is an
    # ActiveSupport method that is not loaded here and would raise.
    doc = page.doc
    next unless doc

    doc.css('a').each do |link|
      href = link['href']
      next if href.nil? # anchors without an href carry no link to count

      link_counts[href.to_s] += 1
    end
  end
end

print_tallies(page_counts, options.sortby)
print_tallies(link_counts, options.sortby)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment