Last active
June 10, 2024 12:40
-
-
Save Dan-Q/b2ea5af644b14bd40ae9dd3fdd02e47f to your computer and use it in GitHub Desktop.
Extract all sites listed on https://woocommerce.com/showcase/ and check each, reporting to CSV if it (apparently) isn't running WooCommerce
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby
# rubocop:disable Security/Open
#
# Version History
# 1.0.0 - Initial version
# Constants
# (string constants are frozen so they cannot be mutated by accident at runtime)

# Landing page of the showcase; its pagination links reveal how many list pages exist
SHOWCASE_ROOT = 'https://woocommerce.com/showcase/'.freeze
# Template for an individual list page; %d is filled with the 1-based page number via Kernel#format
SHOWCASE_LIST_PAGE = 'https://woocommerce.com/showcase/page/%d/'.freeze
# Format string handed to ProgressBar.create (see the ruby-progressbar docs for the % flags)
PROGRESSBAR_FORMAT = '%t | %B | %e'.freeze
SLEEP_TIME = 0.25 # time between requests to WCCOM (throttling), in seconds as passed to Kernel#sleep
# JSON file in the working directory that caches the crawled site list between runs
CACHE_FILE = 'showcase-cache.json'.freeze
# CSV report in the working directory listing the sites that did not pass the check
OUTPUT_FILE = 'showcase-status.csv'.freeze
# Status value for a site that passes; sites with this status are left out of the report
GOOD_STATUS = '✅ Good'.freeze
# Dependencies: | |
require 'bundler/inline' | |
gemfile do | |
source 'https://rubygems.org' | |
gem 'nokogiri' | |
gem 'ruby-progressbar' | |
end | |
require 'open-uri' | |
require 'json' | |
require 'net/http' | |
# Build `showcases`: an Array of Hashes with the keys 'title', 'url' (the
# showcase page on woocommerce.com) and 'site_url' (the link found on that
# showcase page). The list is read from CACHE_FILE when that file exists;
# otherwise it is crawled from the showcase listing and then written to CACHE_FILE.
if File.exist? CACHE_FILE
  puts "0-3. Loading showcase cache from '#{CACHE_FILE}' (delete this file if you want to regenerate the site list)"
  showcases = JSON.parse(File.read(CACHE_FILE))
else
  # Determine how many pages of showcase sites there are by extracting the pagination data:
  puts "0. Enumerating showcase pages, starting from #{SHOWCASE_ROOT}..."
  root_page = Nokogiri::HTML(URI.open(SHOWCASE_ROOT))
  last_page_link = root_page.css('a.page-numbers:not(.prev):not(.next)').last
  # A listing without pagination links (or with a non-numeric label) is treated
  # as a single page instead of crashing on nil or crawling zero pages.
  last_page_label = last_page_link ? last_page_link.inner_text : ''
  last_page_number = [last_page_label.delete('^0-9').to_i, 1].max

  # Process each page from the showcase list to extract a list of sites
  showcases = []
  progressbar = ProgressBar.create(title: '1. Enumerating showcases', total: last_page_number, format: PROGRESSBAR_FORMAT)
  (1..last_page_number).each do |page_number|
    sleep(SLEEP_TIME)
    page = Nokogiri::HTML(URI.open(format(SHOWCASE_LIST_PAGE, page_number)))
    # Extract the list of showcase sites from the page
    page.css('.showcase-list .title').each do |showcase_item|
      href = showcase_item['href']
      next if href.nil? # a title element without a link gives us nothing to follow

      showcases << { 'title' => showcase_item.inner_text.strip, 'url' => href.strip }
    end
    progressbar.increment
  end

  # Derive site URLs from the list of showcase sites
  progressbar = ProgressBar.create(title: '2. Deriving site URLs', total: showcases.length, format: PROGRESSBAR_FORMAT)
  showcases.each do |showcase|
    sleep(SLEEP_TIME)
    showcase_page = Nokogiri::HTML(URI.open(showcase['url']))
    site_link = showcase_page.css('.site-link').last
    site_href = site_link && site_link['href']
    # Left as nil when the showcase page has no usable site link; filtered out below
    showcase['site_url'] = site_href&.strip
    progressbar.increment
  end

  # One showcase page without a site link should not abort the crawl: drop
  # those entries and report how many were skipped once the progress bar is done.
  without_site, showcases = showcases.partition { |showcase| showcase['site_url'].to_s.empty? }
  warn "   Skipped #{without_site.length} showcase(s) with no site link" unless without_site.empty?

  # Save the list of showcase sites to a file for re-use next time (performance)
  puts "3. Saving showcase cache to '#{CACHE_FILE}' (to make this faster next time!)..."
  File.open(CACHE_FILE, 'w') { |f| f.puts showcases.to_json }
end
# Start a fresh report file before any site is tested
puts " Writing output to #{OUTPUT_FILE}..."
# The file begins with a UTF-8 byte-order mark (bytes EF BB BF) to help readers
# identify the encoding, since the status column contains emoji; the header row follows.
File.write(OUTPUT_FILE, "\u{FEFF}Title,URL,Showcase URL,Status\n")
# Formats one value as a CSV field: values containing a comma, double quote or
# line break are wrapped in double quotes with embedded quotes doubled (RFC 4180).
# Hand-rolled so the script needs no dependency beyond those it already loads.
#
# @param value [Object] the value to write; converted with #to_s
# @return [String] the field text, quoted only when necessary
def csv_field(value)
  text = value.to_s
  return text unless text.match?(/[",\r\n]/)

  %("#{text.gsub('"', '""')}")
end

# Probes a site for WooCommerce and returns a human-readable status string.
#
# First requests the plugin's main file under /wp-content/plugins/woocommerce/;
# a successful response counts as GOOD_STATUS. Otherwise the site's home page is
# fetched and searched for links into /wp-content/ as a hint that it runs WordPress.
# Network failures are reported as a status rather than raised, so a single
# unreachable site cannot abort the whole run.
#
# @param site_url [String, nil] base URL of the site to probe
# @return [String] GOOD_STATUS or a description of what was (not) found
def woocommerce_status(site_url)
  base = site_url.to_s.strip
  site_uri = URI.parse(base)
  # Only absolute http(s) URLs are fetched: the value was scraped from a web
  # page, and anything else must not reach Kernel#open via URI.open.
  return '💣 Invalid site URL' unless site_uri.is_a?(URI::HTTP) && site_uri.host

  # chomp avoids a double slash when the site URL already ends in "/"
  wc_test_uri = URI.parse("#{base.chomp('/')}/wp-content/plugins/woocommerce/woocommerce.php")
  return GOOD_STATUS if Net::HTTP.get_response(wc_test_uri).is_a?(Net::HTTPSuccess)

  # No WC, but might still be WP? Worth checking in case WC is just concealed or on a sub-site.
  site_home = Nokogiri::HTML(site_uri.open)
  # NodeSet is truthy even when empty, so the match count must be tested explicitly
  if site_home.css('[href*="/wp-content/"]').any?
    '❔ WooCommerce not detected; but WordPress detected'
  else
    '❌ WooCommerce not detected; and WordPress not detected'
  end
rescue URI::InvalidURIError
  '💣 Invalid site URL'
rescue Errno::ETIMEDOUT, Net::ReadTimeout, Net::OpenTimeout
  '💣 Timeout (try another time?)'
rescue Errno::ECONNREFUSED, Errno::ECONNRESET, SocketError, OpenURI::HTTPError
  # SocketError covers host names that no longer resolve
  '💣 Connection Error (try another time? or might be doing human-verification?)'
rescue OpenSSL::SSL::SSLError
  '💣 SSL Error'
end

# Test every site and append each one that is not GOOD_STATUS to the CSV report
progressbar = ProgressBar.create(title: '4. Testing sites', total: showcases.length, format: PROGRESSBAR_FORMAT)
showcases.each do |showcase|
  showcase['status'] = woocommerce_status(showcase['site_url'])
  unless showcase['status'] == GOOD_STATUS
    row = showcase.values_at('title', 'url', 'site_url', 'status').map { |value| csv_field(value) }
    # The file is re-opened in append mode per row, so each row is written out
    # and the file closed before the next site is tested.
    File.open(OUTPUT_FILE, 'a') { |csv| csv.puts row.join(',') }
  end
  progressbar.increment
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment