bundle install
./pull_data.rb
Processing information is displayed as the program runs. Output is placed into CSV files on disk.
source 'https://rubygems.org'
gem 'nokogiri'
#!/usr/bin/env ruby
require 'open-uri'
require 'nokogiri'
require 'csv'
require 'pp'

# For every season in `years` and every team code in `teams`, download the
# team's season page from pro-football-reference.com, read the table rows
# found inside the comment node of div#all_team_td_log, and write them to
# "<year>.<team>.csv" in the current working directory. Each CSV starts with
# a header row made of `fields`, followed by one row per <tr> in the table.
#
# Progress and error messages are printed to stdout. A year/team combination
# is attempted up to `max_attempts` times and is skipped if every attempt
# fails.

max_attempts = 3
years = (2001..2017)
teams = %w[crd atl rav buf car chi cin cle dal den det gnb htx clt jax kan sdg ram mia min nwe nor nyg nyj rai phi pit sfo sea tam oti was]
fields = %w[game_date at_or_vs opp game_result quarter distance score_type description]

years.each do |year|
  teams.each do |team|
    url = "https://www.pro-football-reference.com/teams/#{team}/#{year}.htm"

    # Retry a bounded number of times in case of errors
    max_attempts.times do |attempt|
      begin
        puts "Processing #{year} - #{team}"

        # Pull down the HTML page. URI#open (mixed in by open-uri) is used
        # instead of Kernel#open: open-uri's handling of URLs through
        # Kernel#open was deprecated in Ruby 2.7 and removed in 3.0. The
        # block form closes the downloaded IO once parsing is done.
        doc = URI.parse(url).open { |page| Nokogiri::HTML(page) }

        # The table markup is stored inside an HTML comment, so the comment
        # text has to be parsed as a document of its own.
        hidden_markup = doc.xpath("//div[@id='all_team_td_log']/comment()").text
        inner_doc = Nokogiri::HTML(hidden_markup)

        # One output row per table row; one cell per requested data-stat.
        # A missing cell yields an empty string because the text of an empty
        # node set is "".
        rows = inner_doc.css('#div_team_td_log tbody tr').map do |tr|
          fields.map { |field| tr.xpath("td[@data-stat='#{field}']").text }
        end

        CSV.open("#{year}.#{team}.csv", 'w') do |csv|
          csv << fields
          rows.each { |row| csv << row }
        end

        # Success: stop retrying this year/team.
        break
      rescue StandardError => e
        # StandardError rather than RuntimeError: network failures such as
        # OpenURI::HTTPError, SocketError and Errno::* are StandardError
        # subclasses but not RuntimeError subclasses, so rescuing only
        # RuntimeError let a single bad response abort the whole run.
        if attempt < max_attempts - 1
          puts "ERROR: #{year} #{team} trying again"
        else
          puts "ERROR: #{year} #{team} skipping"
        end
        puts e.message
        puts e.backtrace
      end
    end
  end
end