Created
May 22, 2013 16:03
-
-
Save ajmontag/5628758 to your computer and use it in GitHub Desktop.
Scrapes random Wikipedia pages for the last page edit time.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl
#
# Endlessly requests random English Wikipedia articles and, for each article
# page that is fetched successfully, prints one line to STDOUT:
#
#     <days since the page was last modified>\t<article url>
#
# Warnings (unparseable pages, unexpected status codes) and transfer errors
# are written to STDERR. The script runs until it is interrupted.

use strict;
use warnings;

use WWW::Curl::Easy;
use DateTime;                       # DateTime->now is called directly below
use DateTime::Format::Strptime;
use IO::Handle qw( );               # For autoflush

# Emit each result line immediately so the output can be piped or tailed.
STDOUT->autoflush(1);

my $rand_url = "http://en.wikipedia.org/wiki/Special:Random";
my $next_url = $rand_url;

# Status codes whose Location header names the URL to request next.
my %is_redirect = map { $_ => 1 } (301, 302, 303, 307, 308);

# Footer timestamps look like: 19 February 2013 at 21:50
# NOTE(review): the parsed time is interpreted as UTC-06:00. If the footer is
# actually rendered in UTC the day count can be off by one near midnight --
# confirm which zone the page uses before relying on exact values.
my $Strp = DateTime::Format::Strptime->new(
    pattern   => '%d %B %Y at %H:%M',
    time_zone => '-0600',
);

my $curl = WWW::Curl::Easy->new;

# Ask curl to include the response headers in the captured output; the
# redirect target is read from the Location header further down.
$curl->setopt(CURLOPT_HEADER, 1);

# A filehandle, reference to a scalar or reference to a typeglob can be used here.
my $response_body;
$curl->setopt(CURLOPT_WRITEDATA, \$response_body);

while (1) {
    $curl->setopt(CURLOPT_URL, $next_url);

    # Reset the capture buffer before every request.
    $response_body = "";

    # Starts the actual request
    my $retcode = $curl->perform;

    if ($retcode != 0) {
        # Error code, type of error, error message. $next_url is left
        # untouched, so the same URL is tried again on the next iteration.
        print STDERR "An error happened: $retcode ".$curl->strerror($retcode)." ".$curl->errbuf."\n";
        next;
    }

    my $response_code = $curl->getinfo(CURLINFO_HTTP_CODE);

    if ($is_redirect{$response_code}) {
        # Header names are case-insensitive, hence /i. The capture excludes
        # CR/LF so the URL never carries a trailing line terminator.
        my ($location) = $response_body =~ /\r?\nLocation:[ \t]*([^\r\n]+)\r?\n/i;

        if (defined $location) {
            $next_url = $location;
        } else {
            print STDERR "WARN unable to parse next url\n";
            $next_url = $rand_url;
        }
    } elsif ($response_code == 200) {
        # Non-greedy capture: stop at the first period after the timestamp
        # instead of running on to the last period on the same line.
        my ($stamp) = $response_body
            =~ /<li id="footer-info-lastmod"> This page was last modified on (.+?)\./;

        # parse_datetime returns undef when the text does not fit the
        # pattern; treat that like a missing timestamp rather than dying.
        my $dt = defined $stamp ? $Strp->parse_datetime($stamp) : undef;

        if (defined $dt) {
            # Take "now" per page so a long-running scrape does not measure
            # against the moment the script was started.
            my $days = $dt->delta_days(DateTime->now)->in_units('days');
            print STDOUT "$days\t$next_url\n";
        } else {
            print STDERR "WARN unable to parse a timestamp\n";
        }

        # get another random page
        $next_url = $rand_url;
    } else {
        print STDERR "WARN unacceptable http response_code $response_code\n";
        $next_url = $rand_url;
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment