Created
March 27, 2011 00:30
-
-
Save guipn/888789 to your computer and use it in GitHub Desktop.
Script to download from a webpage all files the extensions of which are among the ones listed by the user.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# | |
# Downloads all files directly linked within a webpage that are of a given list of extensions. | |
# | |
# The webpage in question is to be provided as the first argument for the program. | |
# The formats shall be specified as a single, second argument by the user, like so: "pdf txt tar.gz" | |
# | |
# If a third argument is given, it will be used as a target folder in which to save downloaded files. | |
# | |
# example: getfiles.pl http://google.com "gif js" | |
# | |
# gdjs | |
use warnings; | |
use strict; | |
use WWW::Mechanize; | |
usage() unless @ARGV > 1; | |
my $targetfile = shift; | |
my @extensions = split " ", shift; | |
my $targetfolder = shift; | |
## setup destination folder, if any ## | |
$targetfolder = $targetfolder ? "$targetfolder/" : ""; | |
mkdir($targetfolder) unless -e $targetfolder; | |
## extra configs ## | |
my $verbose = 1; | |
my %mechconf = ( | |
autocheck => 0, | |
agent => 'downloader', | |
stack_depth => 0, # don't keep history in memory | |
quiet => 1, | |
onerror => undef, | |
timeout => 3, # s | |
); | |
my $client = WWW::Mechanize->new(%mechconf); | |
my $urlregex = buildurlregex(); | |
## this is where it starts ## | |
msg("[$targetfile]"); | |
$client->get($targetfile); | |
error( $client->status() ) unless $client->success(); | |
my @todownload = $client->find_all_links( url_regex => qr/$urlregex/ ); | |
print "\n\n"; | |
for (@todownload) | |
{ | |
my $filename = (split "/", $_->url)[-1]; | |
print "\n\tGetting '$filename'"; | |
$client->get($_->url, ':content_file' => $targetfolder.$filename); | |
print "\t\tfailed (", $client->status(), ")!\n" unless $client->success(); | |
} | |
print "\n\n\tDone.\n\n"; | |
sub usage | |
{ | |
die "\n\tUsage: $0 [uri] \"ext1 ext2 ...\" [localfolder?]\n\n"; | |
} | |
## builds "(ext1|ext2|...)" to be used as an argument to Mech. ## | |
sub buildurlregex | |
{ | |
my $ret = join "|", @extensions; | |
return "($ret)"; | |
} | |
sub msg | |
{ | |
return unless $verbose; | |
print "\n\t", shift; | |
} | |
sub error | |
{ | |
my $errorcode = shift; | |
die "Operation aborted.\n\t\tError $errorcode reported."; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment