Created
August 5, 2024 11:45
-
-
Save 7415963987456321/608f6e9f529128717a99e4d6f3068404 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl | |
use strict; | |
use warnings; | |
use utf8; | |
use WWW::Mechanize::Sleepy; | |
use HTML::TokeParser; | |
use Data::Dumper; | |
use open qw( :std :encoding(UTF-8) ); | |
# Scrape job postings from tvinna.is so the results can be used to analyse the job market.
# Shared user agent used for every page request in this script.
#   autocheck - request failures raise an exception instead of being ignored
#   timeout   - request timeout handed through to the underlying agent
#   sleep     - pause applied by WWW::Mechanize::Sleepy between requests
#               (presumably seconds -- confirm against the module docs)
my %mech_options = (
    autocheck => 1,
    timeout   => 100,
    sleep     => '1',
);
my $mech = WWW::Mechanize::Sleepy->new(%mech_options);
# Read the URL list named by the first command-line argument and print one
# CSV line per URL to standard output.
#
# Arguments:
#   none -- the list file name is taken from $ARGV[0].
# Input format:
#   One URL per line. Surrounding whitespace (including a trailing carriage
#   return from CRLF files) is stripped, and blank lines are skipped.
# Dies:
#   When no file name was given or the file cannot be opened. Any exception
#   raised by fetch_comp() for an individual URL also propagates.
sub read_list {
    my $filename = $ARGV[0];

    # Test for "defined and non-empty" rather than plain truth so that a file
    # literally named "0" is still accepted.
    die("No urllist found in args")
        unless defined $filename && length $filename;

    open(my $fh, '<:encoding(UTF-8)', $filename)
        or die "Could not open file '$filename' $!";

    while (my $row = <$fh>) {
        chomp $row;

        # Remove stray whitespace / CR so it does not end up inside the URL.
        $row =~ s/\A\s+|\s+\z//g;

        # A blank line (e.g. a trailing newline at the end of the list) would
        # otherwise make fetch_comp() die with "invalid url" and abort the run.
        next if $row eq '';

        print fetch_comp($row);
    }

    close($fh) or warn "Could not close file '$filename' $!";
    return;
}
# Fetch one job-posting page and return its key fields as a single CSV line.
#
# Arguments:
#   $url - address of the posting page.
# Returns:
#   A newline-terminated string of four double-quoted, comma-separated
#   fields: company name, company link ("-" when no link is found),
#   post date and job type. Double quotes inside a value are doubled so the
#   quoting of the line stays intact.
# Dies:
#   With "invalid url" when $url is missing or false. The page is requested
#   through the file-level $mech object, so request failures surface as
#   whatever exception that object raises.
#
# NOTE(review): the order of the <span> elements (company/link, date, type)
# is taken from the original comments -- confirm against the live markup.
sub fetch_comp {
    my $url = shift or die("invalid url");

    # Get the page
    $mech->get($url);

    # Parse the content, using the public accessor instead of reaching into
    # the object's internal hash.
    my $html   = $mech->content;
    my $stream = HTML::TokeParser->new(\$html);

    $stream->get_tag("span");    # First span contains company and url

    # get_tag() returns undef when no <a> tag remains; guard the dereference
    # so the "-" fallback is actually reachable instead of dying. In that
    # case the parser has run to the end of the document and the remaining
    # fields come back empty.
    my $anchor   = $stream->get_tag("a");
    my $href_url = ($anchor && $anchor->[1]{href}) || "-";

    # Text up to the closing span is the company name
    my $company = $stream->get_trimmed_text("/span");

    $stream->get_tag("span");    # Post date
    my $date_txt = $stream->get_trimmed_text("/span");

    $stream->get_tag("span");    # Type of job (full-time etc.)
    my $type = $stream->get_trimmed_text("/span");

    # Quote every field, doubling embedded double quotes so a value such as
    # 'Acme "North"' cannot break the CSV line.
    my @fields = map {
        (my $value = $_) =~ s/"/""/g;
        qq("$value");
    } ($company, $href_url, $date_txt, $type);

    return join(", ", @fields) . "\n";    # CSV return
}
# Entry point: process the URL list named on the command line.
read_list();
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment