Skip to content

Instantly share code, notes, and snippets.

@7415963987456321
Created August 5, 2024 11:45
Show Gist options
  • Save 7415963987456321/608f6e9f529128717a99e4d6f3068404 to your computer and use it in GitHub Desktop.
Save 7415963987456321/608f6e9f529128717a99e4d6f3068404 to your computer and use it in GitHub Desktop.
#!/usr/bin/perl
use strict;
use warnings;
use utf8;
use WWW::Mechanize::Sleepy;
use HTML::TokeParser;
use Data::Dumper;
use open qw( :std :encoding(UTF-8) );
# Scrapes job postings from tvinna.is and prints one CSV row per posting, for job-market analysis.
# Shared user agent used by fetch_comp for every page request.
# NOTE(review): option meanings are taken from the WWW::Mechanize(::Sleepy)
# docs, not from this file -- presumably `sleep` is the pause between
# requests, `autocheck` makes failed requests fatal, and `timeout` is the
# request timeout in seconds. Confirm before changing them.
my %mech_options = (
    timeout   => 100,
    autocheck => 1,
    sleep     => '1',
);
my $mech = WWW::Mechanize::Sleepy->new(%mech_options);
# Read the URL list named by the first command-line argument and print one
# CSV row (as returned by fetch_comp) for every URL in it.
#
# The file is read as UTF-8, one URL per line. Leading and trailing
# whitespace -- including the "\r" left behind by CRLF line endings -- is
# stripped from each line, and lines that are empty afterwards are skipped
# rather than passed to fetch_comp, which dies on an empty URL.
#
# Dies when no filename argument was given or the file cannot be opened.
# Any error raised by fetch_comp propagates to the caller.
sub read_list {
    my $filename = $ARGV[0];
    die("No urllist found in args")
        unless defined $filename && length $filename;

    open(my $fh, '<:encoding(UTF-8)', $filename)
        or die "Could not open file '$filename' $!";

    while (my $row = <$fh>) {
        # Trim instead of chomp so stray spaces and "\r" never reach the URL.
        $row =~ s/\A\s+|\s+\z//g;
        next if $row eq '';
        print fetch_comp($row);
    }

    close($fh) or warn "Could not close file '$filename' $!";
    return;
}
# Fetch one job-posting page and return its details as a single CSV line.
#
# Parameters:
#   $url - address of the posting page; dies with "invalid url" when it is
#          missing or false.
#
# Returns a newline-terminated string of four double-quoted fields separated
# by ", ": company name, company link, post date and job type. Double quotes
# inside a field are doubled so the row stays parseable as CSV. The link
# field is "-" when no <a> tag is found or the tag has no href.
#
# The fields are taken positionally from the page: the first <span>, the
# first <a> after it, and the following two <span> elements.
# NOTE(review): this assumes the posting-page layout is stable -- confirm
# against the live site if rows start coming back empty.
sub fetch_comp {
    my $url = shift or die("invalid url");

    # Get the page
    $mech->get($url);

    # Parse the content, read through the public accessor rather than the
    # object's internal hash slot.
    my $html   = $mech->content;
    my $stream = HTML::TokeParser->new(\$html);

    $stream->get_tag("span");    # First span contains company and url

    # get_tag returns undef when no <a> is left in the document; guard the
    # dereference so the "-" fallback also covers that case.
    my $anchor   = $stream->get_tag("a");
    my $href_url = ($anchor && $anchor->[1]{href}) || "-";
    my $company  = $stream->get_trimmed_text("/span");    # Company name

    $stream->get_tag("span");    # Post date
    my $date_txt = $stream->get_trimmed_text("/span");

    $stream->get_tag("span");    # Type of job, e.g. full time
    my $type = $stream->get_trimmed_text("/span");

    # Quote every field, doubling embedded quotes (CSV escaping).
    my @cells;
    for my $value ($company, $href_url, $date_txt, $type) {
        (my $cell = $value) =~ s/"/""/g;
        push @cells, qq("$cell");
    }

    return join(', ', @cells) . "\n";    # CSV return
}
# Script entry point: process the URL list named by the first command-line argument.
read_list();
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment