Skip to content

Instantly share code, notes, and snippets.

@tushargugnani
Created February 26, 2019 05:29
Show Gist options
  • Save tushargugnani/c4f1cee52c8395810e563f4d0732a130 to your computer and use it in GitHub Desktop.
Save tushargugnani/c4f1cee52c8395810e563f4d0732a130 to your computer and use it in GitHub Desktop.
<?php
namespace Tests\Browser;
use App\Page;
use Facebook\WebDriver\WebDriverBy;
use Tests\DuskTestCase;
use Laravel\Dusk\Browser;
use Illuminate\Foundation\Testing\DatabaseMigrations;
/**
 * Dusk-driven site spider.
 *
 * Starting from {@see self::$startUrl}, visits every page it can reach through
 * <a> links that stay on {@see self::$domain}, and records each page as a
 * Page row (url, isCrawled flag, HTTP status, document title).
 */
class duskSpiderTest extends DuskTestCase
{
    /** @var string Domain the crawl is confined to; its subdomains are accepted too. */
    protected static $domain = 'laravel.com';

    /** @var string URL the crawl starts from. */
    protected static $startUrl = 'https://laravel.com/';

    /**
     * Runs the `migrate:fresh` artisan command before every test.
     *
     * NOTE(review): this presumably wipes whatever database the test
     * environment points at -- confirm it is never the real one.
     */
    public function setUp(): void
    {
        parent::setUp();
        $this->artisan('migrate:fresh');
    }

    /**
     * Seeds the queue with the start URL, crawls until no uncrawled page is
     * left, then asserts that the queue is empty.
     *
     * @test
     */
    public function urlSpider()
    {
        $startingLink = Page::create([
            // Stored in the same normalised form as discovered links; otherwise
            // "https://host/" and "https://host" become two separate pages.
            'url' => $this->trimUrl(self::$startUrl),
            'isCrawled' => false,
        ]);

        $this->browse(function (Browser $browser) use ($startingLink) {
            $this->getLinks($browser, $startingLink);
        });

        $this->assertFalse(
            Page::where('isCrawled', false)->exists(),
            'Every discovered page should have been crawled.'
        );
    }

    /**
     * Crawls $currentUrl and then every page that is still marked uncrawled.
     *
     * Works iteratively, so the depth of the call stack does not grow with
     * the number of pages. A page whose processing throws is reported on
     * STDERR and still flagged as crawled, so one broken page can neither
     * abort the crawl nor be retried forever.
     *
     * @param Browser $browser    Dusk browser used to load the pages.
     * @param Page    $currentUrl Page record to crawl first.
     *
     * @return void
     */
    protected function getLinks(Browser $browser, $currentUrl)
    {
        $page = $currentUrl;

        while ($page !== null) {
            try {
                $this->processCurrentUrl($browser, $page);
            } catch (\Exception $e) {
                // Fully qualified on purpose: an unqualified "Exception" would
                // resolve inside this namespace and never match anything.
                fwrite(STDERR, sprintf("Failed to crawl %s: %s\n", $page->url, $e->getMessage()));
            }

            // Whatever happened above, persist the flag so the next query
            // cannot hand back the same record again.
            $page->isCrawled = true;
            $page->save();

            $page = Page::where('isCrawled', false)->first();
        }
    }

    /**
     * Visits one page, queues its new same-domain links and stores its
     * HTTP status and title.
     *
     * Does nothing when the database already lists the URL as crawled.
     *
     * @param Browser $browser    Dusk browser used to load the page.
     * @param Page    $currentUrl Page record to process; updated and saved.
     *
     * @return void
     */
    protected function processCurrentUrl(Browser $browser, $currentUrl)
    {
        // Ask the database rather than the in-memory model, which may be stale.
        $stored = Page::where('url', $currentUrl->url)->first();
        if ($stored !== null && $stored->isCrawled) {
            return;
        }

        $browser->visit($currentUrl->url);

        // Queue every link that passes isValidUrl() (on-domain and not stored yet).
        $linkElements = $browser->driver->findElements(WebDriverBy::tagName('a'));
        foreach ($linkElements as $element) {
            $href = $this->trimUrl($element->getAttribute('href'));
            if ($this->isValidUrl($href)) {
                Page::create([
                    'url' => $href,
                    'isCrawled' => false,
                ]);
            }
        }

        $currentUrl->isCrawled = true;
        $currentUrl->status = $this->getHttpStatus($currentUrl->url);
        $currentUrl->title = $browser->driver->getTitle();
        $currentUrl->save();
    }

    /**
     * Tells whether $url should be added to the crawl queue.
     *
     * A URL qualifies when it is an http(s) URL whose host is the configured
     * domain or one of its subdomains, and no Page row with that URL exists.
     * The host must match exactly or end in ".<domain>", so lookalikes such
     * as "notlaravel.com" or "laravel.com.evil.test" are rejected.
     *
     * @param mixed $url Candidate URL, already normalised by trimUrl().
     *
     * @return bool
     */
    protected function isValidUrl($url)
    {
        if (!is_string($url) || $url === '') {
            return false;
        }

        $parts = parse_url($url);
        if ($parts === false || !isset($parts['host'], $parts['scheme'])) {
            return false;
        }

        if (!in_array(strtolower($parts['scheme']), ['http', 'https'], true)) {
            return false;
        }

        $host = strtolower($parts['host']);
        $domain = strtolower(self::$domain);
        $suffix = '.' . $domain;

        $onDomain = $host === $domain || substr($host, -strlen($suffix)) === $suffix;

        return $onDomain && !Page::where('url', $url)->exists();
    }

    /**
     * Normalises a URL by dropping its fragment and any trailing slashes.
     *
     * @param mixed $url Raw href value; null or non-string input yields ''.
     *
     * @return string
     */
    protected function trimUrl($url)
    {
        if (!is_string($url) || $url === '') {
            return '';
        }

        // explode() instead of strtok(): strtok() skips leading delimiters
        // (turning "#top" into "top") and keeps global tokenizer state.
        $withoutFragment = explode('#', $url, 2)[0];

        return rtrim($withoutFragment, '/');
    }

    /**
     * Fetches the HTTP status code of $url with a separate get_headers() request.
     *
     * Reads the first status line of the response.
     *
     * @param string $url Absolute URL to probe.
     *
     * @return int Status code, or 0 when the request fails or the status
     *             line cannot be parsed.
     */
    protected function getHttpStatus($url)
    {
        try {
            $headers = get_headers($url);
        } catch (\ErrorException $e) {
            // Only reachable when an error handler turns the warning that
            // get_headers() emits on failure into an exception.
            return 0;
        }

        if ($headers === false || !isset($headers[0])) {
            return 0;
        }

        // Parse the status line by pattern rather than fixed offsets, so
        // "HTTP/2 200" is read as correctly as "HTTP/1.1 200 OK".
        if (preg_match('/^HTTP\/\S+\s+(\d{3})/', $headers[0], $match) === 1) {
            return (int) $match[1];
        }

        return 0;
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment