Skip to content

Instantly share code, notes, and snippets.

@silvioq
Last active February 10, 2017 13:05
Show Gist options
  • Save silvioq/6957a72f279ca7fb729a5ea685ffbcde to your computer and use it in GitHub Desktop.
Save silvioq/6957a72f279ca7fb729a5ea685ffbcde to your computer and use it in GitHub Desktop.
Proyecto de prueba para obtener datos de una página
ID: 12170236
Installation:
composer install
Usage:
php crawler.php URL-to-process
Example:
php crawler.php https://www.freelancer.com/
{
"require": {
"fabpot/goutte": "^3.2"
}
}
{
"_readme": [
"This file locks the dependencies of your project to a known state",
"Read more about it at https://getcomposer.org/doc/01-basic-usage.md#composer-lock-the-lock-file",
"This file is @generated automatically"
],
"hash": "defc342ae76c7f0ce5b64dd4004facca",
"content-hash": "425aac93d170ece8a1f83c754fbc0f1d",
"packages": [
{
"name": "fabpot/goutte",
"version": "v3.2.0",
"source": {
"type": "git",
"url": "https://github.com/FriendsOfPHP/Goutte.git",
"reference": "8cc89de5e71daf84051859616891d3320d88a9e8"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/FriendsOfPHP/Goutte/zipball/8cc89de5e71daf84051859616891d3320d88a9e8",
"reference": "8cc89de5e71daf84051859616891d3320d88a9e8",
"shasum": ""
},
"require": {
"guzzlehttp/guzzle": "^6.0",
"php": ">=5.5.0",
"symfony/browser-kit": "~2.1|~3.0",
"symfony/css-selector": "~2.1|~3.0",
"symfony/dom-crawler": "~2.1|~3.0"
},
"type": "application",
"extra": {
"branch-alias": {
"dev-master": "3.2-dev"
}
},
"autoload": {
"psr-4": {
"Goutte\\": "Goutte"
}
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"MIT"
],
"authors": [
{
"name": "Fabien Potencier",
"email": "fabien@symfony.com"
}
],
"description": "A simple PHP Web Scraper",
"homepage": "https://github.com/FriendsOfPHP/Goutte",
"keywords": [
"scraper"
],
"time": "2016-11-15 16:27:29"
},
{
"name": "guzzlehttp/guzzle",
"version": "6.2.2",
"source": {
"type": "git",
"url": "https://github.com/guzzle/guzzle.git",
"reference": "ebf29dee597f02f09f4d5bbecc68230ea9b08f60"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/guzzle/guzzle/zipball/ebf29dee597f02f09f4d5bbecc68230ea9b08f60",
"reference": "ebf29dee597f02f09f4d5bbecc68230ea9b08f60",
"shasum": ""
},
"require": {
"guzzlehttp/promises": "^1.0",
"guzzlehttp/psr7": "^1.3.1",
"php": ">=5.5"
},
"require-dev": {
"ext-curl": "*",
"phpunit/phpunit": "^4.0",
"psr/log": "^1.0"
},
"type": "library",
"extra": {
"branch-alias": {
"dev-master": "6.2-dev"
}
},
"autoload": {
"files": [
"src/functions_include.php"
],
"psr-4": {
"GuzzleHttp\\": "src/"
}
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"MIT"
],
"authors": [
{
"name": "Michael Dowling",
"email": "mtdowling@gmail.com",
"homepage": "https://github.com/mtdowling"
}
],
"description": "Guzzle is a PHP HTTP client library",
"homepage": "http://guzzlephp.org/",
"keywords": [
"client",
"curl",
"framework",
"http",
"http client",
"rest",
"web service"
],
"time": "2016-10-08 15:01:37"
},
{
"name": "guzzlehttp/promises",
"version": "1.3.0",
"source": {
"type": "git",
"url": "https://github.com/guzzle/promises.git",
"reference": "2693c101803ca78b27972d84081d027fca790a1e"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/guzzle/promises/zipball/2693c101803ca78b27972d84081d027fca790a1e",
"reference": "2693c101803ca78b27972d84081d027fca790a1e",
"shasum": ""
},
"require": {
"php": ">=5.5.0"
},
"require-dev": {
"phpunit/phpunit": "~4.0"
},
"type": "library",
"extra": {
"branch-alias": {
"dev-master": "1.0-dev"
}
},
"autoload": {
"psr-4": {
"GuzzleHttp\\Promise\\": "src/"
},
"files": [
"src/functions_include.php"
]
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"MIT"
],
"authors": [
{
"name": "Michael Dowling",
"email": "mtdowling@gmail.com",
"homepage": "https://github.com/mtdowling"
}
],
"description": "Guzzle promises library",
"keywords": [
"promise"
],
"time": "2016-11-18 17:47:58"
},
{
"name": "guzzlehttp/psr7",
"version": "1.3.1",
"source": {
"type": "git",
"url": "https://github.com/guzzle/psr7.git",
"reference": "5c6447c9df362e8f8093bda8f5d8873fe5c7f65b"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/guzzle/psr7/zipball/5c6447c9df362e8f8093bda8f5d8873fe5c7f65b",
"reference": "5c6447c9df362e8f8093bda8f5d8873fe5c7f65b",
"shasum": ""
},
"require": {
"php": ">=5.4.0",
"psr/http-message": "~1.0"
},
"provide": {
"psr/http-message-implementation": "1.0"
},
"require-dev": {
"phpunit/phpunit": "~4.0"
},
"type": "library",
"extra": {
"branch-alias": {
"dev-master": "1.4-dev"
}
},
"autoload": {
"psr-4": {
"GuzzleHttp\\Psr7\\": "src/"
},
"files": [
"src/functions_include.php"
]
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"MIT"
],
"authors": [
{
"name": "Michael Dowling",
"email": "mtdowling@gmail.com",
"homepage": "https://github.com/mtdowling"
}
],
"description": "PSR-7 message implementation",
"keywords": [
"http",
"message",
"stream",
"uri"
],
"time": "2016-06-24 23:00:38"
},
{
"name": "psr/http-message",
"version": "1.0.1",
"source": {
"type": "git",
"url": "https://github.com/php-fig/http-message.git",
"reference": "f6561bf28d520154e4b0ec72be95418abe6d9363"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/php-fig/http-message/zipball/f6561bf28d520154e4b0ec72be95418abe6d9363",
"reference": "f6561bf28d520154e4b0ec72be95418abe6d9363",
"shasum": ""
},
"require": {
"php": ">=5.3.0"
},
"type": "library",
"extra": {
"branch-alias": {
"dev-master": "1.0.x-dev"
}
},
"autoload": {
"psr-4": {
"Psr\\Http\\Message\\": "src/"
}
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"MIT"
],
"authors": [
{
"name": "PHP-FIG",
"homepage": "http://www.php-fig.org/"
}
],
"description": "Common interface for HTTP messages",
"homepage": "https://github.com/php-fig/http-message",
"keywords": [
"http",
"http-message",
"psr",
"psr-7",
"request",
"response"
],
"time": "2016-08-06 14:39:51"
},
{
"name": "symfony/browser-kit",
"version": "v3.1.7",
"source": {
"type": "git",
"url": "https://github.com/symfony/browser-kit.git",
"reference": "901319a31c9b3cee7857b4aeeb81b5d64dfa34fc"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/symfony/browser-kit/zipball/901319a31c9b3cee7857b4aeeb81b5d64dfa34fc",
"reference": "901319a31c9b3cee7857b4aeeb81b5d64dfa34fc",
"shasum": ""
},
"require": {
"php": ">=5.5.9",
"symfony/dom-crawler": "~2.8|~3.0"
},
"require-dev": {
"symfony/css-selector": "~2.8|~3.0",
"symfony/process": "~2.8|~3.0"
},
"suggest": {
"symfony/process": ""
},
"type": "library",
"extra": {
"branch-alias": {
"dev-master": "3.1-dev"
}
},
"autoload": {
"psr-4": {
"Symfony\\Component\\BrowserKit\\": ""
},
"exclude-from-classmap": [
"/Tests/"
]
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"MIT"
],
"authors": [
{
"name": "Fabien Potencier",
"email": "fabien@symfony.com"
},
{
"name": "Symfony Community",
"homepage": "https://symfony.com/contributors"
}
],
"description": "Symfony BrowserKit Component",
"homepage": "https://symfony.com",
"time": "2016-09-06 11:02:40"
},
{
"name": "symfony/css-selector",
"version": "v3.1.7",
"source": {
"type": "git",
"url": "https://github.com/symfony/css-selector.git",
"reference": "a37b3359566415a91cba55a2d95820b3fa1a9658"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/symfony/css-selector/zipball/a37b3359566415a91cba55a2d95820b3fa1a9658",
"reference": "a37b3359566415a91cba55a2d95820b3fa1a9658",
"shasum": ""
},
"require": {
"php": ">=5.5.9"
},
"type": "library",
"extra": {
"branch-alias": {
"dev-master": "3.1-dev"
}
},
"autoload": {
"psr-4": {
"Symfony\\Component\\CssSelector\\": ""
},
"exclude-from-classmap": [
"/Tests/"
]
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"MIT"
],
"authors": [
{
"name": "Jean-François Simon",
"email": "jeanfrancois.simon@sensiolabs.com"
},
{
"name": "Fabien Potencier",
"email": "fabien@symfony.com"
},
{
"name": "Symfony Community",
"homepage": "https://symfony.com/contributors"
}
],
"description": "Symfony CssSelector Component",
"homepage": "https://symfony.com",
"time": "2016-11-03 08:04:31"
},
{
"name": "symfony/dom-crawler",
"version": "v3.1.7",
"source": {
"type": "git",
"url": "https://github.com/symfony/dom-crawler.git",
"reference": "1eb3b4d216e8db117218dd2bb7d23dfe67bdf518"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/symfony/dom-crawler/zipball/1eb3b4d216e8db117218dd2bb7d23dfe67bdf518",
"reference": "1eb3b4d216e8db117218dd2bb7d23dfe67bdf518",
"shasum": ""
},
"require": {
"php": ">=5.5.9",
"symfony/polyfill-mbstring": "~1.0"
},
"require-dev": {
"symfony/css-selector": "~2.8|~3.0"
},
"suggest": {
"symfony/css-selector": ""
},
"type": "library",
"extra": {
"branch-alias": {
"dev-master": "3.1-dev"
}
},
"autoload": {
"psr-4": {
"Symfony\\Component\\DomCrawler\\": ""
},
"exclude-from-classmap": [
"/Tests/"
]
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"MIT"
],
"authors": [
{
"name": "Fabien Potencier",
"email": "fabien@symfony.com"
},
{
"name": "Symfony Community",
"homepage": "https://symfony.com/contributors"
}
],
"description": "Symfony DomCrawler Component",
"homepage": "https://symfony.com",
"time": "2016-11-14 16:20:02"
},
{
"name": "symfony/polyfill-mbstring",
"version": "v1.3.0",
"source": {
"type": "git",
"url": "https://github.com/symfony/polyfill-mbstring.git",
"reference": "e79d363049d1c2128f133a2667e4f4190904f7f4"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/symfony/polyfill-mbstring/zipball/e79d363049d1c2128f133a2667e4f4190904f7f4",
"reference": "e79d363049d1c2128f133a2667e4f4190904f7f4",
"shasum": ""
},
"require": {
"php": ">=5.3.3"
},
"suggest": {
"ext-mbstring": "For best performance"
},
"type": "library",
"extra": {
"branch-alias": {
"dev-master": "1.3-dev"
}
},
"autoload": {
"psr-4": {
"Symfony\\Polyfill\\Mbstring\\": ""
},
"files": [
"bootstrap.php"
]
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"MIT"
],
"authors": [
{
"name": "Nicolas Grekas",
"email": "p@tchwork.com"
},
{
"name": "Symfony Community",
"homepage": "https://symfony.com/contributors"
}
],
"description": "Symfony polyfill for the Mbstring extension",
"homepage": "https://symfony.com",
"keywords": [
"compatibility",
"mbstring",
"polyfill",
"portable",
"shim"
],
"time": "2016-11-14 01:06:16"
}
],
"packages-dev": [],
"aliases": [],
"minimum-stability": "stable",
"stability-flags": [],
"prefer-stable": false,
"prefer-lowest": false,
"platform": [],
"platform-dev": []
}
<?php
# Installation:
# composer install
#
# Usage:
# php crawler.php URL-to-process
#
# Example
# php crawler.php https://www.freelancer.com/
require "vendor/autoload.php";
if( count( $argv ) != 2 )
{
echo "Usage:\n php crawler.php URL-to-process\n\nExample:\n php crawler.php https://www.freelancer.com/\n\n";
exit( 1 );
}
$client = new Goutte\Client;
try
{
$start = microtime(true);
$crawler = $client->request('GET', $argv[1] );
$end = microtime(true);
} catch( GuzzleHttp\Exception\ConnectException $e )
{
print( "Can't connect to " . $argv[1] ."\nDetailed error:\n" );
print $e->getMessage() . PHP_EOL;
exit( 1 );
}
$result = [
"title" => 'No title',
"description" => 'No metatag description',
"keywords" => 'No metatag keywords',
"time" => sprintf("%d ms." , ($end - $start)*1000),
] ;
$crawler->filter("head > meta")->each(function($node,$i) use( &$result ){
$name = strtolower( $node->attr('name') );
switch( $name )
{
case 'description':
case 'keywords':
$result[$name] = strlen( $node->attr('content') );
}
});
$crawler->filter("head > title")->each(function($node) use (&$result){
$result['title'] = $node->text();
} );
print_r( $result );
exit( 0 );
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment