Skip to content

Instantly share code, notes, and snippets.

@thoslin
Created April 24, 2014 15:14
Show Gist options
  • Save thoslin/11258378 to your computer and use it in GitHub Desktop.
Save thoslin/11258378 to your computer and use it in GitHub Desktop.
<?php
/**
* A simple PHP web scraper
* >> php -f scraper.php
*/
//echo "Hello World!";
/**
 * Perform an HTTP GET request with cURL and hand back the response.
 *
 * Redirects are followed and a desktop Safari user agent string is sent.
 *
 * @param string $url Address to fetch.
 * @return string|bool Whatever curl_exec() yields: the body on success,
 *                     false when the transfer fails.
 */
function request_url($url) {
    $options = array(
        CURLOPT_URL => $url,
        CURLOPT_FOLLOWLOCATION => 1,
        // Return the body from curl_exec() instead of printing it.
        CURLOPT_RETURNTRANSFER => 1,
        CURLOPT_USERAGENT => 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/534.55.3 (KHTML, like Gecko) Version/5.1.3 Safari/534.53.10'
    );

    $handle = curl_init();
    curl_setopt_array($handle, $options);
    $body = curl_exec($handle);
    curl_close($handle);

    return $body;
}
/**
 * Construct an absolute URL from a site-relative path.
 *
 * - A value that already mentions the site's host, or that already carries
 *   its own scheme (e.g. "http://..."), is returned unchanged.
 * - Anything else is treated as a path on the site and prefixed with
 *   "http://www.unite-students.com", with exactly one slash between the
 *   host and the path.
 *
 * @param string $url Relative path (e.g. "/london/some-property") or a full URL.
 * @return string Absolute URL.
 */
function build_url($url) {
    $domain = "www.unite-students.com";

    // Already points at the site: leave it alone.
    if (strpos($url, $domain) !== false) {
        return $url;
    }

    // Already absolute (has a scheme): prepending the host would corrupt it.
    if (preg_match('#^[a-z][a-z0-9+.\-]*://#i', $url) === 1) {
        return $url;
    }

    // Include the scheme so the result is a real absolute URL, and normalise
    // the separator so "path" and "/path" both yield "host/path".
    return "http://" . $domain . "/" . ltrim($url, "/");
}
/**
 * Fetch a page and collect the distinct URLs selected by an XPath query.
 *
 * Each matched node's value is passed through build_url() before being
 * collected. Duplicates are removed and the result is re-indexed from 0.
 *
 * @param string $url   Page to download.
 * @param string $xpath XPath expression selecting the link nodes
 *                      (typically "@href" attributes).
 * @return array List of unique URLs; empty when the page could not be
 *               fetched or the query matched nothing.
 */
function extract_urls($url, $xpath) {
    $links = array();

    $response = request_url($url);
    // A failed transfer yields false and an empty body has nothing to parse;
    // DOMDocument::loadHTML() rejects empty input on PHP 8.
    if (!is_string($response) || $response === '') {
        return $links;
    }

    $dom_document = new DOMDocument();
    $dom_document->loadHTML($response);
    $dom_xpath = new DOMXpath($dom_document);

    $elements = $dom_xpath->query($xpath);
    // query() returns false for a malformed expression.
    if ($elements === false) {
        return $links;
    }

    foreach ($elements as $element) {
        $links[] = build_url($element->nodeValue);
    }

    // array_unique() returns a new array (it does not modify its argument)
    // and preserves keys, so re-index to get a clean list.
    return array_values(array_unique($links));
}
/**
 * Run an XPath query and gather the "data" property of every matched node.
 *
 * @param DOMXPath $dom_xpath XPath evaluator bound to the parsed document.
 * @param string   $query     XPath expression (the callers in this file
 *                            select text() nodes).
 * @return mixed The single value when exactly one node matched, otherwise
 *               an array of all values (empty when nothing matched).
 */
function extract_data($dom_xpath, $query) {
    $values = [];

    foreach ($dom_xpath->query($query) as $match) {
        $values[] = $match->data;
    }

    // A lone match is unwrapped so callers get a scalar rather than a
    // one-element array.
    return count($values) == 1 ? $values[0] : $values;
}
/**
 * Parse information from a detail page.
 *
 * Downloads the page, then extracts the property name, headline price and
 * street address, plus one entry per room header found on the page.
 *
 * @param string $url Absolute URL of the detail page.
 * @return array Associative array with keys 'url', 'name', 'price',
 *               'address' and 'rooms' (a list of arrays with 'name' and
 *               'price').
 * @throws RuntimeException When the page could not be downloaded or came
 *                          back empty.
 */
function parse_item($url) {
    $response = request_url($url);
    // Fail with a catchable exception instead of handing false or "" to
    // DOMDocument::loadHTML(), which rejects empty input on PHP 8.
    if (!is_string($response) || $response === '') {
        throw new RuntimeException("Empty or failed response for {$url}");
    }

    $dom_document = new DOMDocument();
    $dom_document->loadHTML($response);
    $dom_xpath = new DOMXpath($dom_document);

    $listing = array(
        'url' => $url,
        'name' => extract_data($dom_xpath, "//span[@itemprop='name']/text()"),
        'price' => extract_data($dom_xpath, "//div[@data-tab='overview']/h2/span/text()"),
        'address' => extract_data($dom_xpath, "//span[@itemprop='streetAddress']/text()")
    );

    $rooms = array();
    $room_nodes = $dom_xpath->query("//h2[@class='property-room-header']/text()");
    foreach ($room_nodes as $room_node) {
        // The price is read from the node immediately following the header
        // text; that sibling may be absent, in which case price is null.
        $price_node = $room_node->nextSibling;
        $rooms[] = array(
            'name' => $room_node->data,
            'price' => $price_node !== null ? $price_node->nodeValue : null
        );
    }
    $listing['rooms'] = $rooms;

    return $listing;
}
/**
 * Crawl the London listings and dump every scraped property to a JSON file.
 *
 * Steps:
 *  1. Collect the list-page URLs linked from the start page.
 *  2. For each list page, find the detail-page paths ("/london/<slug>").
 *  3. Parse each distinct detail page once, pausing between requests.
 *  4. Write all results to a timestamped .json file next to this script.
 *
 * Progress and errors are reported on standard output.
 */
function main() {
    // Suppress warnings
    error_reporting(E_ERROR | E_PARSE);
    $start_url = "http://www.unite-students.com/London";
    $listings = array();
    // Detail paths already processed, used as a set (path => true) so the
    // same page is never fetched twice, even across list pages.
    $crawled = array();
    echo "Start from here: {$start_url}\n\n";
    // List page URLS are in a special form like "/pp\d+/City"
    $list_page_urls = extract_urls($start_url,
        "//a[contains(@href, 'pp') and contains(text(), 'View all properties')]/@href");
    // Extract all detail page links
    foreach ($list_page_urls as $list_page_url) {
        $response = request_url($list_page_url);
        if (!is_string($response) || $response === '') {
            echo "Failed to fetch list page {$list_page_url}, skipping.\n";
            continue;
        }
        $found = preg_match_all('/.*(\/london\/[\w-]+)/', $response, $matches);
        // preg_match_all() always fills $matches, so test the match count
        // instead; the same link usually appears several times per page.
        $detail_paths = $found ? array_values(array_unique($matches[1])) : array();
        echo "Matched URLs:\n";
        print_r($detail_paths);
        // Scrape useful contents from all detail pages
        foreach ($detail_paths as $match) {
            if (isset($crawled[$match])) {
                continue;
            }
            $crawled[$match] = true;
            echo "\n\nCrawling URL " . $match . " ...\n";
            try {
                $item = parse_item(build_url($match));
                array_push($listings, $item);
                print_r($item);
            } catch (Exception $e) {
                echo 'Caught exception: ', $e->getMessage(), "\n";
            }
            // Take a break.
            sleep(10);
        }
    }
    // Save it to json file
    // NOTE(review): the timestamp contains a space and colons, which are not
    // valid in file names on Windows - consider a safer format.
    $filename = __DIR__ . "/" . date("Y-m-d H:i:s") . '.json';
    $json = json_encode($listings);
    if ($json === false || file_put_contents($filename, $json) === false) {
        echo "Failed to save results to {$filename}\n";
        return;
    }
    echo "Done! Results saved to {$filename}\n";
}
// Script entry point: run the scraper when this file is executed.
main();
?>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment