Created
May 20, 2020 11:20
-
-
Save stilliard/b981fcefe70fc7ab2649eb624479cd97 to your computer and use it in GitHub Desktop.
Scrape some blog posts and their images
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
// | |
// Quickly download some blog posts | |
// | |
// setup: | |
// mkdir -p images/post-{content,logos} # for images | |
// mkdir out # for the csv output | |
// composer init -q | |
// composer require voku/simple_html_dom | |
// composer require voku/portable-utf8 | |
// composer require stilliard/csvparser | |
// | |
require_once __DIR__ . '/vendor/autoload.php'; | |
use voku\helper\HtmlDomParser; | |
/**
 * "Dump and die" debugging helper.
 *
 * Passes every argument to var_dump() and then terminates the script.
 *
 * @param mixed ...$args Values to dump.
 * @return void Control never comes back to the caller; the script exits.
 */
function dd(...$args)
{
    var_dump(...$args);

    // die() is the same language construct as exit.
    die();
}
/**
 * Print one log line of the form "[title] <json>" followed by a newline.
 *
 * @param string $title Label printed between the square brackets.
 * @param mixed  $data  Optional payload, JSON-encoded after the label.
 *                      Nothing is appended when it is null (the default).
 * @return void
 */
function debugLog($title, $data = null)
{
    // Test against null instead of truthiness so that falsy but meaningful
    // payloads (0, '0', '', false, []) are still written to the log.
    $payload = $data !== null ? json_encode($data) : '';

    echo "[{$title}] {$payload}\n";
}
/**
 * Read the contents of a URL (or any other location file_get_contents() accepts).
 *
 * @param string $url Location to read from.
 * @return string|false The fetched contents, or false when file_get_contents() fails.
 */
function download($url)
{
    $contents = file_get_contents($url);

    return $contents;
}
/**
 * Download a resource and store it in a file beneath this script's directory.
 *
 * Failures are logged rather than thrown, so one missing resource does not
 * abort the run.
 *
 * @param string $from Source URL, passed to download().
 * @param string $to   Destination path, relative to __DIR__.
 * @return void
 */
function downloadToFile($from, $to)
{
    debugLog('downloading', ['from' => $from, 'to' => $to]);

    $contents = download($from);
    if ($contents === false) {
        // Writing false would cast it to '' and leave a zero-byte file behind,
        // which is indistinguishable from a real (empty) download.
        debugLog('download failed', ['from' => $from]);
        return;
    }

    if (file_put_contents(__DIR__ . '/' . $to, $contents) === false) {
        // e.g. the target directory from the setup notes has not been created.
        debugLog('write failed', ['to' => $to]);
    }
}
/**
 * Convert an array of rows to CSV and write it to a file beneath this
 * script's directory.
 *
 * @param array  $array Rows handed to \CsvParser\Parser::fromArray().
 * @param string $file  Output path, relative to __DIR__.
 * @return void
 */
function writeCsv($array, $file)
{
    $target = __DIR__ . '/' . $file;

    $parser = new \CsvParser\Parser();
    $result = $parser->toFile($parser->fromArray($array), $target);

    // Dump whatever toFile() returned so the outcome is visible on the console.
    var_dump($result);
}
/**
 * Parse an HTML string into a DOM object using voku's HtmlDomParser.
 *
 * @param string $html Raw HTML markup.
 * @return mixed Whatever HtmlDomParser::str_get_html() returns for the markup.
 */
function parseHtml($html)
{
    $dom = HtmlDomParser::str_get_html($html);

    return $dom;
}
/**
 * Download a listing page and extract the posts found on it.
 *
 * @param string $url Address of the listing page.
 * @return array|null The result of findPosts(): a list of post objects, or
 *                    null when the page contains no posts.
 */
function crawlForPosts($url)
{
    debugLog('crawl', $url);

    $html = download($url);

    return findPosts(parseHtml($html));
}
/**
 * Extract every post from a parsed listing page.
 *
 * For each `.news_box` element this downloads the listing image, fetches the
 * linked post page, downloads every image inside `.content-main`, and rewrites
 * those image sources to the local thumbnail path.
 *
 * @param object $dom Parsed listing page, as produced by parseHtml().
 * @return array|null List of objects with the properties link, title, date,
 *                    image and body; null when no `.news_box` element exists.
 */
function findPosts($dom)
{
    $boxes = $dom->find('.news_box');
    $total = $boxes->count();

    if ($total == 0) {
        debugLog('no found posts');
        return null;
    }

    debugLog('found posts', $total);

    $results = [];
    foreach ($boxes as $box) {
        $anchor = $box->findOne('.news_info h3 a');
        debugLog('post', $anchor->href);

        // Image shown next to the post on the listing page.
        $logoSrc = $box->findOne('img')->src;
        $logoPath = 'images/post-logos/' . basename($logoSrc);
        downloadToFile(CRAWL_ORIGIN . $logoSrc, $logoPath);

        // Full post page.
        $postUrl = CRAWL_ORIGIN . $anchor->href;
        $page = parseHtml(download($postUrl));
        $content = $page->find('.content-main');

        // Keep only digits and hyphens from the date label.
        $rawDate = $content->findOne('.news_title span')->innertext;
        $date = preg_replace('/[^\d\-]/', '', $rawDate);

        // Mirror each inline image locally and point its src at the local
        // thumbnail path. This must happen before $content is cast to a
        // string below so the stored body carries the rewritten sources.
        foreach ($page->find('.content-main img') as $img) {
            $localPath = 'images/post-content/' . basename($img->src);
            downloadToFile(CRAWL_ORIGIN . $img->src, $localPath);
            $img->src = '/thumbnail/600x600/userfiles/' . $localPath;
        }

        $results[] = (object) [
            'link' => $postUrl,
            'title' => $anchor->innertext,
            'date' => $date,
            'image' => '/userfiles/' . $logoPath,
            'body' => (string) $content,
        ];
    }

    return $results;
}
// Base URL of the site being scraped; relative links and image paths found in
// the pages are appended to it.
define('CRAWL_ORIGIN', 'http://www.forgemotorsport.asia/');

// Collect the posts from the first three listing pages.
$data = [];
foreach (['news.php', 'news.php?page=2', 'news.php?page=3'] as $listing) {
    // crawlForPosts() returns null for a listing without posts. Fall back to
    // an empty array so array_merge() is never handed a non-array argument
    // (a TypeError on PHP 8, a warning and a null result on PHP 7).
    $data = array_merge($data, crawlForPosts(CRAWL_ORIGIN . $listing) ?? []);
}
var_dump($data);

// Build the rows for the two CSV exports: one blog row and one content block
// per post, linked through the same sequential id (starting at 1).
$blog = [];
$blog_blocks = [];
$i = 1;
foreach ($data as $post) {
    $blog[] = [
        'id' => $i,
        'blog_title' => $post->title,
        'date_added' => $post->date . ' 00:00:00',
        'related_image' => $post->image,
        'status' => 'live',
        'related_id' => '0',
        'is_private_blog' => '',
        'allow_comments' => '',
        'allow_guest_comments' => '',
        'auto_approve_comments' => '',
        'blog_tags' => '',
        'category' => '0',
        'user_id' => '1',
        'email_when_comments_added' => '',
        'extra_field_content_1' => '',
        'extra_field_content_2' => '',
        'featured' => '0',
        'meta_title' => '',
        'meta_keywords' => '',
        'meta_description' => '',
    ];
    $blog_blocks[] = [
        'id' => $i,
        'blog_id' => $i,
        'type' => 'content-block',
        'content' => $post->body,
        'sort_order' => '1',
    ];
    $i++;
}

writeCsv($blog, 'out/blog.csv');
writeCsv($blog_blocks, 'out/blog_blocks.csv');
debugLog('complete');
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment