Skip to content

Instantly share code, notes, and snippets.

@exileed
Last active April 15, 2018 18:38
Show Gist options
  • Save exileed/5f817c2cebe8e46d691778d2419c1b7a to your computer and use it in GitHub Desktop.
Save exileed/5f817c2cebe8e46d691778d2419c1b7a to your computer and use it in GitHub Desktop.
<?php
// Path of the file the generated sitemap is written to.
$sitemap = 'sitemap.xml';

// Crawl entry point; checkUrl() only follows URLs that begin with this prefix.
$startUrl = 'http://php.net';

// Substrings checkUrl() looks for (via strpos) before recording a URL.
$extencions = ['.html', '.php', '/'];

// URLs that checkUrl() has already visited.
$scanned = [];
/**
 * Crawls $url, writes every new same-site link found on the page to the
 * sitemap file, and recurses into each of those links.
 *
 * Relies on the globals $startUrl (crawl prefix), $extencions (substrings a
 * URL must contain to be recorded), $scanned (URLs already visited) and $pf
 * (open handle of the sitemap file).
 *
 * @param string $url Absolute URL of the page to fetch and scan.
 * @return void
 */
function checkUrl($url)
{
    global $startUrl, $extencions, $scanned, $pf;

    if (in_array($url, $scanned, true)) {
        return;
    }
    $scanned[] = $url;

    $html = getUrl($url);
    // getUrl() yields false when the transfer fails; nothing to scan then.
    if (!is_string($html) || $html === '') {
        return;
    }

    if (!preg_match_all('/href="([^"]+)"/', $html, $links)) {
        return;
    }

    foreach ($links[1] as $link) {
        // Drop the fragment and query string so page variants collapse to one URL.
        $parts = preg_split('/[#?]/', $link);
        $nextUrl = is_array($parts) ? $parts[0] : $link;

        // Skip non-crawlable links but keep processing the rest of the page.
        if (strncmp($nextUrl, 'mailto:', 7) === 0 || strncmp($nextUrl, 'ftp://', 6) === 0) {
            continue;
        }

        $isAbsolute = strncmp($nextUrl, 'http://', 7) === 0
            || strncmp($nextUrl, 'https://', 8) === 0;
        $nextUrl = $isAbsolute
            ? checkHost($nextUrl, $startUrl)
            : filterUrls($nextUrl, $startUrl);

        // The helpers return null for links that must not be followed.
        if (!is_string($nextUrl) || $nextUrl === '') {
            continue;
        }

        // Stay inside the crawl prefix and never revisit a page.
        if (strncmp($nextUrl, $startUrl, strlen($startUrl)) !== 0) {
            continue;
        }
        if (in_array($nextUrl, $scanned, true)) {
            continue;
        }

        foreach ($extencions as $ext) {
            if (strpos($nextUrl, $ext) !== false) {
                $loc = htmlspecialchars($nextUrl, ENT_XML1 | ENT_QUOTES, 'UTF-8');
                fwrite($pf, " <url>\n" .
                    " <loc>" . $loc . "</loc>\n" .
                    " </url>\n");
                checkUrl($nextUrl);
                // One sitemap entry per URL, even if several patterns match.
                break;
            }
        }
    }
}
/**
 * Fetches $url with cURL and hands back the result of curl_exec().
 *
 * Redirects are followed, the connect timeout is 5 seconds, and a desktop
 * Chrome user agent is sent. TLS peer and host verification are switched off
 * and cURL verbose output is switched on.
 *
 * @param string $url URL to download.
 * @return string|false Response body (CURLOPT_RETURNTRANSFER is set), or
 *                      false when curl_exec() fails.
 */
function getUrl($url)
{
    $userAgent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36';

    // Option table, applied one by one in the order listed.
    $options = [
        CURLOPT_AUTOREFERER    => true,
        CURLOPT_URL            => $url,
        CURLOPT_USERAGENT      => $userAgent,
        CURLOPT_VERBOSE        => 1,
        CURLOPT_SSL_VERIFYPEER => false,
        CURLOPT_SSL_VERIFYHOST => false,
        CURLOPT_HEADER         => 0,
        CURLOPT_FOLLOWLOCATION => 1,
        CURLOPT_RETURNTRANSFER => 1,
        CURLOPT_CONNECTTIMEOUT => 5,
    ];

    $handle = curl_init();
    foreach ($options as $option => $value) {
        curl_setopt($handle, $option, $value);
    }

    $response = curl_exec($handle);
    curl_close($handle);

    return $response;
}
/**
 * Accepts an absolute URL only when it points at the same host as $startUrl.
 *
 * @param string $currentUrl Absolute URL found on a crawled page.
 * @param string $startUrl   URL whose host defines the site being crawled.
 * @return string|null $currentUrl unchanged when the hosts match, null when
 *                     they differ or either host cannot be parsed.
 */
function checkHost($currentUrl, $startUrl)
{
    $siteHost = parse_url($startUrl, PHP_URL_HOST);
    $linkHost = parse_url($currentUrl, PHP_URL_HOST);

    // parse_url() gives null/false when the host is missing or the URL is malformed.
    if (!is_string($siteHost) || !is_string($linkHost)) {
        return null;
    }

    // Host names are case-insensitive.
    if (strcasecmp($linkHost, $siteHost) !== 0) {
        return null;
    }

    return $currentUrl;
}
/**
 * Turns a non-absolute link into an absolute URL on the site of $startUrl.
 *
 * Relative paths are resolved against the site root, not against the page the
 * link was found on.
 *
 * @param string $currentUrl Link target without an http:// or https:// prefix.
 * @param string $startUrl   URL supplying the scheme and host to prepend.
 * @return string|null Absolute URL, or null when the link carries its own
 *                     scheme (e.g. "javascript:", "tel:") and so cannot be
 *                     mapped onto the site.
 */
function filterUrls($currentUrl, $startUrl)
{
    $parsedUrl = parse_url($startUrl);
    $scheme = $parsedUrl['scheme'];
    $host = $parsedUrl['host'];
    $origin = $scheme . '://' . $host;

    // An empty target (the link was only a fragment or query) maps to the site itself.
    if ($currentUrl === '') {
        return $origin;
    }

    // Protocol-relative link: it names its own host, only the scheme is inherited.
    if (substr($currentUrl, 0, 2) === '//') {
        return $scheme . ':' . $currentUrl;
    }

    // Root-relative path.
    if (substr($currentUrl, 0, 1) === '/') {
        return $origin . $currentUrl;
    }

    // Plain relative path such as "about.html" or "docs/index.php".
    $linkScheme = parse_url($currentUrl, PHP_URL_SCHEME);
    if ($linkScheme === null || $linkScheme === false) {
        return $origin . '/' . $currentUrl;
    }

    // Anything with a scheme of its own is not a page on this site.
    return null;
}
// Entry point: open the sitemap file, write the XML header and the start
// URL, crawl the site (checkUrl appends one <url> entry per page found),
// then close the document.
$pf = fopen($sitemap, "w");
if ($pf === false) {
    // Without a writable file every later fwrite() would fail; stop here.
    throw new RuntimeException("Unable to open {$sitemap} for writing");
}
fwrite($pf, "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" .
    "<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\"\n" .
    "xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\"\n" .
    "xsi:schemaLocation=\"http://www.sitemaps.org/schemas/sitemap/0.9\n" .
    "http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd\">\n" .
    " <url>\n" .
    " <loc>" . htmlspecialchars($startUrl, ENT_XML1 | ENT_QUOTES, 'UTF-8') . "</loc>\n" .
    " </url>\n");
checkUrl($startUrl);
fwrite($pf, "</urlset>\n");
fclose($pf);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment