Last active
February 22, 2023 10:49
-
-
Save icanhasjonas/8a591e9ea0c955f273687ad888adf56a to your computer and use it in GitHub Desktop.
Web Crawler in C#
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System; | |
using System.Buffers; | |
using System.Collections.Concurrent; | |
using System.Collections.Generic; | |
using System.Globalization; | |
using System.Linq; | |
using System.Net; | |
using System.Net.Http; | |
using System.Threading; | |
using System.Threading.Tasks; | |
using AngleSharp; | |
using AngleSharp.Dom; | |
using AngleSharp.Network; | |
using AngleSharp.Services.Default; | |
using Ansi; | |
using static System.ConsoleColor; | |
using static Ansi.AnsiFormatter; | |
/* | |
* | |
* Using Nuget: | |
* Ansi | |
* AngleSharp | |
* | |
* */ | |
namespace Crawler { | |
public class Downloader { | |
private readonly HttpClient _httpClient; | |
public Downloader( HttpClient httpClient ) | |
{ | |
_httpClient = httpClient; | |
} | |
public async Task<IResponse> DownloadHtmlPage( Uri url, CancellationToken cancellationToken ) | |
{ | |
using( var response = await _httpClient.GetAsync( url, HttpCompletionOption.ResponseHeadersRead, cancellationToken ) ) { | |
if( response.Content.Headers.ContentType.MediaType == "text/html" ) { | |
var text = await response.Content.ReadAsStringAsync(); | |
return VirtualResponse.Create( r => r | |
.Content( text ) | |
.Address( url ) | |
.Status( HttpStatusCode.OK ) ); | |
} | |
} | |
return null; | |
} | |
} | |
public class WebCrawler { | |
private readonly Downloader _downloader; | |
private readonly ConcurrentQueue<Uri> _queue; | |
private readonly ISet<Uri> _seen; | |
private readonly string _validHost; | |
private static readonly IConfiguration _configuration = Configuration.Default | |
.SetCulture( CultureInfo.InvariantCulture ) | |
.WithDefaultLoader( x => { | |
x.IsNavigationEnabled = false; | |
x.IsResourceLoadingEnabled = false; | |
} ) | |
; | |
public WebCrawler( Downloader downloader, Uri url ) | |
{ | |
_downloader = downloader; | |
_queue = new ConcurrentQueue<Uri>(); | |
_seen = new HashSet<Uri>(); | |
_validHost = url.Host; | |
Enqueue( url ); | |
} | |
public async Task Run( int maxTasks = 5, CancellationToken cancellationToken = default ) | |
{ | |
var tasks = new HashSet<Task> { | |
TryProcessQueue( cancellationToken ) | |
}; | |
void AddProcessTask() | |
{ | |
var task = TryProcessQueue( cancellationToken ); | |
if( task != null ) { | |
tasks.Add( task ); | |
} | |
} | |
async Task WaitForTaskCompletion() | |
{ | |
var completedTask = await Task.WhenAny( tasks ); | |
tasks.Remove( completedTask ); | |
} | |
while( tasks.Count > 0 && !cancellationToken.IsCancellationRequested ) { | |
try { | |
await WaitForTaskCompletion(); | |
} | |
catch( TaskCanceledException ) { | |
return; | |
} | |
AddProcessTask(); | |
if( _queue.Count > maxTasks && tasks.Count < maxTasks ) { | |
Console.WriteLine( Colorize( $" * {DarkMagenta}Adding worker{Gray}" ) ); | |
AddProcessTask(); | |
} | |
} | |
} | |
private Task TryProcessQueue( CancellationToken cancellationToken ) | |
{ | |
if( _queue.TryDequeue( out var url ) ) { | |
return ProcessUrl( url, cancellationToken ); | |
} | |
return null; | |
} | |
private void Enqueue( Uri url ) | |
{ | |
if( url.Scheme != "https" || url.Scheme == "http" ) { | |
return; | |
} | |
if( url.Host != _validHost ) { | |
return; | |
} | |
if( _seen.Add( url ) ) { | |
Console.WriteLine( Colorize( $" * {DarkCyan}Found {Cyan}{url}{Gray}" ) ); | |
_queue.Enqueue( url ); | |
} | |
} | |
private async Task ProcessUrl( Uri url, CancellationToken cancellationToken ) | |
{ | |
Console.WriteLine( Colorize( $"{DarkCyan}Downloading {Yellow}{url}{Gray}" ) ); | |
var response = await _downloader.DownloadHtmlPage( url, cancellationToken ); | |
if( response == null ) { | |
return; | |
} | |
var document = await ParseDocument( response, cancellationToken ); | |
foreach( var x in document.All.OfType<IUrlUtilities>().Where( x => !string.IsNullOrEmpty( x.Href ) ) ) { | |
var href = WithoutHash( x.Href ); | |
if( !string.IsNullOrEmpty( href ) ) { | |
Enqueue( new Uri( url, href ) ); | |
} | |
} | |
} | |
private static async Task<IDocument> ParseDocument( IResponse response, CancellationToken cancellationToken ) | |
{ | |
var browsingContext = BrowsingContext.New( _configuration ); | |
var createDocumentOptions = new CreateDocumentOptions( response, _configuration ); | |
var document = await new DocumentFactory().CreateAsync( browsingContext, createDocumentOptions, cancellationToken ); | |
return document; | |
} | |
private static string WithoutHash( string href ) | |
{ | |
var hashIndex = href.IndexOf( '#' ); | |
if( hashIndex != -1 ) { | |
href = href.Substring( 0, hashIndex ); | |
} | |
return href; | |
} | |
} | |
class Program { | |
static Task Main( string[] args ) | |
{ | |
WindowsConsole.TryEnableVirtualTerminalProcessing(); | |
var crawler = new WebCrawler( new Downloader( new HttpClient() ), new Uri( "https://en.wikipedia.org/wiki/Sweden" ) ); | |
return crawler.Run( 5 ); | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment