Skip to content

Instantly share code, notes, and snippets.

@Jatin-8898
Created October 2, 2018 16:15
Show Gist options
  • Save Jatin-8898/f6cd1cf7b0238f041f0ad7a8df9df567 to your computer and use it in GitHub Desktop.
Save Jatin-8898/f6cd1cf7b0238f041f0ad7a8df9df567 to your computer and use it in GitHub Desktop.
WebCrawler using Java
//package com.webCrawler;
import java.util.*;
import java.net.*;
import java.io.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Minimal breadth-first web crawler.
 *
 * <p>Starting from a root URL, the crawler downloads each page, extracts every
 * {@code http://host} / {@code https://host} prefix found in the text, prints
 * each newly seen site to standard output and schedules it for a later visit.
 * Only the scheme and host are kept, so every site is visited at most once,
 * regardless of how many distinct pages link to it.
 *
 * <p>The crawl has no page limit: it ends only when no undiscovered sites
 * remain. Instances are not safe for use by multiple threads.
 */
public class WebCrawler {

    /**
     * Matches the scheme-and-host prefix of an absolute link. Host labels may
     * contain hyphens (for example {@code my-site.com}); paths, ports and query
     * strings are deliberately not part of the match.
     */
    private static final Pattern URL_PATTERN =
            Pattern.compile("https?://([\\w-]+\\.)*([\\w-]+)");

    /** Maximum time to wait for a connection to be established, in milliseconds. */
    private static final int CONNECT_TIMEOUT_MS = 10_000;

    /** Maximum time to wait for data once connected, in milliseconds. */
    private static final int READ_TIMEOUT_MS = 10_000;

    /** Sites that have been discovered but not downloaded yet (FIFO order). */
    private final Queue<String> queue;

    /** Every site seen so far; a set gives constant-time duplicate detection. */
    private final Set<String> discoveredWebsites;

    /** Creates a crawler with an empty frontier and no discovered sites. */
    public WebCrawler() {
        this.queue = new ArrayDeque<>();
        this.discoveredWebsites = new HashSet<>();
    }

    /**
     * Crawls the web breadth-first, starting at {@code root}.
     *
     * <p>Each newly discovered site is reported on standard output as
     * {@code "Website has been added <url>"}. Pages that cannot be downloaded
     * are reported on standard error and skipped.
     *
     * @param root absolute URL of the first page to download; a {@code null}
     *             root is reported on standard error and nothing is crawled
     */
    public void discoverWeb(String root) {
        if (root == null) {
            // ArrayDeque rejects null elements; there is nothing to crawl anyway.
            System.err.println("Cannot crawl a null root URL");
            return;
        }

        queue.add(root);
        discoveredWebsites.add(root);

        while (!queue.isEmpty()) {
            String current = queue.remove();
            String rawHtml = readURL(current);

            Matcher matcher = URL_PATTERN.matcher(rawHtml);
            while (matcher.find()) {
                String actualUrl = matcher.group();
                // Set.add returns false for a duplicate, so a single call both
                // tests for and records the site.
                if (discoveredWebsites.add(actualUrl)) {
                    System.out.println("Website has been added " + actualUrl);
                    queue.add(actualUrl);
                }
            }
        }
    }

    /**
     * Downloads the resource at {@code v} and returns its text.
     *
     * <p>Lines are joined with {@code '\n'} so that a URL ending one line can
     * never merge with the first word of the next line. Failures are reported
     * on standard error and are never propagated: one unreachable or malformed
     * link must not abort the whole crawl.
     *
     * @param v the URL to download
     * @return the text read so far; empty if nothing could be read
     */
    private String readURL(String v) {
        StringBuilder rawHtml = new StringBuilder();
        try {
            URLConnection connection = new URL(v).openConnection();
            // Without timeouts a single unresponsive host blocks the crawl forever.
            connection.setConnectTimeout(CONNECT_TIMEOUT_MS);
            connection.setReadTimeout(READ_TIMEOUT_MS);

            // try-with-resources closes the stream even when a read fails midway.
            try (BufferedReader in = new BufferedReader(new InputStreamReader(
                    connection.getInputStream(),
                    java.nio.charset.StandardCharsets.UTF_8))) {
                String inputLine;
                while ((inputLine = in.readLine()) != null) {
                    rawHtml.append(inputLine).append('\n');
                }
            }
        } catch (IOException | RuntimeException e) {
            // Best effort: protocol handlers may also throw unchecked exceptions
            // for odd hosts, so both kinds are reported and skipped.
            System.err.println("Could not read " + v + ": " + e);
        }
        return rawHtml.toString();
    }

    /**
     * Starts a crawl from a hard-coded root URL.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        WebCrawler crawler = new WebCrawler();
        // Pass the address of the website you want to crawl.
        String rootUrl = "http://www.geeksforgeeks.com";
        crawler.discoverWeb(rootUrl);
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment