Java – Multi-threaded crawler with ExecutorService

Tags: , ,



I’m working on a crawler in Java. I made a single-threaded crawler that visits a single page and fetches all the links on that page. Now I want to make it multi-threaded, but I’m facing difficulties. At the very beginning I start with a single link and crawl through all the links on that page. Now I want to run an ExecutorService in which each thread starts by fetching a single URL from unvisitedLinks and works on it just as the single-threaded crawler did, with a few more threads doing the same thing. Here is the crawler class I made, which implements Runnable so that it can run on a thread:

import java.io.IOException;
import java.net.URI;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class MyCrawler implements Runnable {
    volatile static int counter =0;
    String originaUrl, currentUrl;
    List<String> unvisitedLinks = new ArrayList<>();
    Set<String> visitedLinks = new HashSet<>();
    URI uri;
    ExecutorService executor = null;
    int pagesVisited = 0;


    public MyCrawler(String url) {
        this.originaUrl = url;
        unvisitedLinks.add(url);
         this.uri = URI.create(url);
    }

    @Override
    public void run() {
        do{
            try{
                executor = Executors.newFixedThreadPool(10);
                String url; 
                synchronized (this) {
                    url = unvisitedLinks.get(0);
                    while (unvisitedLinks.contains(url)) {
                        unvisitedLinks.remove(url);
                    }
                }
                //Visit this page and fetch all the links;
                VisitPage(url);

                visitedLinks.add(url);

                for(int i = 0; i< 10; i++){
                    synchronized (this) {
                        url = unvisitedLinks.get(i);
                        while (unvisitedLinks.contains(url)) {
                            unvisitedLinks.remove(url);
                        }
                    }
                    Runnable worker = new MyCrawler(url);
                    executor.execute(worker);
                }

                executor.shutdown();
                while(!executor.isTerminated()){ //WAIT FOR EXECUTOR TO FINISH

                }
                executor = null;
            }catch(Exception e){
                e.printStackTrace();
            }

        }while(unvisitedLinks.size() != 0);
        System.out.println("total pages visited: " + counter);
        System.out.println("TOTAL LINKS FOUND " + visitedLinks.size());

        for(String s: visitedLinks){
            System.out.println(s + "n");
        }
    }

    private void VisitPage(String url){

        List<String> linksOnthisPage = new ArrayList<>();

        if(!visitedLinks.contains(url)){
            if(!url.contains("javascript") && !url.contains("#")){

                try{
                    Document doc = Jsoup.connect(url).timeout(0).get();
                    Elements linkTags = doc.select("a[href]");

                    for(Element e : linkTags){
                        String link = e.attr("href");
                        if(!visitedLinks.contains(link) && !link.contains("#") && !link.contains("javascript") && !link.equals(url)){
                            if(link.startsWith("http") || link.startsWith("www")){
                                if(link.contains(uri.getHost())){
                                    linksOnthisPage.add(link);
                                }else{
                                    System.out.println("SOME OTHER WEBSITE -- " + link);
                                }

                            }else if(link.startsWith("/")){
                                link = url + link.substring(1, link.length());
                                linksOnthisPage.add(link);
                            }else{
                                System.out.println("LINK IGNORED DUE TO  -- " + url);
                            }
                        }else{
                            System.out.println("LINK IGNORED -- " + url);
                        }
                    }
                    System.out.println("nnLinks found in "" + url+ "" : " + linksOnthisPage.size());
                    unvisitedLinks.addAll(linksOnthisPage);
                    System.out.println("UNVISITED LINKS NOW: " + unvisitedLinks.size());
                }catch(Exception e){
                    System.out.println("EXCEPTION -- " + url);
                    return;
                }
            }else{
                System.out.println("UNWANTED URL -- " + url);
            }
        }else{
            System.out.println("LINK VISITED -- " + url);
        }
    }

}

And here is the main method where I’m submitting the link to start with.

/** Entry point: starts one crawler on its own thread and waits for it to finish. */
public class MainClass {

    /**
     * Starts the crawl at a fixed start URL and blocks until the crawler thread ends.
     *
     * @param args unused
     */
    public static void main(String[] args) {

        Thread t = new Thread(new MyCrawler("http://www.example.com/"));
        t.start();

        try {
            t.join();
            System.out.println("\nFinished all threads\n---------------------------------");
        } catch (InterruptedException e) {
            // Restore the interrupt flag instead of hiding it behind a generic catch.
            Thread.currentThread().interrupt();
            System.out.println("Interrupted while waiting for the crawler to finish");
        }

        System.out.println("DONE!");
    }

}

P.S. There may be a lot of blunders in this code. Please correct me in every way you can.

Answer

I think the Runnable should handle only the URL-visiting part, which means the Runnable class will look something like this:

public class MyCrawler implements Runnable {

    URI uri;



    public MyCrawler(String url) {
         this.uri = URI.create(url);
    }

    @Override
    public void run() {

        try{
            VisitPage(url);

        }catch(Exception e){
            e.printStackTrace();
        }


    }

    private void VisitPage(String url){

        List<String> linksOnthisPage = new ArrayList<>();

        if(!url.contains("javascript") && !url.contains("#")){

            try{
                Document doc = Jsoup.connect(url).timeout(0).get();
                Elements linkTags = doc.select("a[href]");

                for(Element e : linkTags){
                    String link = e.attr("href");
                    if(!link.contains("#") && !link.contains("javascript") && !link.equals(url)){
                        if(link.startsWith("http") || link.startsWith("www")){
                            if(link.contains(uri.getHost())){
                                linksOnthisPage.add(link);
                            }else{
                                System.out.println("SOME OTHER WEBSITE -- " + link);
                            }

                        }else if(link.startsWith("/")){
                            link = url + link.substring(1, link.length());
                            linksOnthisPage.add(link);
                        }else{
                            System.out.println("LINK IGNORED DUE TO  -- " + url);
                        }
                    }else{
                        System.out.println("LINK IGNORED -- " + url);
                    }
                }
                System.out.println("nnLinks found in "" + url+ "" : " + linksOnthisPage.size());

            }catch(Exception e){
                System.out.println("EXCEPTION -- " + url);
                return;
            }
        }else{
            System.out.println("UNWANTED URL -- " + url);
        }
    }

}

Next, loop over the links and add a job to the executor for each URL (you can do this in your main method or in a new class). The code snippet will look like this:

// Submit one crawl job per pending URL; the executor's pool decides how many run at once.
for(String url : unvisitedLinks ){
    Runnable worker = new MyCrawler(url);
    executor.execute(worker);
}


Source: stackoverflow