Java Homework Question
The purpose of this Java homework is to develop an elementary web crawler that explores a part of the Internet in breadth-first order, then uses PageRank to build a simple search engine. More precisely, a spider is a program that automatically ventures out on the Web and analyses documents. Build a search engine by implementing a spider.
Java Homework Solution
Spider
package SpiderLeg;

import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import java.io.*;

public class Spider
{
    private static final int MAX_PAGES_TO_SEARCH = 20;
    private Set<String> pagesVisited = new HashSet<>();
    private List<String> pagesToVisit = new LinkedList<>();

    public void search(String url, String searchWord)
    {
        String FILENAME = "C:\\Users\\ngampala\\Desktop\\Web Crawler\\output.txt";
        try (BufferedWriter bw = new BufferedWriter(new FileWriter(FILENAME)))
        {
            while (this.pagesVisited.size() < MAX_PAGES_TO_SEARCH)
            {
                String currentUrl;
                SpiderLeg leg = new SpiderLeg();
                if (this.pagesToVisit.isEmpty())
                {
                    // First iteration: start from the seed URL.
                    currentUrl = url;
                    this.pagesVisited.add(url);
                }
                else
                {
                    currentUrl = this.nextUrl();
                }
                // The crawl method in SpiderLeg fetches the page and collects its links.
                leg.crawl(currentUrl);
                boolean success = leg.searchForWord(searchWord);
                if (success)
                {
                    bw.write(currentUrl + "\r\n");
                    System.out.println(String.format("Word %s found at %s", searchWord, currentUrl));
                }
                // Newly discovered links go to the back of the queue: breadth-first order.
                this.pagesToVisit.addAll(leg.getLinks());
            }
            System.out.println("\nVisited " + this.pagesVisited.size() + " web page(s)");
        }
        catch (IOException e)
        {
            e.printStackTrace();
        }
    }

    // Returns the next unvisited URL from the front of the breadth-first frontier.
    private String nextUrl()
    {
        String nextUrl;
        do
        {
            nextUrl = this.pagesToVisit.remove(0);
        }
        while (this.pagesVisited.contains(nextUrl));
        this.pagesVisited.add(nextUrl);
        return nextUrl;
    }
}
SpiderLeg
package SpiderLeg;

import java.io.IOException;
import java.util.LinkedList;
import java.util.List;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class SpiderLeg
{
    private static final String USER_AGENT =
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.112 Safari/535.1";
    private List<String> links = new LinkedList<>();
    private Document htmlDocument;

    // Fetches the page at the given URL and collects the absolute URLs of all
    // links on it. Returns true if the crawl succeeded.
    public boolean crawl(String url)
    {
        try
        {
            Connection connection = Jsoup.connect(url).userAgent(USER_AGENT);
            Document htmlDocument = connection.get();
            this.htmlDocument = htmlDocument;
            if (connection.response().statusCode() == 200)
            {
                // 200 is the HTTP OK status code: the page was retrieved successfully.
            }
            if (!connection.response().contentType().contains("text/html"))
            {
                System.out.println("**Failure** Retrieved something other than HTML");
                return false;
            }
            Elements linksOnPage = htmlDocument.select("a[href]");
            for (Element link : linksOnPage)
            {
                this.links.add(link.absUrl("href"));
            }
            return true;
        }
        catch (IOException ioe)
        {
            // The fetch failed (network error, malformed URL, non-OK status, ...).
            return false;
        }
    }

    // Defensive coding: this method should only be used after a successful crawl.
    public boolean searchForWord(String searchWord)
    {
        if (this.htmlDocument == null)
        {
            System.out.println("ERROR! Call crawl() before performing analysis on the document");
            return false;
        }
        String bodyText = this.htmlDocument.body().text();
        return bodyText.toLowerCase().contains(searchWord.toLowerCase());
    }

    public List<String> getLinks()
    {
        return this.links;
    }
}
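Note that SpiderLeg relies on the jsoup HTML parser (org.jsoup), which is not part of the JDK. If the project is built with Maven, a dependency along these lines pulls it in; the version number here is an assumption, and any recent jsoup release should work:

<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.15.3</version>
</dependency>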
SpiderTest
package SpiderLeg;

import java.io.*;

public class SpiderTest
{
    /**
     * This is our test. It creates a spider (which creates spider legs) and
     * crawls the web.
     *
     * @param args - not used
     */
    public static void main(String[] args) throws Exception
    {
        Spider spider = new Spider();
        InputStreamReader r = new InputStreamReader(System.in);
        BufferedReader br = new BufferedReader(r);
        System.out.println("Enter keyword to search");
        String searchWord = br.readLine();
        System.out.println("Enter URL");
        String url = br.readLine();
        System.out.println("Searching.......");
        // spider.search("http://arstechnica.com/", "computer");
        spider.search(url, searchWord);
    }
}
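The assignment also asks for PageRank, which the solution above does not implement. Below is a minimal sketch of how that step could look, assuming the spider is extended to record, for each crawled page, the links it found (for example, a map from each visited URL to the list returned by leg.getLinks()). The PageRank class, its rank method, and the toy graph in main are hypothetical stand-ins for that recorded crawl output, not part of the solution above.

import java.util.*;

public class PageRank
{
    // Runs a fixed number of power-iteration steps with damping factor d.
    // graph maps each page to the list of pages it links to.
    static Map<String, Double> rank(Map<String, List<String>> graph,
                                    double d, int iterations)
    {
        int n = graph.size();
        Map<String, Double> rank = new HashMap<>();
        for (String page : graph.keySet())
        {
            rank.put(page, 1.0 / n); // start from a uniform distribution
        }
        for (int i = 0; i < iterations; i++)
        {
            Map<String, Double> next = new HashMap<>();
            for (String page : graph.keySet())
            {
                next.put(page, (1 - d) / n); // "teleportation" term
            }
            for (Map.Entry<String, List<String>> e : graph.entrySet())
            {
                List<String> outLinks = e.getValue();
                if (outLinks.isEmpty())
                {
                    continue; // dangling page: its rank is dropped (a simplification)
                }
                // Each page shares its current rank equally among its out-links.
                double share = d * rank.get(e.getKey()) / outLinks.size();
                for (String target : outLinks)
                {
                    if (next.containsKey(target)) // ignore links leaving the crawled set
                    {
                        next.put(target, next.get(target) + share);
                    }
                }
            }
            rank = next;
        }
        return rank;
    }

    public static void main(String[] args)
    {
        // Toy link graph; in the homework these edges would come from the crawl.
        Map<String, List<String>> graph = new HashMap<>();
        graph.put("A", Arrays.asList("B", "C"));
        graph.put("B", Arrays.asList("C"));
        graph.put("C", Arrays.asList("A"));
        rank(graph, 0.85, 30).forEach((page, score) ->
            System.out.printf("%s -> %.4f%n", page, score));
    }
}

With the usual damping factor of 0.85, a few dozen iterations are enough for the scores to settle on a small graph. A simple search engine then orders the pages that contain the keyword by their PageRank score instead of printing them in crawl order.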