Adan Salman, 5th Aug 2020, 10:39 am
Java Homework Question
The purpose of this Java homework is to develop an elementary web crawler that explores part of the web in breadth-first order, and then to use PageRank to build a simple search engine. More precisely, a spider is a program that automatically ventures out on the web and analyses documents. Build a search engine by implementing a spider.
Java Homework Solution
Spider
package SpiderLeg;

import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import java.io.*;

public class Spider
{
    private static final int MAX_PAGES_TO_SEARCH = 20;
    private Set<String> pagesVisited = new HashSet<>();
    private List<String> pagesToVisit = new LinkedList<>();

    public void search(String url, String searchWord)
    {
        // URLs containing the search word are written here; adjust the path as needed.
        final String FILENAME = "output.txt";
        try (BufferedWriter bw = new BufferedWriter(new FileWriter(FILENAME)))
        {
            this.pagesToVisit.add(url); // seed the frontier with the start URL
            while (this.pagesVisited.size() < MAX_PAGES_TO_SEARCH)
            {
                String currentUrl = this.nextUrl();
                if (currentUrl == null)
                {
                    break; // frontier exhausted before reaching the page limit
                }
                SpiderLeg leg = new SpiderLeg();
                leg.crawl(currentUrl); // Lots of stuff happening here. Look at the crawl method in SpiderLeg.
                boolean success = leg.searchForWord(searchWord);
                if (success)
                {
                    bw.write(currentUrl + "\r\n");
                    System.out.println(String.format("Word %s found at %s", searchWord, currentUrl));
                }
                this.pagesToVisit.addAll(leg.getLinks());
            }
            System.out.println("\nVisited " + this.pagesVisited.size() + " web page(s)");
        }
        catch (IOException e)
        {
            e.printStackTrace();
        }
    }

    // Consuming the frontier from the front (FIFO) is what makes the traversal
    // breadth-first. Returns null when every discovered URL has been visited.
    private String nextUrl()
    {
        String nextUrl;
        do
        {
            if (this.pagesToVisit.isEmpty())
            {
                return null;
            }
            nextUrl = this.pagesToVisit.remove(0);
        } while (this.pagesVisited.contains(nextUrl));
        this.pagesVisited.add(nextUrl);
        return nextUrl;
    }
}
SpiderLeg
package SpiderLeg;

import java.io.IOException;
import java.util.LinkedList;
import java.util.List;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class SpiderLeg
{
    private static final String USER_AGENT =
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.112 Safari/535.1";
    private List<String> links = new LinkedList<>();
    private Document htmlDocument;

    public boolean crawl(String url)
    {
        try
        {
            Connection connection = Jsoup.connect(url).userAgent(USER_AGENT);
            Document htmlDocument = connection.get();
            this.htmlDocument = htmlDocument;
            if (connection.response().statusCode() == 200)
            {
                // 200 is the HTTP OK status code, indicating that everything is great.
                //System.out.println("\npresent URL visiting " + url);
            }
            if (!connection.response().contentType().contains("text/html"))
            {
                System.out.println("**Failure** Retrieved something other than HTML");
                return false;
            }
            // Collect every hyperlink on the page as an absolute URL.
            Elements linksOnPage = htmlDocument.select("a[href]");
            //System.out.println("Found (" + linksOnPage.size() + ") links");
            for (Element link : linksOnPage)
            {
                this.links.add(link.absUrl("href"));
            }
            return true;
        }
        catch (IOException ioe)
        {
            // The fetch failed (network error or non-OK HTTP status).
            return false;
        }
    }

    public boolean searchForWord(String searchWord)
    {
        // Defensive coding. This method should only be used after a successful crawl.
        if (this.htmlDocument == null)
        {
            System.out.println("ERROR! Call crawl() before performing analysis on the document");
            return false;
        }
        //System.out.println("Searching for the word " + searchWord + "...");
        String bodyText = this.htmlDocument.body().text();
        return bodyText.toLowerCase().contains(searchWord.toLowerCase());
    }

    public List<String> getLinks()
    {
        return this.links;
    }
}
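For reference, here is a minimal sketch (not part of the original solution) showing how SpiderLeg can be exercised on its own. It assumes the jsoup library (org.jsoup:jsoup) is on the classpath; the demo class name, URL, and search word are illustrative placeholders.

package SpiderLeg;

// Illustrative sketch only: exercises SpiderLeg in isolation.
public class SpiderLegDemo
{
    public static void main(String[] args)
    {
        SpiderLeg leg = new SpiderLeg();
        // crawl() must succeed before searchForWord() gives a meaningful answer.
        if (leg.crawl("https://example.com/"))
        {
            System.out.println("Word found: " + leg.searchForWord("domain"));
            System.out.println("Links collected: " + leg.getLinks().size());
        }
    }
}

This crawl-then-search contract is why searchForWord guards against a null htmlDocument.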
SpiderTest
package SpiderLeg;

import java.io.*;

public class SpiderTest
{
    /**
     * This is our test. It creates a spider (which creates spider legs) and crawls the web.
     *
     * @param args - not used
     */
    public static void main(String[] args) throws Exception
    {
        Spider spider = new Spider();
        InputStreamReader r = new InputStreamReader(System.in);
        BufferedReader br = new BufferedReader(r);
        System.out.println("Enter keyword to search");
        String searchWord = br.readLine();
        System.out.println("Enter URL");
        String url = br.readLine();
        System.out.println("Searching .......");
        // spider.search("http://arstechnica.com/", "computer");
        spider.search(url, searchWord);
    }
}
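The question also asks for PageRank, which the crawler above does not implement. Below is a minimal PageRank sketch, under the assumption that the crawl is extended to record, for each visited page, the set of outgoing links it found (the graph parameter and everything in this class are illustrative additions, not part of the original solution; the damping factor 0.85 and 20 iterations are conventional defaults).

package SpiderLeg;

import java.util.HashMap;
import java.util.Map;
import java.util.Set;

// Illustrative PageRank sketch. graph maps each crawled page to the set of
// outgoing links that point back into the crawled set.
public class PageRank
{
    private static final double DAMPING = 0.85;
    private static final int ITERATIONS = 20;

    public static Map<String, Double> rank(Map<String, Set<String>> graph)
    {
        int n = graph.size();
        Map<String, Double> ranks = new HashMap<>();
        for (String page : graph.keySet())
        {
            ranks.put(page, 1.0 / n); // start from a uniform distribution
        }
        for (int i = 0; i < ITERATIONS; i++)
        {
            Map<String, Double> next = new HashMap<>();
            for (String page : graph.keySet())
            {
                next.put(page, (1 - DAMPING) / n); // random-jump term
            }
            for (Map.Entry<String, Set<String>> entry : graph.entrySet())
            {
                Set<String> outLinks = entry.getValue();
                if (outLinks.isEmpty())
                {
                    continue; // dangling page: its rank mass is dropped in this sketch
                }
                double share = ranks.get(entry.getKey()) / outLinks.size();
                for (String target : outLinks)
                {
                    // Only distribute rank to pages that were actually crawled.
                    if (next.containsKey(target))
                    {
                        next.put(target, next.get(target) + DAMPING * share);
                    }
                }
            }
            ranks = next;
        }
        return ranks;
    }
}

A simple search engine built on this would crawl first, build the link graph, run rank once, and then report the pages that match the keyword in descending score order instead of in crawl order.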