Monday, May 21, 2007

Java: Example code for a WebCrawler


import java.io.BufferedReader;
import java.io.InputStreamReader;

import java.net.HttpURLConnection;
import java.net.URL;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;


public class WebCrawler {

    public static void main(String[] args) {
        String sProxyHost = args[0];  // proxy host name
        String sProxyPort = args[1];  // proxy port number
        String sSeedUrl = args[2];    // URL of a plain-text file listing the seed links, one per line
        // timeout connection after 500 milliseconds, optional of course
        //System.setProperty("sun.net.client.defaultConnectTimeout", "500");
        //System.setProperty("sun.net.client.defaultReadTimeout", "1000");

        System.setProperty("http.proxySet", "true");
        System.setProperty("http.proxyHost", sProxyHost);
        System.setProperty("http.proxyPort", sProxyPort);

        // existence symbol table of examined web pages
        HashSet<String> st = new HashSet<String>();
        ArrayList<String> crawlerLinksList = getHTMLLines(sSeedUrl);
        for (String link : crawlerLinksList) {
            st.add(link);
        }

        // crawl each of the seed links in turn (discovered links are recorded
        // in the symbol table but not revisited, so this is a single-pass crawl)
        for (String v : crawlerLinksList) {
            System.out.println(v);
            String input = getHTML(v);

            /*************************************************************
             * Find links of the form: http://xxx.yyy.zzz
             * \\w+ for one or more alpha-numeric characters
             * \\. for dot
             * could take the first two statements out of the loop
             *************************************************************/
            String regexp = "http://(\\w+\\.)*(\\w+)";
            Pattern pattern = Pattern.compile(regexp);
            Matcher matcher = pattern.matcher(input);

            // find and print all matches
            while (matcher.find()) {
                String w = matcher.group();  // the whole matched URL
                System.out.println("Matched String=" + w);
                if (!st.contains(w)) {
                    st.add(w);
                }
            }

        }
    }

    /**
     * Download the page at urlToRead and return its HTML as one string.
     */
    public static String getHTML(String urlToRead) {
        URL url;                 // the URL to read
        HttpURLConnection conn;  // the actual connection to the web page
        BufferedReader rd;       // used to read results from the web page
        String line;             // an individual line of the web page HTML
        String result = "";      // a long string containing all the HTML
        try {
            url = new URL(urlToRead);
            conn = (HttpURLConnection) url.openConnection();
            conn.setRequestMethod("GET");
            rd = new BufferedReader(new InputStreamReader(conn.getInputStream()));
            while ((line = rd.readLine()) != null) {
                result += line;
            }
            rd.close();
        } catch (Exception e) {
            // ignore pages that cannot be fetched; an empty string is returned
            //e.printStackTrace();
        }
        return result;
    }

    /**
     * Download the page (or plain-text file) at urlToRead and return its
     * lines as a list, one entry per line.
     */
    public static ArrayList<String> getHTMLLines(String urlToRead) {
        URL url;                 // the URL to read
        HttpURLConnection conn;  // the actual connection to the web page
        BufferedReader rd;       // used to read results from the web page
        String line;             // an individual line of the web page HTML
        ArrayList<String> result = new ArrayList<String>();  // the lines read so far
        try {
            url = new URL(urlToRead);
            conn = (HttpURLConnection) url.openConnection();
            conn.setRequestMethod("GET");
            rd = new BufferedReader(new InputStreamReader(conn.getInputStream()));
            while ((line = rd.readLine()) != null) {
                result.add(line);
            }
            rd.close();
        } catch (Exception e) {
            // ignore sources that cannot be fetched; an empty list is returned
            //e.printStackTrace();
        }
        return result;
    }

}
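
The crawler expects three command-line arguments: the proxy host, the proxy port, and the URL of a plain-text file that lists the seed links, one per line. A hypothetical run, assuming a proxy at proxy.example.com on port 8080 and a seed file served at http://www.example.com/seeds.txt (both are placeholder names, not real hosts), would look like this:

javac WebCrawler.java
java WebCrawler proxy.example.com 8080 http://www.example.com/seeds.txt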

2 comments:

Krista said...

Good for people to know.

ozen green said...

Why is there a problem at line 17? It said:

Exception in thread "main" java.lang.ArrayIndexOutOfBoundsException: 0
at WebCrawler.main(WebCrawler.java:17)
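
This ArrayIndexOutOfBoundsException usually means the program was started without the three command-line arguments it expects (proxy host, proxy port, seed-list URL), so the very first access to args[0] fails. As a suggested sketch rather than part of the original code, a small guard at the top of main would give a clearer message:

if (args.length < 3) {
    System.err.println("Usage: java WebCrawler <proxyHost> <proxyPort> <seedListUrl>");
    return;  // stop before any args[] access can throw
}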