-
Re: remainder of WebCrawler Program
Hi,
Sorry, I couldn't quite send all of the program in one message. Here is the remainder of the program.
Barry
Rest of code for WebCrawler Program:
// only look at http links
if (urlLink.getProtocol().compareTo("http") != 0)
break;
if (Thread.currentThread() != searchThread)
break;
try
{
// try opening the new URL
URLConnection urlLinkConnection
= urlLink.openConnection();
urlLinkConnection.setAllowUserInteraction(false);
InputStream linkStream = urlLink.openStream();
String strType
= urlLinkConnection.guessContentTypeFromStream(linkStream);
linkStream.close();
// if no other page exists break out of loop
if (strType == null)
break;
if (strType.compareTo("text/html") == 0)
{
// if this URL has not been searched already and is not on the list still waiting to be searched, do the following
if ((!vectorSearched.contains(strLink)) && (!vectorToSearch.contains(strLink)))
{
// test to make sure it is robot-safe!
if (robotSafe(urlLink))
// now add it to the list of ones we want to search
vectorToSearch.addElement(strLink);
}
}
// if the proper type, add it to the results list
// unless we have already seen it
if (strType.compareTo("text/html") == 0)
{
//vectorMatches.addElement(strLink);
for (int i =0; i < vectorToSearch.size(); i++)
{
System.out.println(i);
}
numberFound++;
if (numberFound >= SEARCH_LIMIT)
break;
// this is where we want to display the list of URLs found to the user
}
} catch (IOException e)
// catch the stream error and display to the user
{
System.out.println("ERROR: couldn't open URL " + strLink);
continue;
}
}
} catch (IOException e)
// catch a stream error in the outer block of code and display it to the user
{
System.out.println("ERROR: couldn't open the URL that you entered");
break;
}
numberSearched++;
if (numberSearched >= SEARCH_LIMIT)
break;
}
if (numberSearched >= SEARCH_LIMIT || numberFound >= SEARCH_LIMIT)
System.out.println("reached search limit of " + SEARCH_LIMIT);
else
System.out.println("");
System.out.println("Your search is completed");
System.out.println("");
System.out.println("<------------------------------------------------------------>");
searchThread = null;
// add this URL to search into the Vector for searching
}
// Checks whether the website's robots.txt file contains a "Disallow" rule that covers the given URL.
/**
 * Reports whether the given URL may be crawled according to its host's robots.txt.
 *
 * <p>Fetches {@code http://<host>/robots.txt} and scans the text for every occurrence of
 * {@code DISALLOW}. The whitespace-delimited token following each occurrence is treated
 * as a forbidden path prefix. Only {@code DISALLOW} occurrences are examined; no other
 * directive in the file is interpreted.
 *
 * <p>Reading stops early if the current thread is no longer {@code searchThread}; whatever
 * was read up to that point is still evaluated.
 *
 * @param url the link that is about to be queued for searching
 * @return {@code false} if the robots.txt URL cannot be formed or the URL's path starts
 *         with a disallowed path; {@code true} otherwise, including when robots.txt
 *         cannot be opened or read
 */
boolean robotSafe(URL url)
{
    String strHost = url.getHost();
    // form URL of the robots.txt file
    String strRobot = "http://" + strHost + "/robots.txt";
    URL urlRobot;
    try {
        urlRobot = new URL(strRobot);
    } catch (MalformedURLException e) {
        // something weird is happening, so don't trust it
        return false;
    }

    StringBuffer commands = new StringBuffer();
    try {
        InputStream urlRobotStream = urlRobot.openStream();
        try {
            byte b[] = new byte[1000];
            int numRead;
            // Test for end-of-stream before building a String: an empty robots.txt
            // makes the very first read() return -1, and new String(b, 0, -1) would throw.
            while ((numRead = urlRobotStream.read(b)) != -1) {
                commands.append(new String(b, 0, numRead));
                // stop downloading once this thread is no longer the active search thread
                if (Thread.currentThread() != searchThread)
                    break;
            }
        } finally {
            // always release the connection, even when read() fails part-way through
            urlRobotStream.close();
        }
    } catch (IOException e) {
        // if there is no (readable) robots.txt file, it is OK to search
        return true;
    }

    String strCommands = commands.toString();
    String strURL = url.getFile();
    // A URL without a path ("http://host") denotes the site root, so it must be
    // matched against rules such as "Disallow: /".
    if (strURL.length() == 0)
        strURL = "/";

    int index = 0;
    while ((index = strCommands.indexOf(DISALLOW, index)) != -1) {
        index += DISALLOW.length();
        StringTokenizer st = new StringTokenizer(strCommands.substring(index));
        // nothing follows the last directive, so there are no more rules to check
        if (!st.hasMoreTokens())
            break;
        String strBadPath = st.nextToken();
        // if the URL starts with a disallowed path, it is not safe
        if (strURL.startsWith(strBadPath))
            return false;
    }
    return true;
}
// main method
/**
 * Program entry point: configures the HTTP proxy, then creates a
 * {@code WebCrawlerTest1} and calls its {@code run()} method.
 *
 * <p>The proxy keys are set directly on the live system properties. Replacing the
 * system properties with a {@code Properties} object that only holds the old values
 * as <em>defaults</em> would hide every existing property from {@code get},
 * {@code keySet}, {@code entrySet} and the other {@code Hashtable} methods.
 *
 * @param args command-line arguments; not used
 */
public static void main(String args[])
{
    /* Behind a firewall set your proxy and port here! */
    // NOTE(review): "http.proxySet" is kept for old JVMs; confirm whether it is still needed.
    System.setProperty("http.proxySet", "true");
    System.setProperty("http.proxyHost", "webcache-cup");
    System.setProperty("http.proxyPort", "3128");

    WebCrawlerTest1 wct = new WebCrawlerTest1();
    wct.run();
}
}
Similar Threads
-
By divagoddess in forum C++
Replies: 5
Last Post: 08-14-2009, 03:12 PM
-
Replies: 0
Last Post: 07-05-2006, 10:31 AM
-
By divagoddess in forum C++
Replies: 12
Last Post: 05-07-2006, 10:55 PM
-
By Gordon Reichhardt in forum VB Classic
Replies: 2
Last Post: 01-08-2002, 10:06 AM
-
By W.Pierce in forum VB Classic
Replies: 1
Last Post: 12-11-2001, 08:28 AM
Posting Permissions
- You may not post new threads
- You may not post replies
- You may not post attachments
- You may not edit your posts
Forum Rules
|
Top DevX Stories
Easy Web Services with SQL Server 2005 HTTP Endpoints
JavaOne 2005: Java Platform Roadmap Focuses on Ease of Development, Sun Focuses on the "Free" in F.O.S.S.
Wed Yourself to UML with the Power of Associations
Microsoft to Add AJAX Capabilities to ASP.NET
IBM's Cloudscape Versus MySQL
|
Bookmarks