Hi,

Sorry, I couldn't quite send all of the program in one message. Here is the remainder of the program.

Barry

Rest of code for WebCrawler Program:

// only look at http links
if (urlLink.getProtocol().compareTo("http") != 0)
break;

if (Thread.currentThread() != searchThread)
break;

try
{
// try opening the new URL

URLConnection urlLinkConnection
= urlLink.openConnection();
urlLinkConnection.setAllowUserInteraction(false);

InputStream linkStream = urlLink.openStream();
String strType
= urlLinkConnection.guessContentTypeFromStream(linkStream);
linkStream.close();

// if no other page exists break out of loop
if (strType == null)
break;
if (strType.compareTo("text/html") == 0)
{
// if this URl is not searched already and not on the list to currentl search then do following..
if ((!vectorSearched.contains(strLink)) && (!vectorToSearch.contains(strLink)))
{

// test to make sure it is robot-safe!
if (robotSafe(urlLink))
// now add it to the list of ones we want to search
vectorToSearch.addElement(strLink);
}
}

// if the proper type, add it to the results list
// unless we have already seen it
if (strType.compareTo("text/html") == 0)
{
//vectorMatches.addElement(strLink);
for (int i =0; i < vectorToSearch.size(); i++)
{
System.out.println(i);
}

numberFound++;
if (numberFound >= SEARCH_LIMIT)
break;
// this is where we want to display the list orf URLs found to user



}
} catch (IOException e)
// catch the stream error and display to the user
{
System.out.println("ERROR: couldn't open URL " + strLink);
continue;
}
}
} catch (IOException e)
// catch a stream error in the outer block of code a and disply to user
{
System.out.println("ERROR: couldn't open the URL that you entered");
break;
}
numberSearched++;
if (numberSearched >= SEARCH_LIMIT)
break;
}

if (numberSearched >= SEARCH_LIMIT || numberFound >= SEARCH_LIMIT)
System.out.println("reached search limit of " + SEARCH_LIMIT);
else
System.out.println("");
System.out.println("Your search is completed");
System.out.println("");
System.out.println("<------------------------------------------------------------>");
searchThread = null;
// add this URL to search into the Vector for searching





}

// Method to check whether the website's robots.txt file contains a "Disallow" statement that covers the given URL.

/**
 * Decides whether the given URL may be crawled according to the robots.txt
 * file published by the URL's host.
 *
 * The file is fetched from "http://" + host + "/robots.txt". Every path that
 * follows a DISALLOW marker (a constant defined elsewhere in this class) is
 * compared against the path of the URL; User-agent sections are NOT taken
 * into account, so a Disallow entry written for any agent applies here.
 *
 * Reading stops early if the current thread is no longer searchThread; the
 * part of the file read so far is then still evaluated.
 *
 * @param url the URL that is about to be added to the search list
 * @return false if the robots.txt URL cannot be formed or if the URL's path
 *         starts with a disallowed path; true otherwise, including when
 *         robots.txt cannot be read at all
 */
boolean robotSafe(URL url)
{
    String strHost = url.getHost();

    // form URL of the robots.txt file
    String strRobot = "http://" + strHost + "/robots.txt";
    URL urlRobot;
    try {
        urlRobot = new URL(strRobot);
    } catch (MalformedURLException e) {
        // something weird is happening, so don't trust it
        return false;
    }

    // Accumulate the file in a buffer. An empty robots.txt (first read
    // returns -1) simply leaves the buffer empty instead of crashing in
    // new String(b, 0, -1).
    StringBuffer commands = new StringBuffer();
    try {
        InputStream urlRobotStream = urlRobot.openStream();
        try {
            byte b[] = new byte[1000];
            int numRead;
            while ((numRead = urlRobotStream.read(b)) != -1) {
                commands.append(new String(b, 0, numRead));
                // stop reading once this thread is no longer the active
                // search thread
                if (Thread.currentThread() != searchThread) {
                    break;
                }
            }
        } finally {
            // always release the connection, even when read() fails
            urlRobotStream.close();
        }
    } catch (IOException e) {
        // if there is no robots.txt file, it is OK to search
        return true;
    }
    String strCommands = commands.toString();

    // A URL without a path ("http://host") refers to the root document, so
    // it must be matched by "Disallow: /" as well.
    String strURL = url.getFile();
    if (strURL.length() == 0) {
        strURL = "/";
    }

    int index = 0;
    while ((index = strCommands.indexOf(DISALLOW, index)) != -1) {
        index += DISALLOW.length();

        // Only the remainder of the current line belongs to this entry;
        // otherwise an empty "Disallow:" would pick up the first word of
        // the following line as its path.
        int lineEnd = index;
        while (lineEnd < strCommands.length()
                && strCommands.charAt(lineEnd) != '\n'
                && strCommands.charAt(lineEnd) != '\r') {
            lineEnd++;
        }
        StringTokenizer st =
            new StringTokenizer(strCommands.substring(index, lineEnd));

        // an empty Disallow entry forbids nothing; look at the next one
        if (!st.hasMoreTokens()) {
            continue;
        }

        String strBadPath = st.nextToken();

        // if the URL starts with a disallowed path, it is not safe
        if (strURL.startsWith(strBadPath)) {
            return false;
        }
    }

    // no Disallow entry matched
    return true;
}


// main method
/**
 * Program entry point: configures the HTTP proxy through system properties,
 * then creates a WebCrawlerTest1 and calls its run() method.
 *
 * @param args command-line arguments (not used)
 */
public static void main(String args[])
{
    // Behind a firewall set your proxy and port here!
    // The keys are set directly on the live system properties. Replacing
    // them with a Properties object that only holds the originals as
    // defaults would hide every existing property from Hashtable-style
    // access (get, containsKey, entrySet).
    System.setProperty("http.proxySet", "true");
    System.setProperty("http.proxyHost", "webcache-cup");
    System.setProperty("http.proxyPort", "3128");

    WebCrawlerTest1 wct = new WebCrawlerTest1();
    wct.run();
}


}