Hi there,

A web crawler program that I downloaded from the Web and made a few minor adjustments to is not working as it is supposed to. The idea of this program is that a user enters the URL of a web page; the program then recursively analyzes the links contained in that web page and displays a maximum of 50 of the links it finds back to the user.
The problem is that this program is not displaying the links that are contained within the web page back to the user. I'm having a great deal of trouble trying to pinpoint the exact source of the problem.

If someone could pinpoint exactly why it is not working as it is supposed to, I would be very grateful indeed.

Barry.

Here is the code, which I have commented in detail. (I can't fit all of the code in this message.)

import java.io.*;
import java.util.*;
import java.net.*;

public class WebCrawlerTest1
{
// ---- constants ----

// value 50; presumably the cap on reported links (its use is not visible in this excerpt)
private static final int SEARCH_LIMIT = 50;
// NOTE(review): neither string constant is referenced in the visible part of the class;
// DISALLOW looks like a robots.txt directive -- confirm against the rest of the file
private static final String SEARCH = "Search";
private static final String DISALLOW = "Disallow:";

// ---- crawl state ----

// URLs still waiting to be visited; run() takes the element at index 0
private Vector<String> vectorToSearch;
// URLs that run() has already marked as searched
private Vector<String> vectorSearched;
// presumably the links found so far (only cleared in the visible code) -- confirm
private Vector<String> vectorMatches;

// run() keeps crawling only while the current thread is this thread
private Thread searchThread;

// text of the URL currently being processed (first read from the user in run());
// the field name is the same as the java.net.URL type name, but Java resolves
// field names and type names separately, so both can be used side by side
private String URL;

// parsed form of the URL string, built in run()
private URL url;


/**
 * Creates a crawler whose to-search, searched and matches lists all start
 * empty, and switches off user interaction as the default for URL connections.
 */
public WebCrawlerTest1 ()
{
    // URL connections must not prompt the user by default
    URLConnection.setDefaultAllowUserInteraction(false);

    // begin with three empty URL lists
    this.vectorToSearch = new Vector<String>();
    this.vectorSearched = new Vector<String>();
    this.vectorMatches = new Vector<String>();
}

public void run()
{
// useful for keeping track of how many searched and found URLs
int numberSearched = 0;
int numberFound = 0;


boolean error = true;
while(error)
{
error = false;

try
{

// reading in the string (URL) from the user here
InputStreamReader is = new InputStreamReader(System.in);
BufferedReader br = new BufferedReader(is);
System.out.println ("");
System.out.println("<-------------------------------------------------------------------->");
System.out.println("Enter your URL now in the form http://xxx.yyy e.g http://java.sun.com");
URL = br.readLine();

//if user enters nothing, they need to be prompted to enter a URL
if (URL.length() == 0)
{
System.out.println ("");
System.out.println("ERROR: must enter a starting URL");
return;
}

System.out.println ("");
System.out.println ("searching now for " + URL+ " -------> the URL you have entered");
}catch (Exception e)
// catching an error, user must enter a URL
{
error = true;
System.out.println ("");
System.err.println("Sorry an error has occured "+ e.getMessage());
e.printStackTrace();
}


}// while

// while the vector isn't empty and the current thread is equivalent to the search thread
while ((vectorToSearch.size() > 0) && (Thread.currentThread() == searchThread))
{
// get the first element from the list to be searched
URL = (String) vectorToSearch.elementAt(0);

// instantiating the string that the user entered as a URL
try
{
url = new URL(URL);
} catch (MalformedURLException e)
{
System.out.println("ERROR: invalid URL " + URL);

}

// initialize search data structures
vectorToSearch.removeAllElements();
vectorSearched.removeAllElements();
vectorMatches.removeAllElements();



// mark the URL as searched (we want this one way or the other)
vectorToSearch.removeElementAt(0);
vectorSearched.addElement(URL);

try
{
// try opening the URL
URLConnection urlConnection = url.openConnection();

urlConnection.setAllowUserInteraction(false);

// opening the stream for reading
InputStream urlStream = url.openStream();

// calling this method to interpret the format by looking at the stream
String type
= urlConnection.guessContentTypeFromStream(urlStream);

// if stream is empty, break out of loop and return to whilr loop again
if (type == null)
break;
// if the stream type doesn't compare to text or html break out of loop and return to while loop again
if (type.compareTo("text/html") != 0)
break;


// instantiating array of 1000 bytes
byte b[] = new byte[1000];

// converting stream (in HTML) to an int
int numRead = urlStream.read(b);

// converting to a string
String content = new String(b, 0, numRead);

// while the stream when converted is positive do the following actions inside while loop
while (numRead != -1)
{
if (Thread.currentThread() != searchThread)
break;
numRead = urlStream.read(b);
if (numRead != -1)
{
String newContent = new String(b, 0, numRead);
content += newContent;
}
}
// close the stream
urlStream.close();

// break out of the while loop if the current thread is not equal to the search thread
if (Thread.currentThread() != searchThread)
break;

// convert the content of the stream which is in HTML (or should be) to lowercase string
String lowerCaseContent = content.toLowerCase();

int index = 0;

// a while loop with three conditions imposed for looking at a link in the HTML code (i.e. <a href=" ">
// if all these are true then..
while ((index = lowerCaseContent.indexOf("<a", index)) != -1)
{
if ((index = lowerCaseContent.indexOf("href", index)) == -1)
break;
if ((index = lowerCaseContent.indexOf("=", index)) == -1)
break;

if (Thread.currentThread() != searchThread)
break;

// increment index counter
index++;

// want to get the remaining string where the <a href = some value
String remaining = content.substring(index);

// now parsing this value (string) by backspace, newline characters e.t.c
StringTokenizer st = new StringTokenizer(remaining, "\t\n\r\">#");
String strLink = st.nextToken();


URL urlLink;
try
{
// this token of the remaining string is contains the link to another web page
urlLink = new URL(url, strLink);
strLink = urlLink.toString();
} catch (MalformedURLException e)
{
System.out.println("ERROR: bad URL " + strLink);
continue;
}