I made a spider program, but it runs out of memory after ~200 websites.

After that it crashes, or becomes so slow it's useless.

The problem seems to be in this Spider class, which can't be garbage collected (but I don't see any reason why not).

The profilers I have tried show that memory is allocated and can't be garbage collected, but not where. Can anyone suggest what might be wrong with this class, or how they go about fixing memory problems like this?
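To show what I mean by "can't be garbage collected": here is a rough sketch of how I imagine one could test whether a single Spider instance becomes unreachable after it finishes (SpiderGcCheck, the hard-coded ids and the example URL are made up; "report" stands for whatever ISpiderReportable implementation is normally passed in):

import java.lang.ref.WeakReference;
import java.net.URL;

public class SpiderGcCheck {

    // 'report' is whatever ISpiderReportable implementation the spider normally gets.
    static void checkCollectable(ISpiderReportable report) throws Exception {
        Spider spider = new Spider(1, 1, new URL("http://example.com/"), report);
        spider.run();

        // Drop our only strong reference and keep a weak one to watch.
        WeakReference ref = new WeakReference(spider);
        spider = null;

        // Suggest a collection and see whether the weak reference gets cleared.
        System.gc();
        Thread.sleep(1000);

        System.out.println(ref.get() == null
            ? "Spider was collected"
            : "Spider is still strongly reachable from somewhere");
    }
}

In my program the Spider instances should be just as unreachable once run() returns, but according to the profilers the memory is never released.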

peter


import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collection;

import org.htmlparser.Parser;
import org.htmlparser.PrototypicalNodeFactory;
import org.htmlparser.beans.StringBean;
import org.htmlparser.tags.FrameTag;
import org.htmlparser.tags.HeadingTag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.tags.MetaTag;
import org.htmlparser.tags.TitleTag;
import org.htmlparser.util.EncodingChangeException;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;





public class Spider implements Runnable {

private URL base;
private int siteid;
private int companyid;
private int MaxPaginas;

// Workload bookkeeping: queued, processed and failed URLs, plus the paths already seen.
protected Collection workloadPath = new ArrayList(3);
protected Collection workloadError = new ArrayList(3);
protected Collection workloadWaiting = new ArrayList(3);
protected Collection workloadProcessed = new ArrayList(3);
protected ISpiderReportable report;
protected Done done;
protected Parser mParser;

// Per-page text buffers, filled by the tag callbacks below and written to the
// database (and reset) at the end of processURL().
String content = "";
String meta = "";
String titel = "";
String kopjes = "";


static private int count = 0;
private int taskNumber;


public Spider(int DBcompanyid, int DBsiteid, URL DBbase, ISpiderReportable report)
{
base = DBbase;
siteid = DBsiteid;
companyid = DBcompanyid;
MaxPaginas = 20;
this.report = report;

count++;
taskNumber = count;


mParser = new Parser ();


PrototypicalNodeFactory factory = new PrototypicalNodeFactory ();
factory.registerTag (new LocalLinkTag ());
factory.registerTag (new LocalMetaTag ());
factory.registerTag (new LocalFrameTag ());
factory.registerTag (new LocalTitleTag ());
factory.registerTag (new LocalHeadingTag ());
mParser.setNodeFactory (factory);
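// Note: the Local*Tag prototypes registered above are non-static inner classes,
// so they (and, as far as I can tell, the tag nodes the factory clones from
// them) keep an implicit reference back to this Spider instance.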

}

public void run()
{
clear();
report.koppelDB(siteid,companyid);
addURL(base);
begin();
}


public Collection getWorkloadPath()
{
return workloadPath;
}

public Collection getWorkloadError()
{
return workloadError;
}

public Collection getWorkloadWaiting()
{
return workloadWaiting;
}

public Collection getWorkloadProcessed()
{
return workloadProcessed;
}

public void clear()
{
getWorkloadError().clear();
getWorkloadWaiting().clear();
getWorkloadProcessed().clear();
getWorkloadPath().clear();
}

public void addURL(URL url)
{
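// Skip anything already queued, processed, failed, or whose path was seen
// before; otherwise queue the URL and count it against MaxPaginas.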
if ( getWorkloadWaiting().contains(url) )
return;
if ( getWorkloadError().contains(url) )
return;
if ( getWorkloadProcessed().contains(url) )
return;
if ( getWorkloadPath().contains(url.getPath()) )
return;

getWorkloadPath().add(url.getPath());

log("PROCES: " + taskNumber + " Adding to workload: " + url );
getWorkloadWaiting().add(url);
MaxPaginas--;
}


protected void processURL (URL Furl) throws ParserException
{
NodeList Nlist;

getWorkloadWaiting().remove(Furl);
getWorkloadProcessed().add(Furl);

String url = Furl.toString();
StringExtractor se = new StringExtractor (url);
try
{
content = se.extractStrings ();
}
catch (ParserException e)
{
e.printStackTrace ();
}

try
{
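// Parse the page again with mParser so the registered Local*Tag callbacks
// (links, frames, title, headings, meta) run; the StringBean above already
// fetched the page once just for its plain text.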
mParser.setURL (url);


try
{
Nlist = new NodeList ();
for (NodeIterator e = mParser.elements (); e.hasMoreNodes (); )
Nlist.add (e.nextNode ());
}
catch (EncodingChangeException ece)
{
mParser.reset ();
Nlist = new NodeList ();
for (NodeIterator e = mParser.elements (); e.hasMoreNodes (); )
Nlist.add (e.nextNode ());
}
}
catch (ParserException pe)
{
String message;
message = pe.getMessage ();
if ((null != message) && (message.endsWith ("does not contain text")))
{
System.out.println("Is geen text bestand...");
}
else
throw pe;
}

report.writeDB(siteid, Furl.getPath(), content, titel, meta, kopjes);

// Reset the per-page buffers (the fields above, not new local variables) so
// text from this page does not carry over into the next one.
content = "";
meta = "";
titel = "";
kopjes = "";

log("Complete: " + url);
}




class LocalLinkTag extends LinkTag
{
public void doSemanticAction ()
throws
ParserException
{
if(!isHTTPLikeLink())
return;

String link = getLink();

int index = link.indexOf('#');
if (index != -1)
link = link.substring(0, index);

if(MaxPaginas>1)
handleLink(base,link);
else
return;
}
}

class LocalFrameTag extends FrameTag
{
public void doSemanticAction ()
throws
ParserException
{
String link = getFrameLocation ();

if(MaxPaginas>1){
handleLink(base,link);
}

}
}


public class StringExtractor
{
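// Small helper around htmlparser's StringBean: fetches the resource and
// returns its plain text, with links excluded.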
private String resource;


public StringExtractor (String resource)
{
this.resource = resource;
}

public String extractStrings ()
throws
ParserException
{
StringBean sb;

sb = new StringBean ();
sb.setLinks (false);
sb.setURL (resource);

return (sb.getStrings ());
}
}


class LocalTitleTag extends TitleTag
{
public void doSemanticAction ()
throws
ParserException
{
titel = getTitle();
}
}

class LocalHeadingTag extends HeadingTag
{
public void doSemanticAction ()
throws
ParserException
{
kopjes = kopjes + " " + toPlainTextString();
}
}




class LocalMetaTag extends MetaTag
{
public void doSemanticAction ()
throws
ParserException
{
String metaNaam = getMetaTagName();

if (metaNaam != null && (metaNaam.equals("keywords") || metaNaam.equals("description")))
{
meta = meta + " " + getMetaContent();
}
}
}


public void begin()
{
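// Keep going until nothing is waiting; each pass works on a snapshot of the
// waiting set, and processURL() can add new URLs to it via the link callbacks.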
while ( !getWorkloadWaiting().isEmpty()) {
Object list[] = getWorkloadWaiting().toArray();

for ( int i=0;(i<list.length);i++ )
try{
processURL((URL)list[i]);
}catch(ParserException pe){
System.out.println("Parser error:"+pe);
MaxPaginas++;
}
}
}


protected void handleLink(URL base,String str)
{
try {
URL url = new URL(base,str);
if ( report.spiderFoundURL(base,url)){
addURL(url);
}
} catch ( MalformedURLException e ) { /* ignore malformed links */ }
}



public void log(String entry)
{
System.out.println(entry );
}


}





It must have something to do with the inner classes and processURL, since the only thing I changed in this code was replacing the Swing HTML parser with htmlparser.
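To illustrate why I suspect the inner classes: as far as I understand it, every instance of a non-static inner class carries a hidden reference to the object that created it, so any Local*Tag node that stays reachable also keeps the whole Spider (with all its workload collections) reachable. A made-up sketch of that mechanism, nothing from my real code:

public class Outer {
    private byte[] bigBuffer = new byte[10 * 1024 * 1024]; // stands in for the Spider's state

    class Inner { } // non-static: each instance holds a hidden Outer.this reference

    Inner makeInner() {
        return new Inner();
    }

    public static void main(String[] args) {
        Outer.Inner kept = new Outer().makeInner();
        // As long as 'kept' is reachable, the Outer instance and its 10 MB buffer
        // cannot be collected, even though we never touch the Outer again.
        System.out.println(kept);
    }
}

So if the parser (or anything else) hangs on to those tag nodes, that could explain why the whole Spider stays in memory, but I can't prove it.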

I don't expect anyone to point out exactly what is wrong, but suggestions for tools or approaches to track this down would be very welcome.

Thanks.