-
Java heap memory errors..
I make a spider program, but it runs out of memory after ~200 websites.
after that it crashes.. or becomes so slow its useless.
The problem is in this Spider class , that cant be garbage collected (But I dont see any reason why not)
These profilers i have seen show you that memory is allocated and cant be garbage collected , but not where ? Can anyone give suggestions what can be wrong with this class or tell how they fix memory problems?
peter
import java.util.*;
import java.net.*;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintWriter;
import org.htmlparser.util.ParserException;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.PrototypicalNodeFactory;
import org.htmlparser.tags.BaseHrefTag;
import org.htmlparser.tags.FrameTag;
import org.htmlparser.tags.TitleTag;
import org.htmlparser.tags.HeadingTag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.tags.MetaTag;
import org.htmlparser.util.EncodingChangeException;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.beans.StringBean;
public class Spider implements Runnable {
private URL base;
private int siteid;
private int companyid;
private int MaxPaginas;
protected Collection workloadPath = new ArrayList(3);
protected Collection workloadError = new ArrayList(3);
protected Collection workloadWaiting = new ArrayList(3);
protected Collection workloadProcessed = new ArrayList(3);
protected ISpiderReportable report;
protected Done done;
protected Parser mParser;
String content = "";
String meta = "";
String titel = "";
String kopjes = "";
static private int count = 0;
private int taskNumber;
public Spider(int DBcompanyid, int DBsiteid, URL DBbase, ISpiderReportable report)
{
base = DBbase;
siteid = DBsiteid;
companyid = DBcompanyid;
MaxPaginas = 20;
this.report = report;
count++;
taskNumber = count;
mParser = new Parser ();
PrototypicalNodeFactory factory = new PrototypicalNodeFactory ();
factory.registerTag (new LocalLinkTag ());
factory.registerTag (new LocalMetaTag ());
factory.registerTag (new LocalFrameTag ());
factory.registerTag (new LocalTitleTag ());
factory.registerTag (new LocalHeadingTag ());
mParser.setNodeFactory (factory);
}
public void run()
{
clear();
report.koppelDB(siteid,companyid);
addURL(base);
begin();
}
public Collection getWorkloadPath()
{
return workloadPath;
}
public Collection getWorkloadError()
{
return workloadError;
}
public Collection getWorkloadWaiting()
{
return workloadWaiting;
}
public Collection getWorkloadProcessed()
{
return workloadProcessed;
}
public void clear()
{
getWorkloadError().clear();
getWorkloadWaiting().clear();
getWorkloadProcessed().clear();
getWorkloadPath().clear();
}
public void addURL(URL url)
{
if ( getWorkloadWaiting().contains(url) )
return;
if ( getWorkloadError().contains(url) )
return;
if ( getWorkloadProcessed().contains(url) )
return;
if ( getWorkloadPath().contains(url.getPath()) )
return;
getWorkloadPath().add(url.getPath());
log("PROCES: " + taskNumber + " Adding to workload: " + url );
getWorkloadWaiting().add(url);
MaxPaginas--;
}
protected void processURL (URL Furl) throws ParserException
{
NodeList Nlist;
getWorkloadWaiting().remove(Furl);
getWorkloadProcessed().add(Furl);
String url = Furl.toString();
StringExtractor se = new StringExtractor (url);
try
{
content = se.extractStrings ();
}
catch (ParserException e)
{
e.printStackTrace ();
}
try
{
mParser.setURL (url);
try
{
Nlist = new NodeList ();
for (NodeIterator e = mParser.elements (); e.hasMoreNodes (); )
Nlist.add (e.nextNode ());
}
catch (EncodingChangeException ece)
{
mParser.reset ();
Nlist = new NodeList ();
for (NodeIterator e = mParser.elements (); e.hasMoreNodes (); )
Nlist.add (e.nextNode ());
}
}
catch (ParserException pe)
{
String message;
message = pe.getMessage ();
if ((null != message) && (message.endsWith ("does not contain text")))
{
System.out.println("Is geen text bestand...");
}
else
throw pe;
}
report.writeDB(siteid,(String)Furl.getPath(),content, titel, meta, kopjes);
String content = "";
String meta = "";
String titel = "";
String kopjes = "";
log("Complete: " + url);
}
class LocalLinkTag extends LinkTag
{
public void doSemanticAction ()
throws
ParserException
{
if(!isHTTPLikeLink())
return;
String link = getLink();
int index = link.indexOf('#');
if (index != -1)
link = link.substring(0, index);
if(MaxPaginas>1)
handleLink(base,link);
else
return;
}
}
class LocalFrameTag extends FrameTag
{
public void doSemanticAction ()
throws
ParserException
{
String link = getFrameLocation ();
if(MaxPaginas>1){
handleLink(base,link);
}
}
}
public class StringExtractor
{
private String resource;
public StringExtractor (String resource)
{
this.resource = resource;
}
public String extractStrings ()
throws
ParserException
{
StringBean sb;
sb = new StringBean ();
sb.setLinks (false);
sb.setURL (resource);
return (sb.getStrings ());
}
}
class LocalTitleTag extends TitleTag
{
public void doSemanticAction ()
throws
ParserException
{
titel = getTitle();
}
}
class LocalHeadingTag extends HeadingTag
{
public void doSemanticAction ()
throws
ParserException
{
kopjes = kopjes + " " + toPlainTextString();
}
}
class LocalMetaTag extends MetaTag
{
public void doSemanticAction ()
throws
ParserException
{
String metaNaam = null;
metaNaam = getMetaTagName();
if(metaNaam!=null)
if(metaNaam.equals("keywords") || metaNaam.equals("description"))
{
meta = meta + " " + getMetaContent();
}
}
}
public void begin()
{
while ( !getWorkloadWaiting().isEmpty()) {
Object list[] = getWorkloadWaiting().toArray();
for ( int i=0;(i<list.length);i++ )
try{
processURL((URL)list[i]);
}catch(ParserException pe){
System.out.println("Parser error:"+pe);
MaxPaginas++;
}
}
}
protected void handleLink(URL base,String str)
{
try {
URL url = new URL(base,str);
if ( report.spiderFoundURL(base,url)){
addURL(url);
}
} catch ( MalformedURLException e ) {}
}
public void log(String entry)
{
System.out.println(entry );
}
}
It must have something to do with the inner classes and processURL since i replaced only this code Swing parser with htmlparser.
I dont expect anyone to exactly say this is wrong or that, but maybe some tools and suggestions how to solve would be very welcome.
thanks>
Similar Threads
-
Replies: 2
Last Post: 06-14-2006, 03:16 PM
-
By Glen Kunene in forum Talk to the Editors
Replies: 17
Last Post: 03-23-2002, 12:43 AM
-
By Brad O'Hearne in forum Talk to the Editors
Replies: 2
Last Post: 11-05-2001, 09:32 AM
-
By Tim Romano in forum Talk to the Editors
Replies: 1
Last Post: 07-27-2001, 08:05 AM
-
By Keith Franklin, MCSD in forum java.announcements
Replies: 0
Last Post: 08-18-2000, 06:37 PM
Posting Permissions
- You may not post new threads
- You may not post replies
- You may not post attachments
- You may not edit your posts
Forum Rules
|
Top DevX Stories
Easy Web Services with SQL Server 2005 HTTP Endpoints
JavaOne 2005: Java Platform Roadmap Focuses on Ease of Development, Sun Focuses on the "Free" in F.O.S.S.
Wed Yourself to UML with the Power of Associations
Microsoft to Add AJAX Capabilities to ASP.NET
IBM's Cloudscape Versus MySQL
|
Bookmarks