Java — Getting many out-of-memory errors when trying to run my web crawler for a few days
I am developing a web crawler application. When I run the program, I get the error messages below:
I get these errors after the program has been running for more than 3 hours. I tried to allocate more memory by changing eclipse.ini to set 2048 MB of RAM, as answered in this topic, but I still get the same errors after 3 hours or less. I need to run the program non-stop for 2–3 days to analyse the results.
Can you tell me what I am missing here? The errors are listed below.
Here are the classes:
seeds.txt
http://www.stanford.edu http://www.archive.org
webcrawler.java
package pkg.crawler; import java.io.bufferedwriter; import java.io.file; import java.io.filewriter; import java.io.ioexception; import java.io.printwriter; import java.net.malformedurlexception; import java.net.sockettimeoutexception; import java.util.*; import java.util.concurrent.executorservice; import java.util.concurrent.priorityblockingqueue; import java.util.concurrent.timeunit; import org.jsoup.httpstatusexception; import org.jsoup.unsupportedmimetypeexception; import org.joda.time.datetime; public class webcrawler { public static queue <linknodelight> queue = new priorityblockingqueue <> (); // priority queue public static final int n_threads = 5; // amount of threads private static set<string> processed = new linkedhashset <> (); // set of processed urls private printwriter out; // output file private printwriter err; // error file private static integer cntintra = new integer (0); // counters intra- links in queue private static integer cntinter = new integer (0); // counters inter- links in queue private static integer dub = new integer (0); // amount of skipped urls public static void main(string[] args) throws exception { system.out.println("running web crawler: " + new date()); webcrawler webcrawler = new webcrawler(); webcrawler.createfiles(); try (scanner in = new scanner(new file ("seeds.txt"))) { while (in.hasnext()) { webcrawler.enque(new linknode (in.nextline().trim())); } } catch (ioexception e) { e.printstacktrace(); return; } webcrawler.processqueue(); webcrawler.out.close(); webcrawler.err.close(); } public void processqueue(){ /* run in threads */ runnable r = new runnable() { @override public void run() { /* queue may empty process not finished, that's why need check if links being processed */ while (true) { linknode link = deque(); if (link == null) continue; link.setstarttime(new datetime()); boolean process = processlink(link); link.setendtime(new datetime()); if (!process) continue; /* print data csv file */ if (link.getstatus() != 
null && link.getstatus().equals(linknodestatus.ok)) { synchronized(out) { out.println(getoutputline(link)); out.flush(); } } else { synchronized(err) { err.println(getoutputline(link)); err.flush(); } } } } }; /* run n_threads threads perform dequeue , process */ linkedlist <thread> threads = new linkedlist <> (); (int = 0; < n_threads; i++) { threads.add(new thread(r)); threads.getlast().start(); } (thread thread : threads) { try { thread.join(); } catch (interruptedexception e) { e.printstacktrace(); } } } /* returns true if link processed */ private boolean processlink(linknode inputlink) { string url = geturlgeneralform(inputlink); boolean process = true; synchronized (processed) { if (processed.contains(url)) { process = false; synchronized (dub) {dub++;} } else processed.add(url); } /* start processing if url have not been processed yet or not being processed */ if (process) { system.out.println("processing url " + url); list<linknodelight> outputlinks = parseandwieghtresults(inputlink); (linknodelight outputlink : outputlinks) { string geturlgeneralforumoutput = geturlgeneralform(outputlink); /* add new link queue if has not been processed yet */ process = true; synchronized (processed) { if (processed.contains(geturlgeneralforumoutput)) { process = false; synchronized (dub) {dub++;} } } if (process) { enque(outputlink); } } return true; } return false; } void enque(linknodelight link){ link.setenquetime(new datetime()); /* add method requires implicit priority */ synchronized (queue) { if (link.interlinks) synchronized (cntinter) {cntinter++;} else synchronized (cntintra) {cntintra++;} //queue.add(link, 100 - (int)(link.getweight() * 100.f)); queue.add(link); } } /** * picks element queue * @return top element queue or null if queue empty */ linknode deque(){ /* link must checked */ linknode link = null; synchronized (queue) { link = (linknode) queue.poll(); if (link != null) { link.setdequetime(new datetime()); if (link.isinterlinks()) synchronized 
(cntinter) {cntinter--;} else synchronized (cntintra) {cntintra--;} } } return link; } private void createfiles() { /* create output file */ try { out = new printwriter(new bufferedwriter(new filewriter("crawledurls.csv", false))); out.println(generateheaderfile()); } catch (ioexception e) { system.err.println(e); } /* create error file */ try { err = new printwriter(new bufferedwriter(new filewriter("crawledurlserror.csv", false))); err.println(generateheaderfile()); } catch (ioexception e) { system.err.println(e); } } /** * formats string can valid entry in csv file * @param s * @return */ private static string format(string s) { // replace " "" string ret = s.replaceall("\"", "\"\""); // put string quotes return "\"" + ret + "\""; } /** * creates line needs written in outputfile * @param link * @return */ public static string getoutputline(linknode link){ stringbuilder builder = new stringbuilder(); builder.append(link.getparentlink()!=null ? format(link.getparentlink().geturl()) : ""); builder.append(","); builder.append(link.getparentlink()!=null ? link.getparentlink().getipadress() : ""); builder.append(","); builder.append(link.getparentlink()!=null ? 
link.getparentlink().linkprocessingduration() : ""); builder.append(","); builder.append(format(link.geturl())); builder.append(","); builder.append(link.getdomain()); builder.append(","); builder.append(link.isinterlinks()); builder.append(","); builder.append(util.formatdate(link.getenquetime())); builder.append(","); builder.append(util.formatdate(link.getdequetime())); builder.append(","); builder.append(link.waitinginqueue()); builder.append(","); builder.append(queue.size()); /* inter , intra links in queue */ builder.append(","); builder.append(cntintra.tostring()); builder.append(","); builder.append(cntinter.tostring()); builder.append(","); builder.append(dub); builder.append(","); builder.append(new date ()); /* url size*/ builder.append(","); builder.append(link.getsize()); /* html file builder.append(","); builder.append(link.getfilename());*/ /* add http error */ builder.append(","); if (link.getparseexception() != null) { if (link.getparseexception() instanceof httpstatusexception) builder.append(((httpstatusexception) link.getparseexception()).getstatuscode()); if (link.getparseexception() instanceof sockettimeoutexception) builder.append("time out"); if (link.getparseexception() instanceof malformedurlexception) builder.append("url not valid"); if (link.getparseexception() instanceof unsupportedmimetypeexception) builder.append("unsupported mime type: " + ((unsupportedmimetypeexception)link.getparseexception()).getmimetype()); } return builder.tostring(); } /** * generates header file * @param link * @return */ private string generateheaderfile(){ stringbuilder builder = new stringbuilder(); builder.append("seed url"); builder.append(","); builder.append("seed ip"); builder.append(","); builder.append("process duration"); builder.append(","); builder.append("link url"); builder.append(","); builder.append("link domain"); builder.append(","); builder.append("link ip"); builder.append(","); builder.append("enque time"); builder.append(","); 
builder.append("deque time"); builder.append(","); builder.append("waiting in queue"); builder.append(","); builder.append("queuesize"); builder.append(","); builder.append("intra in queue"); builder.append(","); builder.append("inter in queue"); builder.append(","); builder.append("dublications skipped"); /* time printed, no header */ builder.append(","); builder.append("time"); /* url size*/ builder.append(","); builder.append("size bytes"); /* http errors */ builder.append(","); builder.append("http error"); return builder.tostring(); } string geturlgeneralform(linknodelight link){ string url = link.geturl(); if (url.endswith("/")){ url = url.substring(0, url.length() - 1); } return url; } private list<linknodelight> parseandwieghtresults(linknode inputlink) { list<linknodelight> outputlinks = htmlparser.parse(inputlink); if (inputlink.hasparseexception()) { return outputlinks; } else { return urlweight.weight(inputlink, outputlinks); } } }
HtmlParser.java
package pkg.crawler; import org.jsoup.connection; import org.jsoup.jsoup; import org.jsoup.nodes.document; import org.jsoup.nodes.element; import org.jsoup.select.elements; import java.io.bufferedwriter; import java.io.file; import java.io.fileoutputstream; import java.io.filewriter; import java.io.ioexception; import java.io.outputstreamwriter; import java.io.printwriter; import java.io.writer; import java.math.biginteger; import java.util.formatter; import java.util.hashmap; import java.util.linkedlist; import java.util.list; import java.util.concurrent.timeunit; import java.util.logging.logger; import java.security.*; import java.nio.file.path; import java.nio.file.paths; public class htmlparser { private static final int read_timeout_in_millissecs = (int) timeunit.milliseconds.convert(30, timeunit.seconds); private static hashmap <string, integer> filecounter = new hashmap<> (); public static list<linknodelight> parse(linknode inputlink){ list<linknodelight> outputlinks = new linkedlist<>(); try { inputlink.setipadress(ipfromurl.getip(inputlink.geturl())); string url = inputlink.geturl(); if (inputlink.getipadress() != null) { url.replace(urlweight.gethostname(url), inputlink.getipadress()); } document parsedresults = jsoup .connect(url) .timeout(read_timeout_in_millissecs) .get(); inputlink.setsize(parsedresults.html().length()); /* ip address moved here in order speed process */ inputlink.setstatus(linknodestatus.ok); inputlink.setdomain(urlweight.getdomainname(inputlink.geturl())); if (true) { /* save file html */ string filename = parsedresults.title();//digestbig.tostring(16) + ".html"; if (filename.length() > 24) { filename = filename.substring(0, 24); } filename = filename.replaceall("[^\\w\\d\\s]", "").trim(); filename = filename.replaceall("\\s+", " "); if (!filecounter.containskey(filename)) { filecounter.put(filename, 1); } else { integer tmp = filecounter.remove(filename); filecounter.put(filename, tmp + 1); } filename = filename + "-" + 
(filecounter.get(filename)).tostring() + ".html"; filename = paths.get("downloads", filename).tostring(); inputlink.setfilename(filename); /* use md5 of url file name */ try (printwriter out = new printwriter(new bufferedwriter(new filewriter(filename)))) { out.println("<!--" + inputlink.geturl() + "-->"); out.print(parsedresults.html()); out.flush(); out.close(); } catch (ioexception e) { e.printstacktrace(); } } string tag; elements tagelements; list<linknode> result; tag = "a[href"; tagelements = parsedresults.select(tag); result = tolinknodeobject(inputlink, tagelements, tag); outputlinks.addall(result); tag = "area[href"; tagelements = parsedresults.select(tag); result = tolinknodeobject(inputlink, tagelements, tag); outputlinks.addall(result); } catch (ioexception e) { inputlink.setparseexception(e); inputlink.setstatus(linknodestatus.error); } return outputlinks; } static list<linknode> tolinknodeobject(linknode parentlink, elements tagelements, string tag) { list<linknode> links = new linkedlist<>(); (element element : tagelements) { if(isfragmentref(element)){ continue; } string absoluteref = string.format("abs:%s", tag.contains("[") ? tag.substring(tag.indexof("[") + 1, tag.length()) : "href"); string url = element.attr(absoluteref); if(url!=null && url.trim().length()>0) { linknode link = new linknode(url); link.settag(element.tagname()); link.setparentlink(parentlink); links.add(link); } } return links; } static boolean isfragmentref(element element){ string href = element.attr("href"); return href!=null && (href.trim().startswith("#") || href.startswith("mailto:")); }
}
Util.java
package pkg.crawler;

import java.util.Date;
import org.joda.time.DateTime;
import org.joda.time.format.DateTimeFormat;
import org.joda.time.format.DateTimeFormatter;

/** Formatting and time-difference helpers shared by the crawler classes. */
public class Util {

    /*
     * BUG FIX: the original pattern "yyyy-mm-dd hh:mm:ss:sss" used mm
     * (minute-of-hour) in the date position and hh (1-12 clock hour)
     * without an am/pm marker, so timestamps were printed incorrectly.
     * MM = month, HH = hour-of-day (0-23), SSS = milliseconds.
     * Joda DateTimeFormatter is thread-safe, so one shared instance is fine.
     */
    private static final DateTimeFormatter FORMATTER =
            DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss:SSS");

    /** Tab-separated debug line for a link. */
    public static String linkToString(LinkNode inputLink) {
        return String.format("%s\t%s\t%s\t%s\t%s\t%s",
                inputLink.getUrl(),
                inputLink.getWeight(),
                formatDate(inputLink.getEnqueTime()),
                formatDate(inputLink.getDequeTime()),
                differenceInMilliseconds(inputLink.getEnqueTime(), inputLink.getDequeTime()),
                inputLink.getParentLink() == null ? "" : inputLink.getParentLink().getUrl()
        );
    }

    /** Tab-separated debug line for a link that failed to parse. */
    public static String linkToErrorString(LinkNode inputLink) {
        return String.format("%s\t%s\t%s\t%s\t%s\t%s",
                inputLink.getUrl(),
                inputLink.getWeight(),
                formatDate(inputLink.getEnqueTime()),
                formatDate(inputLink.getDequeTime()),
                inputLink.getParentLink() == null ? "" : inputLink.getParentLink().getUrl(),
                inputLink.getParseException().getMessage()
        );
    }

    /**
     * Formats a Joda DateTime with the shared pattern.
     * NOTE(review): Joda prints the CURRENT time when date is null — callers
     * should make sure the timestamp was actually set; confirm whether a
     * null-guard returning "" would be preferable here.
     */
    public static String formatDate(DateTime date) {
        return FORMATTER.print(date);
    }

    /** Millisecond difference between two instants (first minus second). */
    public static long differenceInMilliseconds(DateTime dequeTime, DateTime enqueTime) {
        return dequeTime.getMillis() - enqueTime.getMillis();
    }

    /** Whole-second difference between two java.util.Date instants. */
    public static int differenceInSeconds(Date enqueTime, Date dequeTime) {
        return (int) ((dequeTime.getTime() / 1000) - (enqueTime.getTime() / 1000));
    }

    /** Whole-minute difference between two java.util.Date instants. */
    public static int differenceInMinutes(Date enqueTime, Date dequeTime) {
        return (int) ((dequeTime.getTime() / 60000) - (enqueTime.getTime() / 60000));
    }
}
UrlWeight.java
package pkg.crawler; import java.util.arraylist; import java.util.hashset; import java.util.linkedlist; import java.util.list; import java.util.regex.pattern; public class urlweight { public static list<linknodelight> weight(linknode sourcelink, list<linknodelight> links) { list<linknodelight> interlinks = new linkedlist<>(); list<linknodelight> intralinks = new linkedlist<>(); (linknodelight link : links) { if (isintralink(sourcelink, link)) { intralinks.add(link); link.setinterlinks(false); } else { interlinks.add(link); link.setinterlinks(true); } } static boolean isintralink(linknodelight sourcelink, linknodelight link){ string parentdomainname = gethostname(sourcelink.geturl()); string childdomainname = gethostname(link.geturl()); return parentdomainname.equalsignorecase(childdomainname); } public static string gethostname(string url) { if(url == null){ // system.out.println("deneme"); return ""; } string domainname = new string(url); int index = domainname.indexof("://"); if (index != -1) { domainname = domainname.substring(index + 3); } (int = 0; < domainname.length(); i++) if (domainname.charat(i) == '?' || domainname.charat(i) == '/') { domainname = domainname.substring(0, i); break; } /*if (index != -1) { domainname = domainname.substring(0, index); }*/ /* have keep www in order replacements ip */ //domainname = domainname.replacefirst("^www.*?\\.", ""); return domainname; } public static string getdomainname(string url) { string [] tmp= gethostname(url).split("\\."); if (tmp.length == 0) return ""; return tmp[tmp.length - 1]; } }
PingTaskManager.java
package pkg.crawler;

import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

/** Submits asynchronous ping checks for links on a shared thread pool. */
public class PingTaskManager {

    /*
     * NOTE(review): 100 threads is a lot for a check that is currently
     * disabled, and the pool is never shut down, so its non-daemon threads
     * keep the JVM alive.  Consider a shutdown hook or a smaller pool.
     */
    private static ExecutorService executor = Executors.newFixedThreadPool(100);

    /** Schedules a reachability check for the given link. */
    public static void ping(LinkNode e) {
        executor.submit(new PingTaks(e));
    }
}

/** Single ping task (class name kept from the original, including its typo). */
class PingTaks implements Runnable {

    private LinkNode link;

    public PingTaks(LinkNode link) {
        /* BUG FIX: the original constructor ignored its argument, leaving
         * this.link null for every task. */
        this.link = link;
    }

    @Override
    public void run() {
        /* link.ping(); — disabled: the ping implementation produced false errors */
    }
}
LinkNodeStatus.java
package pkg.crawler; public enum linknodestatus { ok, error }
LinkNodeLight.java
package pkg.crawler;

import org.joda.time.DateTime;

/**
 * Lightweight link record stored in the crawl queue: URL, priority weight,
 * enqueue timestamp and the inter-/intra-link flag.
 */
public class LinkNodeLight implements Comparable<LinkNodeLight> {

    protected String url;
    protected float weight;       // priority: heavier links are dequeued first
    protected DateTime enqueTime;
    protected boolean interLinks; // true when the link leaves the parent's host

    public LinkNodeLight(String url) {
        this.url = url;
    }

    public String getUrl() {
        return url;
    }

    public float getWeight() {
        return weight;
    }

    public void setWeight(float weight) {
        this.weight = weight;
    }

    public DateTime getEnqueTime() {
        return enqueTime;
    }

    public void setEnqueTime(DateTime enqueTime) {
        this.enqueTime = enqueTime;
    }

    /*
     * BUG FIX: accessors for the inter-link flag were missing from this class
     * although WebCrawler and UrlWeight both call them.
     */
    public boolean isInterLinks() {
        return interLinks;
    }

    public void setInterLinks(boolean interLinks) {
        this.interLinks = interLinks;
    }

    /**
     * Orders links by DESCENDING weight so the priority queue serves the
     * heaviest link first.  Float.compare is equivalent to the original
     * manual comparison but handles the ordering contract in one call.
     */
    @Override
    public int compareTo(LinkNodeLight link) {
        return Float.compare(link.weight, this.weight);
    }
}
LinkNode.java
package pkg.crawler; import java.io.ioexception; import java.net.httpurlconnection; import java.net.socket; import java.net.url; import java.net.unknownhostexception; import java.util.date; import org.joda.time.datetime; public class linknode extends linknodelight{ public linknode(string url) { super(url); } private string tag; private linknode parentlink; private ioexception parseexception = null; // initialize parse exception null private float weight; private datetime dequetime; private datetime starttime; private datetime endtime; private linknodestatus status; private string ipadress; private int size; private string filename; private string domain; public datetime getstarttime() { return starttime; } public void setstarttime(datetime starttime) { this.starttime = starttime; } public datetime getendtime() { return endtime; } public void setendtime(datetime endtime) { this.endtime = endtime; } public datetime getdequetime() { return dequetime; } public string gettag() { return tag; } public linknode getparentlink() { return parentlink; } public exception getparseexception() { return parseexception; } public boolean hasparseexception(){ return parseexception!=null; } public void setdequetime(datetime dequetime) { this.dequetime = dequetime; } public void settag(string tag) { this.tag = tag; } public void setparentlink(linknode parentlink) { this.parentlink = parentlink; } public void setparseexception(ioexception parseexception) { this.parseexception = parseexception; } @override public boolean equals(object o) { if (this == o) { return true; } if (o == null || getclass() != o.getclass()) { return false; } linknode link = (linknode) o; if (url != null ? !url.equals(link.url) : link.url != null) { return false; } return true; } @override public int hashcode() { return url != null ? 
url.hashcode() : 0; } public long waitinginqueue(){ return util.differenceinmilliseconds( dequetime,enquetime ); } public long linkprocessingduration(){ return util.differenceinmilliseconds( endtime,starttime ); } @override public string tostring() { stringbuilder sb = new stringbuilder("linknode{"); sb.append("url='").append(url).append('\''); sb.append(", score=").append(weight); sb.append(", enquetime=").append(enquetime); sb.append(", dequetime=").append(dequetime); sb.append(", tag=").append(tag); if(parentlink!=null) { sb.append(", parentlink=").append(parentlink.geturl()); } sb.append('}'); return sb.tostring(); } public void setstatus(linknodestatus status) { this.status = status; } public linknodestatus getstatus(){ if (status == null) { status = linknodestatus.error; } return status; } // check server link exist or not /* method gives fake errors public linknodestatus ping () { boolean reachable = false; string sanitizeurl = url.replacefirst("^https", "http"); try { httpurlconnection connection = (httpurlconnection) new url(sanitizeurl).openconnection(); connection.setconnecttimeout(1000); connection.setrequestmethod("head"); int responsecode = connection.getresponsecode(); system.err.println(url + " " + responsecode); reachable = (200 <= responsecode && responsecode <= 399); } catch (ioexception exception) { } return reachable?linknodestatus.ok: linknodestatus.error; }*/ public string getipadress() { return ipadress; } public void setipadress(string ipadress) { this.ipadress = ipadress; } /* methods controlling url size */ public void setsize(int size) { this.size = size; } public int getsize() { return this.size; } public void setfilename(string filename) { this.filename = filename; } public string getfilename() { return this.filename; } public string getdomain() { return domain; } public void setdomain(string domain) { this.domain = domain; } }
I tried to allocate more memory by changing eclipse.ini to set 2048 MB of RAM, as answered in that topic, but I still get the same errors after 3 hours or less.
I hate to repeat myself (*), but eclipse.ini sets the memory for Eclipse itself; it has nothing to do with the memory available to your crawler.
When using the command line, you need to start it via java -Xmx2g pkg.crawler.WebCrawler
.
When starting it from Eclipse, you need to add -Xmx2g to the
run configuration (under "VM arguments" rather than "Program arguments").
(*) The link points to a deleted question; viewing it requires sufficient reputation.
Comments
Post a Comment