java - Getting many memory errors when trying to run my web crawler for a few days -


I am developing a web crawler application. When I run the program, I get the error messages below:

(screenshot of the memory errors)

I got these errors after running the program for more than 3 hours. I tried to allocate more memory by changing eclipse.ini to 2048 MB of RAM, as answered in this topic, but I still get the same errors after 3 hours or less. The program should run non-stop for 2-3 days so I can analyse the results.
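For reference, the relevant part of eclipse.ini after my change looked roughly like this (only the memory-related lines are shown; the other entries in the file are omitted and the -Xms value is just an example):

-vmargs
-Xms512m
-Xmx2048m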

Can anyone tell me what I am missing here that causes these errors?

These are my classes:

seeds.txt

http://www.stanford.edu
http://www.archive.org

WebCrawler.java

package pkg.crawler;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.net.MalformedURLException;
import java.net.SocketTimeoutException;
import java.util.*;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.PriorityBlockingQueue;
import java.util.concurrent.TimeUnit;

import org.jsoup.HttpStatusException;
import org.jsoup.UnsupportedMimeTypeException;
import org.joda.time.DateTime;

public class WebCrawler {

    public static Queue<LinkNodeLight> queue = new PriorityBlockingQueue<>(); // priority queue
    public static final int N_THREADS = 5;                                    // number of threads
    private static Set<String> processed = new LinkedHashSet<>();             // set of processed URLs
    private PrintWriter out;                                                  // output file
    private PrintWriter err;                                                  // error file
    private static Integer cntIntra = new Integer(0);                         // counter of intra-links in the queue
    private static Integer cntInter = new Integer(0);                         // counter of inter-links in the queue
    private static Integer dub = new Integer(0);                              // number of skipped (duplicate) URLs

    public static void main(String[] args) throws Exception {
        System.out.println("Running web crawler: " + new Date());

        WebCrawler webCrawler = new WebCrawler();
        webCrawler.createFiles();
        try (Scanner in = new Scanner(new File("seeds.txt"))) {
            while (in.hasNext()) {
                webCrawler.enque(new LinkNode(in.nextLine().trim()));
            }
        } catch (IOException e) {
            e.printStackTrace();
            return;
        }
        webCrawler.processQueue();
        webCrawler.out.close();
        webCrawler.err.close();
    }

    public void processQueue() {
        /* run in threads */
        Runnable r = new Runnable() {
            @Override
            public void run() {
                /* the queue may be empty while processing is not finished yet,
                   that's why we need to keep checking for links being processed */
                while (true) {
                    LinkNode link = deque();
                    if (link == null)
                        continue;
                    link.setStartTime(new DateTime());
                    boolean process = processLink(link);
                    link.setEndTime(new DateTime());
                    if (!process)
                        continue;
                    /* print the data to the CSV file */
                    if (link.getStatus() != null && link.getStatus().equals(LinkNodeStatus.OK)) {
                        synchronized (out) {
                            out.println(getOutputLine(link));
                            out.flush();
                        }
                    } else {
                        synchronized (err) {
                            err.println(getOutputLine(link));
                            err.flush();
                        }
                    }
                }
            }
        };
        /* run N_THREADS threads that dequeue and process links */
        LinkedList<Thread> threads = new LinkedList<>();
        for (int i = 0; i < N_THREADS; i++) {
            threads.add(new Thread(r));
            threads.getLast().start();
        }
        for (Thread thread : threads) {
            try {
                thread.join();
            } catch (InterruptedException e) {
                e.printStackTrace();
            }
        }
    }

    /* returns true if the link was processed */
    private boolean processLink(LinkNode inputLink) {
        String url = getUrlGeneralForm(inputLink);
        boolean process = true;
        synchronized (processed) {
            if (processed.contains(url)) {
                process = false;
                synchronized (dub) { dub++; }
            } else
                processed.add(url);
        }
        /* start processing only if the URL has not been processed yet and is not being processed */
        if (process) {
            System.out.println("Processing URL " + url);
            List<LinkNodeLight> outputLinks = parseAndWieghtResults(inputLink);
            for (LinkNodeLight outputLink : outputLinks) {
                String getUrlGeneralForumOutput = getUrlGeneralForm(outputLink);
                /* add the new link to the queue if it has not been processed yet */
                process = true;
                synchronized (processed) {
                    if (processed.contains(getUrlGeneralForumOutput)) {
                        process = false;
                        synchronized (dub) { dub++; }
                    }
                }
                if (process) {
                    enque(outputLink);
                }
            }
            return true;
        }
        return false;
    }

    void enque(LinkNodeLight link) {
        link.setEnqueTime(new DateTime());
        /* the add method requires an implicit priority */
        synchronized (queue) {
            if (link.interLinks)
                synchronized (cntInter) { cntInter++; }
            else
                synchronized (cntIntra) { cntIntra++; }
            //queue.add(link, 100 - (int) (link.getWeight() * 100.f));
            queue.add(link);
        }
    }

    /**
     * Picks an element from the queue
     * @return the top element of the queue, or null if the queue is empty
     */
    LinkNode deque() {
        /* the link must be checked */
        LinkNode link = null;
        synchronized (queue) {
            link = (LinkNode) queue.poll();
            if (link != null) {
                link.setDequeTime(new DateTime());
                if (link.isInterLinks())
                    synchronized (cntInter) { cntInter--; }
                else
                    synchronized (cntIntra) { cntIntra--; }
            }
        }
        return link;
    }

    private void createFiles() {
        /* create the output file */
        try {
            out = new PrintWriter(new BufferedWriter(new FileWriter("crawledURLs.csv", false)));
            out.println(generateHeaderFile());
        } catch (IOException e) {
            System.err.println(e);
        }
        /* create the error file */
        try {
            err = new PrintWriter(new BufferedWriter(new FileWriter("crawledURLsError.csv", false)));
            err.println(generateHeaderFile());
        } catch (IOException e) {
            System.err.println(e);
        }
    }

    /**
     * Formats a string so it can be a valid entry in a CSV file
     * @param s
     * @return
     */
    private static String format(String s) {
        // replace " with ""
        String ret = s.replaceAll("\"", "\"\"");
        // put the string in quotes
        return "\"" + ret + "\"";
    }

    /**
     * Creates the line that needs to be written to the output file
     * @param link
     * @return
     */
    public static String getOutputLine(LinkNode link) {
        StringBuilder builder = new StringBuilder();
        builder.append(link.getParentLink() != null ? format(link.getParentLink().getUrl()) : "");
        builder.append(",");
        builder.append(link.getParentLink() != null ? link.getParentLink().getIpAdress() : "");
        builder.append(",");
        builder.append(link.getParentLink() != null ? link.getParentLink().linkProcessingDuration() : "");
        builder.append(",");
        builder.append(format(link.getUrl()));
        builder.append(",");
        builder.append(link.getDomain());
        builder.append(",");
        builder.append(link.isInterLinks());
        builder.append(",");
        builder.append(Util.formatDate(link.getEnqueTime()));
        builder.append(",");
        builder.append(Util.formatDate(link.getDequeTime()));
        builder.append(",");
        builder.append(link.waitingInQueue());
        builder.append(",");
        builder.append(queue.size());
        /* inter and intra links in the queue */
        builder.append(",");
        builder.append(cntIntra.toString());
        builder.append(",");
        builder.append(cntInter.toString());
        builder.append(",");
        builder.append(dub);
        builder.append(",");
        builder.append(new Date());
        /* URL size */
        builder.append(",");
        builder.append(link.getSize());
        /* html file
        builder.append(",");
        builder.append(link.getFileName());*/
        /* add the HTTP error */
        builder.append(",");
        if (link.getParseException() != null) {
            if (link.getParseException() instanceof HttpStatusException)
                builder.append(((HttpStatusException) link.getParseException()).getStatusCode());
            if (link.getParseException() instanceof SocketTimeoutException)
                builder.append("time out");
            if (link.getParseException() instanceof MalformedURLException)
                builder.append("URL is not valid");
            if (link.getParseException() instanceof UnsupportedMimeTypeException)
                builder.append("unsupported MIME type: " + ((UnsupportedMimeTypeException) link.getParseException()).getMimeType());
        }
        return builder.toString();
    }

    /**
     * Generates the header of the file
     * @return
     */
    private String generateHeaderFile() {
        StringBuilder builder = new StringBuilder();
        builder.append("Seed URL");
        builder.append(",");
        builder.append("Seed IP");
        builder.append(",");
        builder.append("Process Duration");
        builder.append(",");
        builder.append("Link URL");
        builder.append(",");
        builder.append("Link Domain");
        builder.append(",");
        builder.append("Link IP");
        builder.append(",");
        builder.append("Enque Time");
        builder.append(",");
        builder.append("Deque Time");
        builder.append(",");
        builder.append("Waiting in Queue");
        builder.append(",");
        builder.append("QueueSize");
        builder.append(",");
        builder.append("Intra in Queue");
        builder.append(",");
        builder.append("Inter in Queue");
        builder.append(",");
        builder.append("Dublications Skipped");
        /* time is printed, no header */
        builder.append(",");
        builder.append("Time");
        /* URL size */
        builder.append(",");
        builder.append("Size bytes");
        /* HTTP errors */
        builder.append(",");
        builder.append("HTTP error");
        return builder.toString();
    }

    String getUrlGeneralForm(LinkNodeLight link) {
        String url = link.getUrl();
        if (url.endsWith("/")) {
            url = url.substring(0, url.length() - 1);
        }
        return url;
    }

    private List<LinkNodeLight> parseAndWieghtResults(LinkNode inputLink) {
        List<LinkNodeLight> outputLinks = HTMLParser.parse(inputLink);
        if (inputLink.hasParseException()) {
            return outputLinks;
        } else {
            return URLWeight.weight(inputLink, outputLinks);
        }
    }
}

HTMLParser.java

package pkg.crawler;

import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.Writer;
import java.math.BigInteger;
import java.util.Formatter;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.concurrent.TimeUnit;
import java.util.logging.Logger;
import java.security.*;
import java.nio.file.Path;
import java.nio.file.Paths;

public class HTMLParser {

    private static final int READ_TIMEOUT_IN_MILLISSECS = (int) TimeUnit.MILLISECONDS.convert(30, TimeUnit.SECONDS);
    private static HashMap<String, Integer> fileCounter = new HashMap<>();

    public static List<LinkNodeLight> parse(LinkNode inputLink) {
        List<LinkNodeLight> outputLinks = new LinkedList<>();
        try {
            inputLink.setIpAdress(IpFromUrl.getIp(inputLink.getUrl()));
            String url = inputLink.getUrl();
            if (inputLink.getIpAdress() != null) {
                url.replace(URLWeight.getHostName(url), inputLink.getIpAdress());
            }
            Document parsedResults = Jsoup
                    .connect(url)
                    .timeout(READ_TIMEOUT_IN_MILLISSECS)
                    .get();
            inputLink.setSize(parsedResults.html().length());
            /* the IP address was moved here in order to speed up the process */
            inputLink.setStatus(LinkNodeStatus.OK);
            inputLink.setDomain(URLWeight.getDomainName(inputLink.getUrl()));
            if (true) {
                /* save the html to a file */
                String filename = parsedResults.title(); //digestBig.toString(16) + ".html";
                if (filename.length() > 24) {
                    filename = filename.substring(0, 24);
                }
                filename = filename.replaceAll("[^\\w\\d\\s]", "").trim();
                filename = filename.replaceAll("\\s+", " ");

                if (!fileCounter.containsKey(filename)) {
                    fileCounter.put(filename, 1);
                } else {
                    Integer tmp = fileCounter.remove(filename);
                    fileCounter.put(filename, tmp + 1);
                }
                filename = filename + "-" + (fileCounter.get(filename)).toString() + ".html";
                filename = Paths.get("downloads", filename).toString();
                inputLink.setFileName(filename);
                /* use the MD5 of the URL as the file name */
                try (PrintWriter out = new PrintWriter(new BufferedWriter(new FileWriter(filename)))) {
                    out.println("<!--" + inputLink.getUrl() + "-->");
                    out.print(parsedResults.html());
                    out.flush();
                    out.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            String tag;
            Elements tagElements;
            List<LinkNode> result;

            tag = "a[href";
            tagElements = parsedResults.select(tag);
            result = toLinkNodeObject(inputLink, tagElements, tag);
            outputLinks.addAll(result);

            tag = "area[href";
            tagElements = parsedResults.select(tag);
            result = toLinkNodeObject(inputLink, tagElements, tag);
            outputLinks.addAll(result);
        } catch (IOException e) {
            inputLink.setParseException(e);
            inputLink.setStatus(LinkNodeStatus.ERROR);
        }

        return outputLinks;
    }

    static List<LinkNode> toLinkNodeObject(LinkNode parentLink, Elements tagElements, String tag) {
        List<LinkNode> links = new LinkedList<>();
        for (Element element : tagElements) {

            if (isFragmentRef(element)) {
                continue;
            }

            String absoluteRef = String.format("abs:%s", tag.contains("[") ? tag.substring(tag.indexOf("[") + 1, tag.length()) : "href");
            String url = element.attr(absoluteRef);

            if (url != null && url.trim().length() > 0) {
                LinkNode link = new LinkNode(url);
                link.setTag(element.tagName());
                link.setParentLink(parentLink);
                links.add(link);
            }
        }
        return links;
    }

    static boolean isFragmentRef(Element element) {
        String href = element.attr("href");
        return href != null && (href.trim().startsWith("#") || href.startsWith("mailto:"));
    }
}

Util.java

package pkg.crawler;

import java.util.Date;

import org.joda.time.DateTime;
import org.joda.time.format.DateTimeFormat;
import org.joda.time.format.DateTimeFormatter;

public class Util {

    private static DateTimeFormatter formatter;
    static {
        formatter = DateTimeFormat.forPattern("yyyy-MM-dd HH:mm:ss:SSS");
    }

    public static String linkToString(LinkNode inputLink) {
        return String.format("%s\t%s\t%s\t%s\t%s\t%s",
                inputLink.getUrl(),
                inputLink.getWeight(),
                formatDate(inputLink.getEnqueTime()),
                formatDate(inputLink.getDequeTime()),
                differenceInMilliseconds(inputLink.getEnqueTime(), inputLink.getDequeTime()),
                inputLink.getParentLink() == null ? "" : inputLink.getParentLink().getUrl()
        );
    }

    public static String linkToErrorString(LinkNode inputLink) {
        return String.format("%s\t%s\t%s\t%s\t%s\t%s",
                inputLink.getUrl(),
                inputLink.getWeight(),
                formatDate(inputLink.getEnqueTime()),
                formatDate(inputLink.getDequeTime()),
                inputLink.getParentLink() == null ? "" : inputLink.getParentLink().getUrl(),
                inputLink.getParseException().getMessage()
        );
    }

    public static String formatDate(DateTime date) {
        return formatter.print(date);
    }

    public static long differenceInMilliseconds(DateTime dequeTime, DateTime enqueTime) {
        return (dequeTime.getMillis() - enqueTime.getMillis());
    }

    public static int differenceInSeconds(Date enqueTime, Date dequeTime) {
        return (int) ((dequeTime.getTime() / 1000) - (enqueTime.getTime() / 1000));
    }

    public static int differenceInMinutes(Date enqueTime, Date dequeTime) {
        return (int) ((dequeTime.getTime() / 60000) - (enqueTime.getTime() / 60000));
    }
}

URLWeight.java

package pkg.crawler;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.regex.Pattern;

public class URLWeight {

    public static List<LinkNodeLight> weight(LinkNode sourceLink, List<LinkNodeLight> links) {

        List<LinkNodeLight> interLinks = new LinkedList<>();
        List<LinkNodeLight> intraLinks = new LinkedList<>();

        for (LinkNodeLight link : links) {
            if (isIntraLink(sourceLink, link)) {
                intraLinks.add(link);
                link.setInterLinks(false);
            } else {
                interLinks.add(link);
                link.setInterLinks(true);
            }
        }
        // NOTE: the remainder of this method (the actual weight assignment and
        // the return statement) appears to be cut off in the post as copied here.
    }

    static boolean isIntraLink(LinkNodeLight sourceLink, LinkNodeLight link) {
        String parentDomainName = getHostName(sourceLink.getUrl());
        String childDomainName = getHostName(link.getUrl());
        return parentDomainName.equalsIgnoreCase(childDomainName);
    }

    public static String getHostName(String url) {
        if (url == null) {
            // System.out.println("deneme");
            return "";
        }
        String domainName = new String(url);

        int index = domainName.indexOf("://");
        if (index != -1) {
            domainName = domainName.substring(index + 3);
        }
        for (int i = 0; i < domainName.length(); i++)
            if (domainName.charAt(i) == '?' || domainName.charAt(i) == '/') {
                domainName = domainName.substring(0, i);
                break;
            }

        /*if (index != -1) {
            domainName = domainName.substring(0, index);
        }*/

        /* we have to keep the www in order to do the replacement with the IP */
        //domainName = domainName.replaceFirst("^www.*?\\.", "");

        return domainName;
    }

    public static String getDomainName(String url) {
        String[] tmp = getHostName(url).split("\\.");
        if (tmp.length == 0)
            return "";
        return tmp[tmp.length - 1];
    }
}

PingTaskManager.java

package pkg.crawler;

import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

public class PingTaskManager {

    private static ExecutorService executor = Executors.newFixedThreadPool(100);

    public static void ping(LinkNode e) {
        executor.submit(new PingTaks(e));
    }
}

class PingTaks implements Runnable {

    private LinkNode link;

    public PingTaks(LinkNode link) {
    }

    @Override
    public void run() {
        /* link.ping(); */
    }
}

LinkNodeStatus.java

package pkg.crawler;

public enum LinkNodeStatus {
    OK, ERROR
}

LinkNodeLight.java

package pkg.crawler;

import org.joda.time.DateTime;

public class LinkNodeLight implements Comparable<LinkNodeLight> {

    protected String url;
    protected float weight;
    protected DateTime enqueTime;
    protected boolean interLinks;

    public String getUrl() {
        return url;
    }

    public float getWeight() {
        return weight;
    }

    public void setWeight(float weight) {
        this.weight = weight;
    }

    public DateTime getEnqueTime() {
        return enqueTime;
    }

    public LinkNodeLight(String url) {
        this.url = url;
    }

    public void setEnqueTime(DateTime enqueTime) {
        this.enqueTime = enqueTime;
    }

    @Override
    public int compareTo(LinkNodeLight link) {
        if (this.weight < link.weight) return 1;
        else if (this.weight > link.weight) return -1;
        return 0;
    }
}

LinkNode.java

package pkg.crawler;

import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.Socket;
import java.net.URL;
import java.net.UnknownHostException;
import java.util.Date;

import org.joda.time.DateTime;

public class LinkNode extends LinkNodeLight {

    public LinkNode(String url) {
        super(url);
    }

    private String tag;
    private LinkNode parentLink;
    private IOException parseException = null; // initialize the parse exception with null
    private float weight;
    private DateTime dequeTime;
    private DateTime startTime;
    private DateTime endTime;
    private LinkNodeStatus status;
    private String ipAdress;
    private int size;
    private String fileName;
    private String domain;

    public DateTime getStartTime() {
        return startTime;
    }

    public void setStartTime(DateTime startTime) {
        this.startTime = startTime;
    }

    public DateTime getEndTime() {
        return endTime;
    }

    public void setEndTime(DateTime endTime) {
        this.endTime = endTime;
    }

    public DateTime getDequeTime() {
        return dequeTime;
    }

    public String getTag() {
        return tag;
    }

    public LinkNode getParentLink() {
        return parentLink;
    }

    public Exception getParseException() {
        return parseException;
    }

    public boolean hasParseException() {
        return parseException != null;
    }

    public void setDequeTime(DateTime dequeTime) {
        this.dequeTime = dequeTime;
    }

    public void setTag(String tag) {
        this.tag = tag;
    }

    public void setParentLink(LinkNode parentLink) {
        this.parentLink = parentLink;
    }

    public void setParseException(IOException parseException) {
        this.parseException = parseException;
    }

    @Override
    public boolean equals(Object o) {
        if (this == o) {
            return true;
        }
        if (o == null || getClass() != o.getClass()) {
            return false;
        }

        LinkNode link = (LinkNode) o;

        if (url != null ? !url.equals(link.url) : link.url != null) {
            return false;
        }

        return true;
    }

    @Override
    public int hashCode() {
        return url != null ? url.hashCode() : 0;
    }

    public long waitingInQueue() {
        return Util.differenceInMilliseconds(dequeTime, enqueTime);
    }

    public long linkProcessingDuration() {
        return Util.differenceInMilliseconds(endTime, startTime);
    }

    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder("LinkNode{");
        sb.append("url='").append(url).append('\'');
        sb.append(", score=").append(weight);
        sb.append(", enqueTime=").append(enqueTime);
        sb.append(", dequeTime=").append(dequeTime);
        sb.append(", tag=").append(tag);
        if (parentLink != null) {
            sb.append(", parentLink=").append(parentLink.getUrl());
        }
        sb.append('}');
        return sb.toString();
    }

    public void setStatus(LinkNodeStatus status) {
        this.status = status;
    }

    public LinkNodeStatus getStatus() {
        if (status == null) {
            status = LinkNodeStatus.ERROR;
        }
        return status;
    }

    // check whether the server of the link exists or not
    /* this method gives fake errors
    public LinkNodeStatus ping() {

        boolean reachable = false;
        String sanitizeUrl = url.replaceFirst("^https", "http");

        try {
            HttpURLConnection connection = (HttpURLConnection) new URL(sanitizeUrl).openConnection();
            connection.setConnectTimeout(1000);
            connection.setRequestMethod("HEAD");
            int responseCode = connection.getResponseCode();
            System.err.println(url + " " + responseCode);
            reachable = (200 <= responseCode && responseCode <= 399);
        } catch (IOException exception) {
        }
        return reachable ? LinkNodeStatus.OK : LinkNodeStatus.ERROR;
    }*/

    public String getIpAdress() {
        return ipAdress;
    }

    public void setIpAdress(String ipAdress) {
        this.ipAdress = ipAdress;
    }

    /* methods for controlling the URL size */
    public void setSize(int size) {
        this.size = size;
    }

    public int getSize() {
        return this.size;
    }

    public void setFileName(String fileName) {
        this.fileName = fileName;
    }

    public String getFileName() {
        return this.fileName;
    }

    public String getDomain() {
        return domain;
    }

    public void setDomain(String domain) {
        this.domain = domain;
    }
}

"I tried to allocate more memory by changing eclipse.ini to 2048 MB of RAM, as answered in this topic, but I still get the same errors after 3 hours or less."

I hate to repeat myself(*), but the memory settings in eclipse.ini configure the memory for Eclipse itself; they have nothing to do with the memory available to your crawler.

When using the command line, you need to start it via java -Xmx2g pkg.crawler.WebCrawler.
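For example, assuming the compiled classes are in bin and the jsoup and Joda-Time jars are in lib (adjust the jar names and paths to your setup; on Windows use ; instead of : as the classpath separator):

java -Xmx2g -cp bin:lib/jsoup.jar:lib/joda-time.jar pkg.crawler.WebCrawler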

When starting it from Eclipse, you need to add -Xmx2g to the run configuration ("VM arguments" rather than "Program arguments").
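Either way, a quick sanity check is to print the maximum heap the JVM actually received. The snippet below is a minimal sketch: the HeapCheck class is hypothetical, and the same two lines can just as well be put at the start of WebCrawler.main().

public class HeapCheck {
    public static void main(String[] args) {
        // Runtime.maxMemory() returns the maximum heap size in bytes.
        // With -Xmx2g this should print roughly 2048 MB; if it prints a much
        // smaller default value, the -Xmx option is not being applied.
        long maxMb = Runtime.getRuntime().maxMemory() / (1024 * 1024);
        System.out.println("Max heap: " + maxMb + " MB");
    }
}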


(*) The link points to a deleted question, which requires reputation to view.

