A Multi-threaded Java Web Crawler
Author: reprinted from the web | Published: 2013/10/9 15:33:55
    // append a line to the in-memory report and rewrite report.txt
    public synchronized void addReport(String s) {
        try {
            report += s;
            PrintWriter pwReport = new PrintWriter(new FileOutputStream("report.txt"));
            pwReport.println(report);
            pwReport.close();
        } catch (Exception e) {
            System.out.println("Failed to generate the report file!");
        }
    }

    // take the next URL off the shared work queue
    public synchronized String getAUrl() {
        String tmpAUrl = arrUrls.get(0);
        arrUrls.remove(0);
        return tmpAUrl;
    }

    // take the next URL off the index list
    public synchronized String getUrl() {
        String tmpUrl = arrUrl.get(0);
        arrUrl.remove(0);
        return tmpUrl;
    }

    // next file index for a newly discovered URL
    public synchronized Integer getIntWebIndex() {
        intWebIndex++;
        return intWebIndex;
    }

    /**
     * @param args args[0] is the home page URL, args[1] (optional) is the crawl depth
     */
    public static void main(String[] args) {
        if (args.length == 0 || args[0].equals("")) {
            System.out.println("No input!");
            System.exit(1);
        } else if (args.length == 1) {
            GetWeb gw = new GetWeb(args[0]);
            gw.getWebByHomePage();
        } else {
            GetWeb gw = new GetWeb(args[0], Integer.parseInt(args[1]));
            gw.getWebByHomePage();
        }
    }

    public void getWebByHomePage() {
        startTime = System.currentTimeMillis();
        this.myDomain = getDomain();
        if (myDomain == null) {
            System.out.println("Wrong input!");
            // System.exit(1);
            return;
        }
        System.out.println("Homepage = " + strHomePage);
        addReport("Homepage = " + strHomePage + "!\n");
        System.out.println("Domain = " + myDomain);
        addReport("Domain = " + myDomain + "!\n");
        arrUrls.add(strHomePage);
        arrUrl.add(strHomePage);
        allUrls.put(strHomePage, 0);
        deepUrls.put(strHomePage, 1);
        File fDir = new File(fPath);
        if (!fDir.exists()) {
            fDir.mkdir();
        }
        System.out.println("Start!");
        this.addReport("Start!\n");
        // fetch the home page in the main thread, then hand the queue over to the workers
        String tmp = getAUrl();
        this.getWebByUrl(tmp, charset, allUrls.get(tmp) + "");
        int i = 0;
        for (i = 0; i < intThreadNum; i++) {
            new Thread(new Processer(this)).start();
        }
        // wait until the queue is empty and only the main thread is left alive
        while (true) {
            if (arrUrls.isEmpty() && Thread.activeCount() == 1) {
                long finishTime = System.currentTimeMillis();
                long costTime = finishTime - startTime;
                System.out.println("\nFinished!");
                addReport("\nFinished!\n");
                System.out.println("Start time = " + startTime + "  "
                        + "Finish time = " + finishTime + "  "
                        + "Cost time = " + costTime + "ms");
                addReport("Start time = " + startTime + "  "
                        + "Finish time = " + finishTime + "  "
                        + "Cost time = " + costTime + "ms" + "\n");
                System.out.println("Total url number = " + (webSuccessed + webFailed)
                        + "  Successed: " + webSuccessed + "  Failed: " + webFailed);
                addReport("Total url number = " + (webSuccessed + webFailed)
                        + "  Successed: " + webSuccessed + "  Failed: " + webFailed + "\n");
                // write an index that maps each saved file to its URL and crawl depth
                String strIndex = "";
                String tmpUrl = "";
                while (!arrUrl.isEmpty()) {
                    tmpUrl = getUrl();
                    strIndex += "Web depth:" + deepUrls.get(tmpUrl)
                            + " Filepath: " + fPath + "/web" + allUrls.get(tmpUrl) + ".htm"
                            + " url:" + tmpUrl + "\n";
                }
                System.out.println(strIndex);
                try {
                    PrintWriter pwIndex = new PrintWriter(new FileOutputStream("fileindex.txt"));
                    pwIndex.println(strIndex);
                    pwIndex.close();
                } catch (Exception e) {
                    System.out.println("Failed to generate the index file!");
                }
                break;
            }
        }
    }

    public void getWebByUrl(String strUrl, String charset, String fileIndex) {
        try {
            // if (charset == null || "".equals(charset)) charset = "utf-8";
            System.out.println("Getting web by url: " + strUrl);
            addReport("Getting web by url: " + strUrl + "\n");
            URL url = new URL(strUrl);
            URLConnection conn = url.openConnection();
            conn.setDoOutput(true);
            InputStream is = null;
            is = url.openStream();
            // save the page to <fPath>/web<fileIndex>.htm
            String filePath = fPath + "/web" + fileIndex + ".htm";
            PrintWriter pw = null;
            FileOutputStream fos = new FileOutputStream(filePath);
            OutputStreamWriter writer = new OutputStreamWriter(fos);
            pw = new PrintWriter(writer);
            BufferedReader bReader = new BufferedReader(new InputStreamReader(is));
            StringBuffer sb = new StringBuffer();
            String rLine = null;
            String tmp_rLine = null;
            while ((rLine = bReader.readLine()) != null) {
                tmp_rLine = rLine;
                int str_len = tmp_rLine.length();
                if (str_len > 0) {
                    sb.append("\n" + tmp_rLine);
                    pw.println(tmp_rLine);
                    pw.flush();
                    // only extract links while we are still above the depth limit
                    if (deepUrls.get(strUrl) < webDepth)
                        getUrlByString(tmp_rLine, strUrl);
                }
                tmp_rLine = null;
            }
            is.close();
            pw.close();
            System.out.println("Get web successfully! " + strUrl);
            addReport("Get web successfully! " + strUrl + "\n");
            addWebSuccessed();
        } catch (Exception e) {
            System.out.println("Get web failed! " + strUrl);
            addReport("Get web failed! " + strUrl + "\n");
            addWebFailed();
        }
    }

    // extract the site's domain (e.g. example.com) from the home page URL
    public String getDomain() {
        String reg = "(?<=http://[a-zA-Z0-9]{0,100}[.]{0,1})[^.\\s]*?\\.(com|cn|net|org|biz|info|cc|tv)";
        Pattern p = Pattern.compile(reg, Pattern.CASE_INSENSITIVE);
        Matcher m = p.matcher(strHomePage);
        boolean blnp = m.find();
        if (blnp == true) {
            return m.group(0);
        }
        return null;
    }

    // scan one line of HTML for href links that stay inside myDomain
    public void getUrlByString(String inputArgs, String strUrl) {
        String tmpStr = inputArgs;
        // note: [http://] is a character class here, not a literal "http://" prefix
        String regUrl = "(?<=(href=)[\"]?[']?)[http://][^\\s\"'\\?]*(" + myDomain + ")[^\\s\"'>]*";
        Pattern p = Pattern.compile(regUrl, Pattern.CASE_INSENSITIVE);
        Matcher m = p.matcher(tmpStr);
        boolean blnp = m.find();
        // int i = 0;
        while (blnp == true) {
            if (!allUrls.containsKey(m.group(0))) {
                System.out.println("Find a new url,depth:" + (deepUrls.get(strUrl) + 1)
                        + " " + m.group(0));
                addReport("Find a new url,depth:" + (deepUrls.get(strUrl) + 1)
                        + " " + m.group(0) + "\n");
                arrUrls.add(m.group(0));
                arrUrl.add(m.group(0));
                allUrls.put(m.group(0), getIntWebIndex());
                deepUrls.put(m.group(0), (deepUrls.get(strUrl) + 1));
            }
            tmpStr = tmpStr.substring(m.end(), tmpStr.length());
            m = p.matcher(tmpStr);
            blnp = m.find();
        }
    }

    // worker thread: keeps pulling URLs from the shared queue until it is empty
    class Processer implements Runnable {
        GetWeb gw;

        public Processer(GetWeb g) {
            this.gw = g;
        }

        public void run() {
            // Thread.sleep(5000);
            while (!arrUrls.isEmpty()) {
                String tmp = getAUrl();
                getWebByUrl(tmp, charset, allUrls.get(tmp) + "");
            }
        }
    }
}
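The listing begins in the middle of the GetWeb class: the imports, field declarations, constructors, and the addWebSuccessed()/addWebFailed() counters it calls are referenced but not shown. Below is a minimal sketch of those missing declarations so the class can compile; the concrete collection types and the default values (thread count, depth, charset, output directory) are assumptions inferred from how the members are used above, not part of the original listing.

// Sketch of the declarations the excerpt relies on but does not show.
// Defaults marked "assumed" are illustrative, not from the original article.
import java.io.*;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class GetWeb {
    private int webDepth = 2;                 // maximum crawl depth (assumed default)
    private int intThreadNum = 10;            // number of worker threads (assumed default)
    private String strHomePage = "";          // start page passed on the command line
    private String myDomain;                  // domain extracted from the home page
    private String fPath = "web";             // directory for saved pages (assumed)
    private ArrayList<String> arrUrls = new ArrayList<String>();   // work queue of URLs to fetch
    private ArrayList<String> arrUrl = new ArrayList<String>();    // all URLs, kept for the index file
    private HashMap<String, Integer> allUrls = new HashMap<String, Integer>();  // URL -> file index
    private HashMap<String, Integer> deepUrls = new HashMap<String, Integer>(); // URL -> depth
    private int intWebIndex = 0;              // running counter used to name saved files
    private String charset = "GB2312";        // page encoding (assumed)
    private String report = "";               // accumulated report text
    private long startTime;
    private int webSuccessed = 0;
    private int webFailed = 0;

    public GetWeb(String s) {
        this.strHomePage = s;
    }

    public GetWeb(String s, int i) {
        this.strHomePage = s;
        this.webDepth = i;
    }

    public synchronized void addWebSuccessed() {
        webSuccessed++;
    }

    public synchronized void addWebFailed() {
        webFailed++;
    }

    // ... methods from the listing above ...
}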
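With those declarations in place, the crawler is started from the command line: main() takes the start URL and, optionally, the crawl depth. Assuming the class is saved as GetWeb.java, a run might look like this (the URL and depth are illustrative):

javac GetWeb.java
java GetWeb http://www.example.com 2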
