技术开发 频道

用Java编程实现“网络蜘蛛”

例2:报告蜘蛛事件(ISpiderReportable.java)
import java.net.*;

/**
 * Callback contract through which a spider reports what it encounters
 * while crawling.
 */
interface ISpiderReportable {

    /**
     * Invoked when the spider discovers a link.
     *
     * @param base the URL of the page on which the link was found
     * @param url  the resolved URL of the link itself
     * @return a flag the spider consults to decide whether the link
     *         should be queued for crawling
     */
    boolean spiderFoundURL(URL base, URL url);

    /**
     * Invoked when a URL could not be processed.
     *
     * @param url the URL that caused the error
     */
    void spiderURLError(URL url);

    /**
     * Invoked when the spider discovers an e-mail link.
     *
     * @param email the e-mail link text as found in the page
     */
    void spiderFoundEMail(String email);
}

例3:可复用的蜘蛛类(Spider.java)

import java.util.*; import java.net.*; import java.io.*; import javax.swing.text.*; import javax.swing.text.html.*; public class Spider { /** *导致错误的URL集合 */ protected Collection workloadError = new ArrayList(3); /** *等待区URL集合 */ protected Collection workloadWaiting = new ArrayList(3); /** *处理过的URL集合 */ protected Collection workloadProcessed = new ArrayList(3); protected ISpiderReportable report; /** *表明处理过程是否应取消的标志 */ protected boolean cancel = false; /** *构造函数 * *参数report为实现了ISpiderReportable接口的类 */ public Spider(ISpiderReportable report) { this.report = report; } /** *获取导致错误的URL */ public Collection getWorkloadError() { return workloadError; } /** *获取在等待的URL *应添加至少一个URL到此集合以启动蜘蛛 */ public Collection getWorkloadWaiting() { return workloadWaiting; } /** *获取被处理过的URL */ public Collection getWorkloadProcessed() { return workloadProcessed; } /** *清空所有 */ public void clear() { getWorkloadError().clear(); getWorkloadWaiting().clear(); getWorkloadProcessed().clear(); } /** *设置一标志,使begin方法在完成之前返回 */ public void cancel() { cancel = true; } public void addURL(URL url) { if ( getWorkloadWaiting().contains(url) ) return; if ( getWorkloadError().contains(url) ) return; if ( getWorkloadProcessed().contains(url) ) return; log("正添加到工作区:" + url ); getWorkloadWaiting().add(url); } public void processURL(URL url) { try { log("正在处理:" + url ); //获取URL的内容 URLConnection connection = url.openConnection(); if ( (connection.getContentType()!=null) && !connection.getContentType().toLowerCase().s tartsWith("text/") ) { getWorkloadWaiting().remove(url); getWorkloadProcessed().add(url); log("不会进行正理,因为类型为:" + connection.getContentType() ); return; } //读取URL InputStream is = connection.getInputStream(); Reader r = new InputStreamReader(is); //解析URL HTMLEditorKit.Parser parse = new HTMLParse().getParser(); parse.parse(r,new Parser(url),true); } catch ( IOException e ) { getWorkloadWaiting().remove(url); getWorkloadError().add(url); log("错误:" + url ); report.spiderURLError(url); return; } 
//标记此URL已完成 getWorkloadWaiting().remove(url); getWorkloadProcessed().add(url); log("已完成:" + url ); } public void begin() { cancel = false; while ( !getWorkloadWaiting().isEmpty() && !cancel ) { Object list[] = getWorkloadWaiting().toArray(); for ( int i=0;(i<list.length)&&!cancel;i++ ) processURL((URL)list[i]); } } /** *HTML解析器回调函数 */ protected class Parser extends HTMLEditorKit.ParserCallback { protected URL base; public Parser(URL base) { this.base = base; } public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a,int pos) { String href = (String)a.getAttribute(HTML.Attribute.HREF); if( (href==null) && (t==HTML.Tag.FRAME) ) href = (String)a.getAttribute(HTML.Attribute.SRC); if ( href==null ) return; int i = href.indexOf('#'); if ( i!=-1 ) href = href.substring(0,i); if ( href.toLowerCase().startsWith("mailto:") ) { report.spiderFoundEMail(href); return; } handleLink(base,href); } public void handleStartTag(HTML.Tag t, MutableAttributeSet a,int pos) { handleSimpleTag(t,a,pos);//以同样的方式处理 } protected void handleLink(URL base,String str) { try { URL url = new URL(base,str); if ( report.spiderFoundURL(base,url) ) addURL(url); } catch ( MalformedURLException e ) { log("找到畸形URL:" + str ); } } } /** *由内部调用来记录信息 *仅是把日志写到标准输出 * *参数entry为写到日志的信息 */ public void log(String entry) { System.out.println( (new Date()) + ":" + entry ); } }

例4:解析HTML(HTMLParse.java)

import javax.swing.text.html.*;

/**
 * Thin subclass of HTMLEditorKit whose only purpose is to expose the
 * kit's HTML parser through a public method.
 */
public class HTMLParse extends HTMLEditorKit {

    /**
     * Returns the parser supplied by the superclass.
     *
     * @return the HTMLEditorKit parser
     */
    public HTMLEditorKit.Parser getParser() {
        HTMLEditorKit.Parser inherited = super.getParser();
        return inherited;
    }
}

 

0
相关文章