MyCrawler.java
package WebCrawler;

import java.io.File;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.Queue;

public class MyCrawler {

    private static final String SAVEPATH = "C:" + File.separator + "downloadURL";

    public void crawl(ArrayList<URL> urls, int depth) {
        // Initialize the queue with the seed URLs
        Queue<URL> q = new LinkedList<>();
        ArrayList<String> visited = new ArrayList<>();
        q.addAll(urls);
        while (!q.isEmpty()) {
            URL head = q.poll(); // dequeue
            if (head.getDepth() > depth) {
                break; // BFS order: every remaining URL is at least this deep
            }
            if (visited.contains(head.toString())) {
                continue; // the same address may have been enqueued more than once
            }
            visited.add(head.toString());
            String page = HtmlParserTool.getPage(head.toString());
            String charset = HtmlParserTool.getCharset(page);
            // Replace characters that are illegal in file names (including / and \)
            String urlFullPath = SAVEPATH + File.separator
                    + head.toString().replaceAll("[\\\\/:?\"<>*|]", "_") + ".html";
            HtmlParserTool.writeToDisk(urlFullPath, page, charset); // save to disk
            ArrayList<String> toVisit = HtmlParserTool.extractLinks(page);
            for (String s : toVisit) {
                if (!visited.contains(s)) {
                    q.add(new URL(s, head.getDepth() + 1));
                }
            }
        }
    }

    public static void main(String[] args) throws Exception {
        ArrayList<URL> urls = new ArrayList<>();
        urls.add(new URL("http://www.baidu.com"));
        new MyCrawler().crawl(urls, 1);
    }
}
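A note on the loop above: visited only grows when a URL is dequeued, so the same address can still be enqueued several times before its first copy is processed (the commented-out visited.add(s) in the original hints at this), and ArrayList.contains is a linear scan. Below is a minimal sketch of an alternative crawl() that marks each address as seen at enqueue time instead; it assumes you are free to add java.util.HashSet to the imports, and it omits the save-to-disk step for brevity.

    // Sketch only: de-duplicate at enqueue time, using a HashSet for O(1) lookups.
    public void crawl(ArrayList<URL> urls, int depth) {
        Queue<URL> q = new LinkedList<>(urls);
        HashSet<String> seen = new HashSet<>();
        for (URL u : urls) {
            seen.add(u.toString());
        }
        while (!q.isEmpty()) {
            URL head = q.poll();
            if (head.getDepth() > depth) {
                break;
            }
            String page = HtmlParserTool.getPage(head.toString());
            // ... save the page to disk as above ...
            for (String s : HtmlParserTool.extractLinks(page)) {
                if (seen.add(s)) { // add() returns false for duplicates
                    q.add(new URL(s, head.getDepth() + 1));
                }
            }
        }
    }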
HtmlParserTool.java
package WebCrawler;

import java.io.File;
import java.io.FileOutputStream;
import java.io.OutputStream;
import java.util.ArrayList;

import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.htmlparser.Parser;
import org.htmlparser.Tag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.visitors.NodeVisitor;

public class HtmlParserTool {

    // Check whether a string looks like an HTTP(S) URL
    private static boolean isValidUrl(String url) {
        return url.startsWith("http://") || url.startsWith("https://");
    }

    // Extract the hyperlinks contained in a page
    public static ArrayList<String> extractLinks(String content) {
        final ArrayList<String> links = new ArrayList<>();
        try {
            Parser parser = new Parser(content);
            NodeVisitor visitor = new NodeVisitor() {
                @Override
                public void visitTag(Tag tag) {
                    if (tag instanceof LinkTag) {
                        LinkTag link = (LinkTag) tag;
                        String linkString = link.getLink();
                        if (isValidUrl(linkString) && !links.contains(linkString)) {
                            links.add(linkString);
                        }
                    }
                }
            };
            parser.visitAllNodesWith(visitor);
        } catch (Exception e) {
            e.printStackTrace();
        }
        return links;
    }

    // Read the charset declared in the page, e.g. charset="utf-8" or charset=gb2312;
    // returns null if no declaration is found
    public static String getCharset(String content) {
        int startIdx = content.indexOf("charset=");
        if (startIdx < 0) {
            return null;
        }
        startIdx += "charset=".length();
        if (startIdx < content.length() && content.charAt(startIdx) == '"') {
            startIdx++; // skip the opening quote
        }
        int endIdx = startIdx;
        while (endIdx < content.length() && "\"'> ;".indexOf(content.charAt(endIdx)) < 0) {
            endIdx++;
        }
        return content.substring(startIdx, endIdx);
    }

    // Download a page and decode it with its declared charset
    public static String getPage(String url) {
        String content = "";
        try (CloseableHttpClient client = HttpClients.createDefault()) {
            HttpGet request = new HttpGet(url);
            try (CloseableHttpResponse response = client.execute(request)) {
                // Read the raw bytes first, then decode. Reading through a Reader with
                // the platform default charset and re-encoding later corrupts
                // non-ASCII pages.
                byte[] bytes = EntityUtils.toByteArray(response.getEntity());
                // ISO-8859-1 maps every byte to a char, so it is safe for sniffing
                content = new String(bytes, "ISO-8859-1");
                String charset = getCharset(content);
                if (charset != null) {
                    content = new String(bytes, charset);
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return content;
    }

    // Write the page content to disk in the given charset
    public static void writeToDisk(String path, String content, String charset) {
        try {
            File file = new File(path);
            file.getParentFile().mkdirs(); // make sure the target folder exists
            OutputStream o = new FileOutputStream(file);
            o.write(content.getBytes(charset != null ? charset : "UTF-8"));
            o.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
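To sanity-check the two pure-string helpers without touching the network, a throwaway main like the following works. The class name HtmlParserToolTest and the sample HTML are made up for illustration; it assumes htmlparser is on the classpath and relies on new Parser(content) accepting raw markup, exactly as extractLinks above already does.

package WebCrawler;

import java.util.ArrayList;

// Hypothetical smoke test for the string-only helpers in HtmlParserTool.
public class HtmlParserToolTest {
    public static void main(String[] args) {
        String html = "<html><head><meta charset=\"utf-8\"></head><body>"
                + "<a href=\"http://example.com/a\">a</a>"
                + "<a href=\"/relative\">skipped</a>"
                + "</body></html>";
        System.out.println(HtmlParserTool.getCharset(html)); // utf-8
        ArrayList<String> links = HtmlParserTool.extractLinks(html);
        System.out.println(links); // [http://example.com/a] -- the relative link is filtered out
    }
}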
URL.java
package WebCrawler;

public class URL {

    private String url;
    private int depth;

    public URL(String url) {
        this(url, 1);
    }

    public URL(String url, int depth) {
        this.url = url;
        this.depth = depth;
    }

    @Override
    public String toString() {
        return this.url;
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public int getDepth() {
        return depth;
    }

    public void setDepth(int depth) {
        this.depth = depth;
    }
}
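Because URL stores a plain String address, MyCrawler compares strings in its visited list. If you ever keep URL objects themselves in a set or call contains on a collection of them, the class also needs equals and hashCode. A sketch to add inside the URL class, keyed on the address only (treating depth as metadata, which is an assumption about the intended identity semantics):

    // Sketch: value equality based on the address, ignoring depth.
    @Override
    public boolean equals(Object o) {
        if (this == o) {
            return true;
        }
        if (!(o instanceof URL)) {
            return false;
        }
        return url.equals(((URL) o).url);
    }

    @Override
    public int hashCode() {
        return url.hashCode();
    }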