博客
关于我
强烈建议你试试无所不能的chatGPT,快点击我
用Java实现网络爬虫
阅读量:5099 次
发布时间:2019-06-13

本文共 5432 字,大约阅读时间需要 18 分钟。

myCrawler.java

package WebCrawler;

import java.io.File;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.Queue;
import java.util.Set;

/**
 * A simple breadth-first web crawler: starting from a list of seed URLs it
 * downloads each page to {@link #SAVEPATH}, extracts its hyperlinks, and
 * enqueues them one depth level deeper, until the depth limit is reached.
 */
public class MyCrawler {

    /** Directory where downloaded pages are written. */
    private static final String SAVEPATH = "C:" + File.separator + "downloadURL";

    /**
     * Crawls breadth-first from the given seeds up to {@code depth} levels.
     *
     * @param urls  seed URLs (constructed at depth 1)
     * @param depth maximum depth (inclusive) to crawl
     */
    public void crawl(ArrayList<URL> urls, int depth) {
        Queue<URL> queue = new LinkedList<URL>();
        // Track visited pages by their URL *string*. The original kept URL
        // objects here and then called visited.contains(String), which could
        // never match, so the same page could be downloaded repeatedly.
        Set<String> visited = new HashSet<String>();
        queue.addAll(urls);
        while (!queue.isEmpty()) {
            URL head = queue.poll();
            // BFS dequeues in non-decreasing depth order, so once one entry
            // exceeds the limit, all remaining entries do too.
            if (head.getDepth() > depth) {
                break;
            }
            String address = head.toString();
            if (!visited.add(address)) {
                continue; // already downloaded
            }
            String page = HtmlParserTool.getPage(address);
            String charset = HtmlParserTool.getCharset(page);
            // Replace characters that are illegal in Windows file names.
            String urlFullPath = SAVEPATH + File.separator
                    + address.replaceAll("[?:<>*|]", "_") + ".html";
            HtmlParserTool.writeToDisk(urlFullPath, page, charset); // save to disk
            for (String link : HtmlParserTool.extractLinks(page)) {
                if (!visited.contains(link)) {
                    queue.add(new URL(link, head.getDepth() + 1));
                }
            }
        }
    }

    public static void main(String[] args) throws Exception {
        ArrayList<URL> seeds = new ArrayList<URL>();
        seeds.add(new URL("http://www.baidu.com"));
        new MyCrawler().crawl(seeds, 1);
    }
}

 

HtmlParserTool.java

package WebCrawler;

import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.util.ArrayList;

import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.htmlparser.Parser;
import org.htmlparser.Tag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.visitors.NodeVisitor;

/**
 * Static helpers for fetching an HTML page, extracting its hyperlinks,
 * sniffing its declared charset, and writing it to disk.
 */
public class HtmlParserTool {

    /** Returns true if the string looks like an http(s) address. */
    private static boolean isValidUrl(String url) {
        // "https" also starts with "http", so one prefix test covers both.
        // (The original used the non-short-circuit '|' operator plus a
        // redundant "https" check.)
        return url.startsWith("http");
    }

    /**
     * Extracts every distinct http(s) hyperlink from the given HTML content.
     *
     * @param content raw HTML text
     * @return the links in document order, without duplicates; empty on parse failure
     */
    public static ArrayList<String> extractLinks(String content) {
        final ArrayList<String> links = new ArrayList<String>();
        try {
            Parser parser = new Parser(content);
            parser.visitAllNodesWith(new NodeVisitor() {
                @Override
                public void visitTag(Tag tag) {
                    if (tag instanceof LinkTag) {
                        String link = ((LinkTag) tag).getLink();
                        if (isValidUrl(link) && !links.contains(link)) {
                            links.add(link);
                        }
                    }
                }
            });
        } catch (Exception e) {
            e.printStackTrace();
        }
        return links;
    }

    /**
     * Scans the page for a {@code charset="..."} declaration.
     *
     * @return the charset name, or null when none is found (the original threw
     *         StringIndexOutOfBoundsException when "charset" was absent, so the
     *         null check in getPage could never trigger)
     */
    // NOTE(review): only the quoted form charset="xxx" is recognized;
    // unquoted declarations (charset=utf-8>) return null.
    public static String getCharset(String content) {
        int idx = content.indexOf("charset");
        if (idx == -1) {
            return null;
        }
        int valueStart = idx + 9; // skip over: charset="
        int valueEnd = content.indexOf("\"", valueStart);
        if (valueEnd < valueStart) {
            return null; // malformed or unquoted declaration
        }
        return content.substring(valueStart, valueEnd);
    }

    /**
     * Downloads the page at {@code url} and decodes it with the charset the
     * page itself declares (falling back to the platform default).
     *
     * @return the page text, or "" on any failure
     */
    public static String getPage(String url) {
        String content = "";
        // try-with-resources: the original leaked the client/response when an
        // exception was thrown before close().
        try (CloseableHttpClient client = HttpClients.createDefault();
             CloseableHttpResponse response = client.execute(new HttpGet(url))) {
            // Buffer the raw bytes once so we can decode them *from bytes* with
            // the declared charset. The original decoded with the platform
            // charset and then round-tripped through getBytes(), which corrupts
            // any character the platform charset cannot represent.
            ByteArrayOutputStream buffer = new ByteArrayOutputStream();
            InputStream in = response.getEntity().getContent();
            byte[] chunk = new byte[4096];
            int n;
            while ((n = in.read(chunk)) != -1) {
                buffer.write(chunk, 0, n);
            }
            byte[] raw = buffer.toByteArray();
            // First pass with the platform default, just to locate the meta tag.
            content = new String(raw);
            String charset = getCharset(content);
            if (charset != null) {
                content = new String(raw, charset);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return content;
    }

    /**
     * Writes the page content to {@code path} using {@code charset}, or the
     * platform default when {@code charset} is null (the original NPE'd).
     */
    public static void writeToDisk(String path, String content, String charset) {
        File file = new File(path);
        try (OutputStream out = new FileOutputStream(file)) {
            out.write(charset != null ? content.getBytes(charset) : content.getBytes());
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

 

 

URL.java

package WebCrawler;

/**
 * A crawl-queue entry: a URL string plus the BFS depth at which it was found.
 * Seeds are created at depth 1; each extracted link is one level deeper.
 */
public class URL {

    private String url;   // page address, e.g. "http://www.baidu.com"
    private int depth;    // BFS level, 1-based

    /** Creates a seed entry at depth 1. */
    public URL(String url) {
        this(url, 1);
    }

    public URL(String url, int depth) {
        this.url = url;
        this.depth = depth;
    }

    @Override
    public String toString() {
        return this.url;
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public int getDepth() {
        return depth;
    }

    public void setDepth(int depth) {
        this.depth = depth;
    }

    /**
     * Two entries are equal when they point to the same address, regardless of
     * the depth at which they were discovered. The original class defined no
     * equals/hashCode, so collection membership tests (e.g. the crawler's
     * visited list) degraded to reference identity and never matched.
     */
    @Override
    public boolean equals(Object o) {
        if (this == o) {
            return true;
        }
        if (!(o instanceof URL)) {
            return false;
        }
        URL other = (URL) o;
        return url == null ? other.url == null : url.equals(other.url);
    }

    @Override
    public int hashCode() {
        return url == null ? 0 : url.hashCode();
    }
}

 

转载于:https://www.cnblogs.com/finalboss1987/p/5438532.html

你可能感兴趣的文章
流媒体技术学习笔记之(三)Nginx-Rtmp-Module统计某频道在线观看流的客户数
查看>>
centos6.5安装mongodb2.6
查看>>
(项目)在线教育平台(一)
查看>>
R中基本函数学习[转载]
查看>>
行内元素
查看>>
电机+1舵机+红外控制+灯+LCD
查看>>
EJB到底是什么???
查看>>
js创建对象的模式介绍
查看>>
Spring的@ExceptionHandler和@ControllerAdvice统一处理异常
查看>>
1.输入年月,打印该月的日历,同Windows右下角日历
查看>>
Google提出的新型激活函数:Swish
查看>>
向大神看齐: 如何阅读大型前端开源项目的源码(转)
查看>>
数据挖掘领域最有影响力的18个算法(转载)
查看>>
Jenkins入坑记
查看>>
在Dictionary使用foreach的注意
查看>>
[转]职业规划中的“我想要”和“我需要”
查看>>
Linux中gdb 查看core堆栈信息
查看>>
【Python】SyntaxError: Non-ASCII character '\xe8' in file
查看>>
TensorFlow实战学习笔记(14)------VGGNet
查看>>
Docker的安装使用-第1章
查看>>