使用java爬取页面数据,Java使用Matcher进行内容循环比对
使用java爬取页面数据,Java使用Matcher进行内容循环比对 //ip的正则表达式规则 private final static String IP_REG = "\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3} \\d{1,6}"; //记录爬取到的ip数量 private static int IPNumber = 0; 参数说明:String url :需要爬取代理IP的网址 方法属性: <dependency> <groupId>org.jsoup</groupId> <artifactId>jsoup</artifactId> <version>1.10.2</version> </dependency>
项目说明:
此类工具的本质是文本匹配:使用正则表达式对页面内容进行循环匹配,得到需要的数据。本文主要用于代理IP的爬取、代理IP的可用性校验,以及将结果存入数据库;本教程仅作学习交流使用,请勿将此代码用于非正常用途。
代码实现:
1.引入需求的maven
<!-- jsoup: provides the Jsoup.connect(...) HTTP/HTML client used by the crawler -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.10.2</version>
</dependency>
2.配置文件:ftp.properties。我这里使用的读取方法比较老,大家也可以使用@Value注解去获取yml文件的值
在resources中添加此文件
# 66ip proxy list URL; the crawler replaces the literal "page" with the page number
bbIP = http://www.66ip.cn/page.html
# kuaidaili free proxy list URL ("page" is the page-number placeholder)
kuaidaili = https://free.kuaidaili.com/free/inha/page/
# yqie proxy list URL ("page" is the page-number placeholder)
yqie = http://ip.yqie.com/proxygaoni/index_page.htm
3.工具类:发送请求并匹配返回内容
方法属性:
// Regular expression for one "ip port" pair in the page text: a dotted-quad
// IPv4 address, a single space, then a port of 1 to 6 digits.
private static final String IP_REG = "\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3} \\d{1,6}";
// Number of usable proxy IPs collected by the crawler.
private static int IPNumber = 0;
参数说明:String url :需要爬取代理IP的网址
int wantNumber:需要爬取的数量(数量越多,爬取的页面就会越多。也就会更慢)
//开始进行ip爬虫
public HashMap<String String> findIP (String url int wantNumber){
HashMap<String String> ipMap = new HashMap<>();
for (int i = 1; i < 100; i ) {
String finalURL = url.replace("page" String.valueOf(i)); //组合出IP网页的地址,根据页数
log.info("--------------------当前页数:" i);
log.info("--------------------当前网址:" finalURL);
if (wantNumber >= IPNumber){
try {
Document doc = Jsoup.connect(finalURL)
.header("Accept" "text/html application/xhtml xml application/xml;q=0.9 image/webp */*;q=0.8")
.header("Accept-Encoding" "gzip deflate sdch")
.header("Accept-Language" "zh-CN zh;q=0.8 en;q=0.6")
.header("Cache-Control" "max-age=0")
.header("User-Agent" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML like Gecko) Chrome/51.0.2704.103 Safari/537.36")
.header("Cookie" "Hm_lvt_7ed65b1cc4b810e9fd37959c9bb51b31=1462812244; _gat=1; _ga=GA1.2.1061361785.1462812244")
.header("Host" "www.kuaidaili.com")
.header("Referer" "http://www.kuaidaili.com/free")
.timeout(30 * 1000) //设置连接超时为30秒
.get();
Matcher matcher = Pattern.compile(IP_REG).matcher(doc.text());
int matcher_start = 0;
while (matcher.find(matcher_start)){
String group = matcher.group();
String[] strs = group.split(" ");
String ip = strs[0];
String port = strs[1];
//检验获得的端口是否可用,调用checkProxy返回true或者false
if (checkProxy(ip Integer.valueOf(port))) {
log.info("--------------------获取到可用代理IP:" ip ":" port);
log.info("--------------------ip地址:" ip);
log.info("--------------------端口号:" port);
ipMap.put(ip port);
IPNumber = IPNumber 1;
log.info("--------------------可用代理IP:" ip ":" port "已入MAP集合,等待入库");
}
matcher_start = matcher.end();
}
} catch (Exception e) {
e.printStackTrace();
}
}else {
break;
}
}
return ipMap;
}
3.工具类:验证获取的代理IP是否可用
//验证获得的代理是否可用
public static boolean checkProxy(String ip Integer port) {
try {
//http://1212.ip138.com/ic.asp 可以换成任何比较快的网页
Jsoup.connect("http://1212.ip138.com/ic.asp")
.timeout(2 * 1000)
.proxy(ip port)
.get();
return true;
} catch (Exception e) {
log.info("--------------------获取到代理IP:" ip ":" port "---访问失败");
return false;
}
}
4.Controller层:
package cn.axin229913.SendMessage.Controller;
import cn.axin229913.SendMessage.Pojo.Result;
import cn.axin229913.SendMessage.Service.ToolService;
import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.annotation.Configuration;
import org.springframework.scheduling.annotation.EnableScheduling;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;
import org.springframework.web.bind.annotation.CrossOrigin;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RestController;
import java.io.IOException;
@RestController
@RequestMapping("/Tool")
@CrossOrigin //前后端进行跨域操作
public class ToolController {
@Autowired
private ToolService toolService;
@RequestMapping("findIP")
public Result findIP(String web int wantNumber) throws Exception {
return toolService.findIP(web wantNumber);
}
}
5.Service层:
package cn.axin229913.SendMessage.Service;
import cn.axin229913.SendMessage.Pojo.Result;
public interface ToolService {
Result findIP (String web int wantNumber) throws Exception;
}
Impl层:(这里的switch case的值应该使用枚举进行定义,我这里只是为了展示方便,大家不要盲目照抄哦)
package cn.axin229913.SendMessage.Service.Impl;
import cn.axin229913.SendMessage.Mapper.ToolMapper;
import cn.axin229913.SendMessage.Pojo.Result;
import cn.axin229913.SendMessage.Pojo.ipPojo;
import cn.axin229913.SendMessage.Service.ToolService;
import cn.axin229913.SendMessage.Tools.ProxyTool;
import cn.axin229913.SendMessage.Tools.SendMessage;
import cn.axin229913.SendMessage.Tools.uploadTool;
import lombok.extern.slf4j.Slf4j;
import org.apache.http.HttpEntity;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import java.io.*;
import java.util.HashMap;
import java.util.List;
import java.util.Properties;
@Slf4j
@Service
public class ToolServiceImpl implements ToolService {
@Autowired
private ToolMapper toolMapper;
@Override
public Result findIP (String web int wantNumber) throws Exception {
Result result = new Result();
Properties properties = new Properties();
InputStream in = uploadTool.class.getResourceAsStream("/ftp.properties");
BufferedReader bf = new BufferedReader(new InputStreamReader(in));
properties.load(bf);
String website;
switch (web){
case "2" :
website = properties.getProperty("kuaidaili");break;
case "3" :
website = properties.getProperty("yqie");break;
case "1" :
default :
website = properties.getProperty("bbip");break;
}
ProxyTool proxyTool = new ProxyTool();
HashMap<String String> ipMap = proxyTool.findIP(website wantNumber);
for (String ip : ipMap.keySet()){
String ip1 = ip;
String port1 = ipMap.get(ip);
List<ipPojo> ip2 = toolMapper.findIP(ip1 port1);
if (ip2.isEmpty()){
int rows = toolMapper.addIP2DB(ip1 port1 "1");
if (rows != 0){
log.info("--------------------可用代理IP:" ip1 ":" port1 "已入数据");
result.setStatus(200).setMsg(website "已爬取" (wantNumber 1) "条数据,且已入数据库");
}else {
result.setStatus(200).setMsg("可用代理IP:" ip1 ":" port1 "入库错误!!!");
log.info("--------------------可用代理IP:" ip1 ":" port1 "入库错误!!!");
}
}
result.setStatus(200).setMsg(website "已爬取" (wantNumber 1) "条数据,且已入数据库");
}
return result;
}
}
6.Mapper层:
package cn.axin229913.SendMessage.Mapper;
import cn.axin229913.SendMessage.Pojo.ipPojo;
import org.apache.ibatis.annotations.Mapper;
import org.apache.ibatis.annotations.Param;
import org.springframework.stereotype.Repository;
import java.util.List;
@Mapper
@Repository
public interface ToolMapper {
int addIP2DB(@Param("ip")String ip
@Param("port")String port
@Param("status")String status);
List<ipPojo> findIP(@Param("ip")String ip
@Param("port")String port);
}
7.Mapper.xml文件:
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE mapper PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN" "http://mybatis.org/dtd/mybatis-3-mapper.dtd">
<mapper namespace="cn.axin229913.SendMessage.Mapper.ToolMapper">
    <!-- Column-to-property mapping for Ip_Proxy rows -->
    <resultMap id="IpMap" type="cn.axin229913.SendMessage.Pojo.ipPojo">
        <id column="id" property="id" />
        <result column="ip" property="ip" />
        <result column="port" property="port" />
        <result column="status" property="status" />
    </resultMap>
    <!-- Inserts one proxy row; id is passed as null and left to the database -->
    <insert id="addIP2DB" parameterType="String">
        insert into Ip_Proxy(id, ip, port, status) values (null, #{ip}, #{port}, #{status})
    </insert>
    <!-- Looks up rows with status 1 for the given ip and port; only id is selected -->
    <select id="findIP" parameterType="String" resultMap="IpMap">
        select id from Ip_Proxy where ip = #{ip} and port = #{port} and status = 1
    </select>
</mapper>
8.Pojo类:
package cn.axin229913.SendMessage.Pojo;
import com.baomidou.mybatisplus.annotation.IdType;
import com.baomidou.mybatisplus.annotation.TableId;
import com.baomidou.mybatisplus.annotation.TableName;
import lombok.Data;
import lombok.experimental.Accessors;
/**
 * One row of the {@code Ip_Proxy} table. Accessors are generated by Lombok and
 * setters return {@code this} so calls can be chained.
 */
@Data
@Accessors(chain = true)
@TableName("Ip_Proxy") // the entity maps one-to-one onto this table
public class ipPojo {

    /** Primary key; auto-incremented. */
    @TableId(type = IdType.AUTO)
    private Integer id;

    /** Proxy IP address. */
    private String ip;

    /** Proxy port, kept as text. */
    private String port;

    /** Row status; 1 means the proxy is in normal use. */
    private String status;
}
9.数据库字段执行sql:
数据库字段说明:
id:主键自增
ip:IP号
port:端口号
status:ip的状态,默认是1,表示正常使用
SET NAMES utf8mb4;
SET FOREIGN_KEY_CHECKS = 0;
-- ----------------------------
-- Table structure for Ip_Proxy
-- NOTE: the DROP below removes any existing Ip_Proxy table and its data.
-- ----------------------------
DROP TABLE IF EXISTS `Ip_Proxy`;
CREATE TABLE `Ip_Proxy` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `ip` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL,
  `port` varchar(11) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL,
  -- status defaults to '1' (in normal use), as stated in the field description
  `status` varchar(11) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL DEFAULT '1',
  PRIMARY KEY (`id`) USING BTREE
) ENGINE = InnoDB AUTO_INCREMENT = 21 CHARACTER SET = latin1 COLLATE = latin1_swedish_ci ROW_FORMAT = Dynamic;
SET FOREIGN_KEY_CHECKS = 1;
10.启动效果图:
编辑
编辑
编辑
方法到此处已完结!!!
如有错误,望大神指正!!