使用java爬取页面数据,Java使用Matcher进行内容循环比对
使用java爬取页面数据,Java使用Matcher进行内容循环比对//ip的正则表达式规则 private final static String IP_REG = "\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3} \\d{1,6}"; //记录爬取到的ip数量 private static int IPNumber = 0;参数说明:String url :需要爬取代理IP的网址方法属性: <dependency> <groupId>org.jsoup</groupId> <artifactId>jsoup</artifactId> <version>1.10.2</version> </
项目说明:
此类工具的本质是进行文本匹配,会使用到正则表达式进行循环匹配,得到需要的数据,本文主要使用于代理IP的爬取,并且做代理IP的校验,同时存入数据库的操作;本教程仅作学习交流使用,请勿将此代码用于非正常用途。
代码实现:
1.引入所需的maven依赖
        <!-- jsoup: HTML fetching/parsing library used by the crawler (Jsoup.connect(...).get()). -->
        <!-- NOTE(review): 1.10.2 is an old release; check for a newer version before production use. -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.10.2</version>
        </dependency>
    

2.配置文件:ftp.properties,我这里使用的方法比较老,大家也可以使用@Value注解去获取yml文件中的值
在resources中添加此文件
# Proxy-list page URL templates. The crawler replaces every occurrence of the
# text "page" in the value with the current page number (1, 2, 3, ...), so the
# rest of the URL must not contain the word "page".
# Keys are case-sensitive: code reading them must use exactly the spelling below.
# 66ip proxy list
bbIP = http://www.66ip.cn/page.html
# kuaidaili free proxy list
kuaidaili = https://free.kuaidaili.com/free/inha/page/
# yqie proxy list
yqie = http://ip.yqie.com/proxygaoni/index_page.htm
    

3.工具类:发送请求并匹配返回内容
方法属性:
    // Regex for one "ip port" pair as it appears in the page text: four groups of
    // 1-3 digits separated by dots, a single space, then 1-6 digits for the port,
    // e.g. "1.2.3.4 8080". The quantifiers must be written {1,3} / {1,6}.
    private final static String IP_REG = "\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3} \\d{1,6}";
    // Number of usable proxy IPs collected so far; shared by all instances because it is static.
    private static int IPNumber = 0;
    

参数说明:String url :需要爬取代理IP的网址
int wantNumber:需要爬取的数量(数量越多,爬取的页面就会越多。也就会更慢)
    //开始进行ip爬虫
    public HashMap<String  String> findIP (String url  int wantNumber){
        HashMap<String  String> ipMap = new HashMap<>();
        for (int i = 1; i < 100; i  ) {
            String finalURL = url.replace("page" String.valueOf(i));  //组合出IP网页的地址,根据页数
            log.info("--------------------当前页数:"   i);
            log.info("--------------------当前网址:"   finalURL);
            if (wantNumber >= IPNumber){
                try {
                    Document doc = Jsoup.connect(finalURL)
                            .header("Accept"  "text/html application/xhtml xml application/xml;q=0.9 image/webp */*;q=0.8")
                            .header("Accept-Encoding"  "gzip  deflate  sdch")
                            .header("Accept-Language"  "zh-CN zh;q=0.8 en;q=0.6")
                            .header("Cache-Control"  "max-age=0")
                            .header("User-Agent"  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML  like Gecko) Chrome/51.0.2704.103 Safari/537.36")
                            .header("Cookie"  "Hm_lvt_7ed65b1cc4b810e9fd37959c9bb51b31=1462812244; _gat=1; _ga=GA1.2.1061361785.1462812244")
                            .header("Host"  "www.kuaidaili.com")
                            .header("Referer"  "http://www.kuaidaili.com/free")
                            .timeout(30 * 1000)  //设置连接超时为30秒
                            .get();
                    Matcher matcher = Pattern.compile(IP_REG).matcher(doc.text());
                    int matcher_start = 0;
                    while (matcher.find(matcher_start)){
                        String group = matcher.group();
                        String[] strs = group.split(" ");
                        String ip = strs[0];
                        String port = strs[1];
                        //检验获得的端口是否可用,调用checkProxy返回true或者false
                        if (checkProxy(ip  Integer.valueOf(port))) {
                            log.info("--------------------获取到可用代理IP:"   ip   ":"   port);
                            log.info("--------------------ip地址:"   ip);
                            log.info("--------------------端口号:"   port);
                            ipMap.put(ip  port);
                            IPNumber = IPNumber   1;
                            log.info("--------------------可用代理IP:"   ip   ":"   port   "已入MAP集合,等待入库");
                        }
                        matcher_start = matcher.end();
                    }
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }else {
                break;
            }
        }
        return ipMap;
    }
    

3.工具类:验证获取的代理IP是否可用
    //验证获得的代理是否可用
    public static boolean checkProxy(String ip  Integer port) {
        try {
            //http://1212.ip138.com/ic.asp 可以换成任何比较快的网页
            Jsoup.connect("http://1212.ip138.com/ic.asp")
                    .timeout(2 * 1000)
                    .proxy(ip  port)
                    .get();
            return true;
        } catch (Exception e) {
            log.info("--------------------获取到代理IP:"   ip   ":"   port   "---访问失败");
            return false;
        }
    }
    

4.Controller层:
package cn.axin229913.SendMessage.Controller;
import cn.axin229913.SendMessage.Pojo.Result;
import cn.axin229913.SendMessage.Service.ToolService;
import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.annotation.Configuration;
import org.springframework.scheduling.annotation.EnableScheduling;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;
import org.springframework.web.bind.annotation.CrossOrigin;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RestController;
import java.io.IOException;
@RestController
@RequestMapping("/Tool")
@CrossOrigin    //前后端进行跨域操作
public class ToolController {
    @Autowired
    private ToolService toolService;
        
    @RequestMapping("findIP")
    public Result findIP(String web  int wantNumber) throws Exception {
        return toolService.findIP(web  wantNumber);
    }
}
    

5.Service层:
package cn.axin229913.SendMessage.Service;
import cn.axin229913.SendMessage.Pojo.Result;
public interface ToolService {
    
    Result findIP (String web  int wantNumber) throws Exception;
}
    

Impl层:(这里的switch case的值应该使用枚举进行定义,我这里为了展示方便,大家不要盲目抄哦)
package cn.axin229913.SendMessage.Service.Impl;
import cn.axin229913.SendMessage.Mapper.ToolMapper;
import cn.axin229913.SendMessage.Pojo.Result;
import cn.axin229913.SendMessage.Pojo.ipPojo;
import cn.axin229913.SendMessage.Service.ToolService;
import cn.axin229913.SendMessage.Tools.ProxyTool;
import cn.axin229913.SendMessage.Tools.SendMessage;
import cn.axin229913.SendMessage.Tools.uploadTool;
import lombok.extern.slf4j.Slf4j;
import org.apache.http.HttpEntity;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import java.io.*;
import java.util.HashMap;
import java.util.List;
import java.util.Properties;
@Slf4j
@Service
public class ToolServiceImpl implements ToolService {
    @Autowired
    private ToolMapper toolMapper;
    
    @Override
    public Result findIP (String web  int wantNumber) throws Exception {
        Result result = new Result();
        Properties properties = new Properties();
        InputStream in = uploadTool.class.getResourceAsStream("/ftp.properties");
        BufferedReader bf = new BufferedReader(new InputStreamReader(in));
        properties.load(bf);
        String website;
        switch (web){
            case "2" :
                website = properties.getProperty("kuaidaili");break;
            case "3" :
                website = properties.getProperty("yqie");break;
            case "1" :
            default :
                website = properties.getProperty("bbip");break;
        }
        ProxyTool proxyTool = new ProxyTool();
        HashMap<String  String> ipMap = proxyTool.findIP(website  wantNumber);
        for (String ip : ipMap.keySet()){
            String ip1 = ip;
            String port1 = ipMap.get(ip);
            List<ipPojo> ip2 = toolMapper.findIP(ip1  port1);
            if (ip2.isEmpty()){
                int rows = toolMapper.addIP2DB(ip1  port1  "1");
                if (rows != 0){
                    log.info("--------------------可用代理IP:"   ip1   ":"   port1   "已入数据");
                    result.setStatus(200).setMsg(website "已爬取" (wantNumber 1) "条数据,且已入数据库");
                }else {
                    result.setStatus(200).setMsg("可用代理IP:"   ip1   ":"   port1   "入库错误!!!");
                    log.info("--------------------可用代理IP:"   ip1   ":"   port1   "入库错误!!!");
                }
            }
            result.setStatus(200).setMsg(website "已爬取" (wantNumber 1) "条数据,且已入数据库");
        }
        return result;
    }
    
}
    

6.Mapper层:
package cn.axin229913.SendMessage.Mapper;
import cn.axin229913.SendMessage.Pojo.ipPojo;
import org.apache.ibatis.annotations.Mapper;
import org.apache.ibatis.annotations.Param;
import org.springframework.stereotype.Repository;
import java.util.List;
@Mapper
@Repository
public interface ToolMapper {
    
    int addIP2DB(@Param("ip")String ip 
                @Param("port")String port 
                 @Param("status")String status);
    
    List<ipPojo> findIP(@Param("ip")String ip 
                        @Param("port")String port);
    
}
    

7.Mapper.xml文件:
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE mapper PUBLIC "-//mybatis.org//DTD Mapper 3.0//EN" "http://mybatis.org/dtd/mybatis-3-mapper.dtd">
<mapper namespace="cn.axin229913.SendMessage.Mapper.ToolMapper">
    <!-- Column-to-property mapping shared by the queries below -->
    <resultMap id="IpMap" type="cn.axin229913.SendMessage.Pojo.ipPojo">
        <id column="id" property="id" />
        <result column="ip" property="ip" />
        <result column="port" property="port" />
        <result column="status" property="status" />
    </resultMap>
    <!-- Inserts one proxy row; id is passed as null so the auto-increment column assigns it -->
    <insert id="addIP2DB" parameterType="String">
        insert into Ip_Proxy(id, ip, port, status) values (null, #{ip}, #{port}, #{status})
    </insert>
    <!-- Finds rows with the given ip and port whose status is 1; only id is selected -->
    <select id="findIP" parameterType="String" resultMap="IpMap">
        select id from Ip_Proxy where ip = #{ip} and port = #{port} and status = 1
    </select>
</mapper>
    

8.Pojo类:
package cn.axin229913.SendMessage.Pojo;
import com.baomidou.mybatisplus.annotation.IdType;
import com.baomidou.mybatisplus.annotation.TableId;
import com.baomidou.mybatisplus.annotation.TableName;
import lombok.Data;
import lombok.experimental.Accessors;
/**
 * One row of the {@code Ip_Proxy} table. Accessors are generated by Lombok and
 * setters are chainable.
 */
@TableName("Ip_Proxy")  // the class maps one-to-one onto this table
@Accessors(chain = true)
@Data
public class ipPojo {

    /** Primary key, assigned by the database (auto-increment). */
    @TableId(type = IdType.AUTO)
    private Integer id;

    /** Proxy IP address. */
    private String ip;

    /** Proxy port, stored as text. */
    private String port;

    /** Status of the proxy, stored as text. */
    private String status;
}
    

9.数据库字段执行sql:
数据库字段说明:
id:主键自增
ip:IP号
port:端口号
status:ip的状态,默认是1,表示正常使用
SET NAMES utf8mb4;
SET FOREIGN_KEY_CHECKS = 0;
-- ----------------------------
-- Table structure for Ip_Proxy
-- id:     auto-increment primary key
-- ip:     proxy IP address
-- port:   proxy port (stored as text)
-- status: proxy status (stored as text); the application inserts '1' for usable proxies
-- WARNING: the DROP below deletes any existing Ip_Proxy table and its data.
-- ----------------------------
DROP TABLE IF EXISTS `Ip_Proxy`;
CREATE TABLE `Ip_Proxy`  (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `ip` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL,
  `port` varchar(11) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL,
  `status` varchar(11) CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci NOT NULL,
  PRIMARY KEY (`id`) USING BTREE
) ENGINE = InnoDB AUTO_INCREMENT = 21 CHARACTER SET = latin1 COLLATE = latin1_swedish_ci ROW_FORMAT = Dynamic;
SET FOREIGN_KEY_CHECKS = 1;
    

10.启动效果图:


编辑


编辑


编辑
方法到此处已完结!!!
如有错误,望大神指正!!




