增加爬虫智能化配置

This commit is contained in:
xiongxiaoyang 2019-11-15 21:32:12 +08:00
parent 95149646f6
commit bebba0da31
8 changed files with 174 additions and 97 deletions

View File

@ -0,0 +1,19 @@
package com.java2nb.books.config;
import lombok.Data;
import org.springframework.boot.context.properties.ConfigurationProperties;
import org.springframework.stereotype.Component;
import java.util.Date;
/**
 * Crawler settings bound from configuration properties under the {@code crawl} prefix.
 *
 * <p>Registered as a Spring component, so the controller and the crawl service can
 * inject it. Accessors are generated by Lombok's {@code @Data}.
 *
 * <p>All numeric fields are boxed types. NOTE(review): they are presumably {@code null}
 * when the matching property is absent; callers that unbox them should confirm the
 * property is always configured.
 */
@Data
@Component
@ConfigurationProperties(prefix="crawl")
public class CrawlConfig {
    /** Number of crawler threads. */
    private Integer threadCount;
    /** Crawl priority: 1 = rating first, 2 = update time first. */
    private Integer priority;
    /** Minimum novel rating. */
    private Float lowestScore;
    /** Minimum novel update time, kept as text (the shipped configuration uses {@code yyyy-MM-dd}). */
    private String minUptTime;
    /** Maximum number of items to crawl. */
    private Integer maxNumber;
}

View File

@ -1,8 +1,13 @@
package com.java2nb.books.controller; package com.java2nb.books.controller;
import java.util.List; import java.util.*;
import java.util.Map;
import com.java2nb.books.config.CrawlConfig;
import com.java2nb.common.utils.GenUtils;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.configuration.Configuration;
import org.apache.commons.configuration.ConfigurationException;
import org.apache.commons.configuration.PropertiesConfiguration;
import org.apache.shiro.authz.annotation.RequiresPermissions; import org.apache.shiro.authz.annotation.RequiresPermissions;
import org.springframework.beans.factory.annotation.Autowired; import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Controller; import org.springframework.stereotype.Controller;
@ -30,12 +35,16 @@ import com.java2nb.common.utils.R;
* @date 2019-11-15 03:42:54 * @date 2019-11-15 03:42:54
*/ */
@Slf4j
@Controller @Controller
@RequestMapping("/books/bookCrawl") @RequestMapping("/books/bookCrawl")
public class BookCrawlController { public class BookCrawlController {
@Autowired @Autowired
private BookCrawlService bookCrawlService; private BookCrawlService bookCrawlService;
@Autowired
private CrawlConfig crawlConfig;
@GetMapping() @GetMapping()
@RequiresPermissions("books:bookCrawl:bookCrawl") @RequiresPermissions("books:bookCrawl:bookCrawl")
String BookCrawl() { String BookCrawl() {
@ -63,11 +72,9 @@ public class BookCrawlController {
} }
@ApiOperation(value = "修改页面", notes = "修改页面") @ApiOperation(value = "修改页面", notes = "修改页面")
@GetMapping("/edit/{id}") @GetMapping("/edit")
@RequiresPermissions("books:bookCrawl:edit") String edit( Model model) throws Exception {
String edit(@PathVariable("id") Long id, Model model) { model.addAttribute("property", crawlConfig);
BookCrawlDO bookCrawl = bookCrawlService.get(id);
model.addAttribute("bookCrawl", bookCrawl);
return "books/bookCrawl/edit"; return "books/bookCrawl/edit";
} }
@ -100,9 +107,8 @@ public class BookCrawlController {
@ApiOperation(value = "修改", notes = "修改") @ApiOperation(value = "修改", notes = "修改")
@ResponseBody @ResponseBody
@RequestMapping("/update") @RequestMapping("/update")
@RequiresPermissions("books:bookCrawl:edit") public R update(CrawlConfig config) {
public R update( BookCrawlDO bookCrawl) { crawlConfig = config;
bookCrawlService.update(bookCrawl);
return R.ok(); return R.ok();
} }

View File

@ -1,5 +1,6 @@
package com.java2nb.books.service.impl; package com.java2nb.books.service.impl;
import com.java2nb.books.config.CrawlConfig;
import com.java2nb.books.dao.BookContentDao; import com.java2nb.books.dao.BookContentDao;
import com.java2nb.books.dao.BookDao; import com.java2nb.books.dao.BookDao;
import com.java2nb.books.dao.BookIndexDao; import com.java2nb.books.dao.BookIndexDao;
@ -7,6 +8,7 @@ import com.java2nb.books.domain.BookContentDO;
import com.java2nb.books.domain.BookDO; import com.java2nb.books.domain.BookDO;
import com.java2nb.books.domain.BookIndexDO; import com.java2nb.books.domain.BookIndexDO;
import com.java2nb.books.util.RestTemplateUtil; import com.java2nb.books.util.RestTemplateUtil;
import com.java2nb.common.utils.DateUtils;
import org.springframework.beans.factory.annotation.Autowired; import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.http.HttpStatus; import org.springframework.http.HttpStatus;
import org.springframework.http.ResponseEntity; import org.springframework.http.ResponseEntity;
@ -26,6 +28,9 @@ import org.springframework.web.client.RestTemplate;
@Service @Service
public class BookCrawlServiceImpl implements BookCrawlService { public class BookCrawlServiceImpl implements BookCrawlService {
@Autowired
private CrawlConfig crawlConfig;
private boolean isInteruptBiquDaoCrawl;//是否中断笔趣岛爬虫程序 private boolean isInteruptBiquDaoCrawl;//是否中断笔趣岛爬虫程序
private boolean isInteruptBiquTaCrawl;//是否中断笔趣塔爬虫程序 private boolean isInteruptBiquTaCrawl;//是否中断笔趣塔爬虫程序
@ -103,59 +108,66 @@ public class BookCrawlServiceImpl implements BookCrawlService {
private void crawlBook(BookCrawlDO bookCrawl) { private void crawlBook(BookCrawlDO bookCrawl) {
// Split the 7 site categories (1..7) into contiguous ranges, one per worker thread.
final int categoryCount = 7;
Integer configuredThreads = crawlConfig.getThreadCount();
// Clamp to 1..7: a null or zero value would fail on unboxing/division, and a value
// above 7 would produce a zero-sized step that never advances and spawns threads forever.
int threadCount = configuredThreads == null
        ? 1
        : Math.max(1, Math.min(categoryCount, configuredThreads));
int baseSize = categoryCount / threadCount;
int extraCategories = categoryCount % threadCount;
int nextStart = 1;
for (int t = 0; t < threadCount; t++) {
    final int firstCategory = nextStart;
    // The first `extraCategories` workers take one more category so all 7 are covered.
    final int lastCategory = firstCategory + baseSize - 1 + (t < extraCategories ? 1 : 0);
    nextStart = lastCategory + 1;
    new Thread(
            () -> {
                while (true) {
                    // Crawl every category assigned to this worker before pausing;
                    // sleeping inside the per-category step would pin the worker to
                    // the first category of its range.
                    for (int category = firstCategory; category <= lastCategory; category++) {
                        try {
                            switch (bookCrawl.getCrawlWebCode()) {
                                case 1: {
                                    if (isInteruptBiquDaoCrawl) {
                                        return;
                                    }
                                    crawBiqudaoBooks(category);
                                    break;
                                }
                                case 2: {
                                    if (isInteruptBiquTaCrawl) {
                                        return;
                                    }
                                    crawBiquTaBooks(category);
                                    break;
                                }
                                default:
                                    // Unknown site code: nothing for this worker to do.
                                    return;
                            }
                        } catch (Exception e) {
                            // A failing category must not stop the remaining ones.
                            e.printStackTrace();
                        }
                    }
                    try {
                        // Wait a day before the next full pass over this worker's range.
                        Thread.sleep(1000 * 60 * 60 * 24);
                    } catch (InterruptedException e) {
                        Thread.currentThread().interrupt();
                        return;
                    }
                }
            }
    ).start();
}
new Thread(() -> {
for (int j = 21; j <= 29; j++) {
for (int j = 21; j <= 29; j++) { for (int k = 1; k <= 499; k++) {
int finalJ = j; if (isInteruptBiquTaCrawl || isInteruptBiquDaoCrawl) {
new Thread(() -> {
for (int i = 1; i <= 499; i++) {
if(isInteruptBiquTaCrawl || isInteruptBiquDaoCrawl){
return; return;
} }
System.out.println("==============分类============" + finalJ); System.out.println("==============分类============" + j);
System.out.println("==============页码============" + i); System.out.println("==============页码============" + k);
int catId = finalJ; int catId = j;
int page = i; int page = k;
String bookListUrl = "http://book.sfacg.com/List/default.aspx?&tid=" + catId + "&if=1&PageIndex=" + page; String bookListUrl = "http://book.sfacg.com/List/default.aspx?&tid=" + catId + "&if=1&PageIndex=" + page;
@ -168,7 +180,7 @@ public class BookCrawlServiceImpl implements BookCrawlService {
while (isFindBook) { while (isFindBook) {
try { try {
if(isInteruptBiquTaCrawl || isInteruptBiquDaoCrawl){ if (isInteruptBiquTaCrawl || isInteruptBiquDaoCrawl) {
return; return;
} }
long bookNum = Long.parseLong(bookMatcher.group(1)); long bookNum = Long.parseLong(bookMatcher.group(1));
@ -320,7 +332,7 @@ public class BookCrawlServiceImpl implements BookCrawlService {
List<Integer> hasIndexNum = queryIndexCountByBookNameAndBAuthor(bookName, author); List<Integer> hasIndexNum = queryIndexCountByBookNameAndBAuthor(bookName, author);
while (isFindIndex) { while (isFindIndex) {
if(isInteruptBiquTaCrawl || isInteruptBiquDaoCrawl){ if (isInteruptBiquTaCrawl || isInteruptBiquDaoCrawl) {
return; return;
} }
if (!hasIndexNum.contains(indexNum)) { if (!hasIndexNum.contains(indexNum)) {
@ -378,16 +390,18 @@ public class BookCrawlServiceImpl implements BookCrawlService {
} }
} }
}).start();
}
} }).start();
} }
private void crawBiquTaBooks(int i) { private void crawBiquTaBooks(int i) {
String baseUrl = "https://m.biquta.com"; String baseUrl = "https://m.biquta.com";
String catBookListUrlBase = baseUrl + "/class/"; String catBookListUrlBase = baseUrl + "/class/";
if (crawlConfig.getPriority() == 1) {
catBookListUrlBase = baseUrl + "/lhb/";
}
//拼接分类URL //拼接分类URL
int page = 1;//起始页码 int page = 1;//起始页码
int totalPage = page; int totalPage = page;
@ -457,10 +471,9 @@ public class BookCrawlServiceImpl implements BookCrawlService {
try { try {
Float score = Float.parseFloat(scoreMatch.group(1)); Float score = Float.parseFloat(scoreMatch.group(1));
/*if (score < lowestScore) {//数据库空间有限暂时爬取8.0分以上的小说 if (score < crawlConfig.getLowestScore()) {//数据库空间有限暂时爬取8.0分以上的小说
// Thread.sleep(1000 * 60 * 60 * 24);//因为爬的是龙虎榜所以遇到第一个8分以下的之后的都是8分以下的 continue;
continue; }
}*/
String bookName = bookNameMatch.group(1); String bookName = bookNameMatch.group(1);
String author = authoreMatch.group(1); String author = authoreMatch.group(1);
@ -487,6 +500,9 @@ public class BookCrawlServiceImpl implements BookCrawlService {
String updateTimeStr = updateTimeMatch.group(1); String updateTimeStr = updateTimeMatch.group(1);
SimpleDateFormat format = new SimpleDateFormat("yy-MM-dd HH:mm:ss"); SimpleDateFormat format = new SimpleDateFormat("yy-MM-dd HH:mm:ss");
Date updateTime = format.parse(updateTimeStr); Date updateTime = format.parse(updateTimeStr);
if (updateTime.getTime() < new SimpleDateFormat("yyyy-MM-dd").parse(crawlConfig.getMinUptTime()).getTime()) {
continue;
}
Pattern picPatten = Pattern.compile("<img src=\"([^>]+)\"\\s+onerror=\"this.src="); Pattern picPatten = Pattern.compile("<img src=\"([^>]+)\"\\s+onerror=\"this.src=");
Matcher picMather = picPatten.matcher(body); Matcher picMather = picPatten.matcher(body);
if (picMather.find()) { if (picMather.find()) {
@ -605,7 +621,11 @@ public class BookCrawlServiceImpl implements BookCrawlService {
private void crawBiqudaoBooks(final int i) { private void crawBiqudaoBooks(final int i) {
String baseUrl = "https://m.biqudao.com"; String baseUrl = "https://m.biqudao.com";
String catBookListUrlBase = baseUrl + "/bqgelhb/"; String catBookListUrlBase = baseUrl + "/bqgeclass/";
if (crawlConfig.getPriority() == 1) {
catBookListUrlBase = baseUrl + "/bqgelhb/";
}
//拼接分类URL //拼接分类URL
int page = 1;//起始页码 int page = 1;//起始页码
int totalPage = page; int totalPage = page;
@ -680,10 +700,9 @@ public class BookCrawlServiceImpl implements BookCrawlService {
Float score = Float.parseFloat(scoreMatch.group(1)); Float score = Float.parseFloat(scoreMatch.group(1));
/*if (score < lowestScore) {//数据库空间有限暂时爬取8.0分以上的小说 if (score < crawlConfig.getLowestScore()) {//数据库空间有限暂时爬取8.0分以上的小说
Thread.sleep(1000 * 60 * 60 * 24);//因为爬的是龙虎榜所以遇到第一个8分以下的之后的都是8分以下的 continue;
continue; }
}*/
String bookName = bookNameMatch.group(1); String bookName = bookNameMatch.group(1);
String author = authoreMatch.group(1); String author = authoreMatch.group(1);
@ -710,6 +729,9 @@ public class BookCrawlServiceImpl implements BookCrawlService {
String updateTimeStr = updateTimeMatch.group(1); String updateTimeStr = updateTimeMatch.group(1);
SimpleDateFormat format = new SimpleDateFormat("yy-MM-dd HH:mm:ss"); SimpleDateFormat format = new SimpleDateFormat("yy-MM-dd HH:mm:ss");
Date updateTime = format.parse(updateTimeStr); Date updateTime = format.parse(updateTimeStr);
if (updateTime.getTime() < new SimpleDateFormat("yyyy-MM-dd").parse(crawlConfig.getMinUptTime()).getTime()) {
continue;
}
Pattern picPatten = Pattern.compile("<img src=\"([^>]+)\"\\s+onerror=\"this.src="); Pattern picPatten = Pattern.compile("<img src=\"([^>]+)\"\\s+onerror=\"this.src=");
Matcher picMather = picPatten.matcher(body); Matcher picMather = picPatten.matcher(body);
if (picMather.find()) { if (picMather.find()) {

View File

@ -0,0 +1,11 @@
crawl:
  # Number of crawler threads
  threadCount: 1
  # Crawl priority: 1 = rating first, 2 = update time first
  priority: 1
  # Minimum novel rating
  lowestScore: 7.8
  # Minimum novel update date (yyyy-MM-dd); quoted so YAML keeps it a plain string
  # instead of resolving it to a timestamp
  minUptTime: '2000-01-01'
  # Maximum number of items to crawl
  maxNumber: 100000

View File

@ -31,6 +31,7 @@ spring:
type: redis type: redis
datasource: datasource:
type: com.alibaba.druid.pool.DruidDataSource type: com.alibaba.druid.pool.DruidDataSource
driverClassName: com.mysql.jdbc.Driver driverClassName: com.mysql.jdbc.Driver
@ -76,6 +77,8 @@ spring:
max-active: 100 max-active: 100
# 连接池最大阻塞等待时间(使用负值表示没有限制) # 连接池最大阻塞等待时间(使用负值表示没有限制)
max-wait: -1 max-wait: -1
profiles:
include: crawl
mybatis: mybatis:
configuration: configuration:
#自动将数据库带下划线的表字段值映射到Java类的驼峰字段上 #自动将数据库带下划线的表字段值映射到Java类的驼峰字段上

View File

@ -2,9 +2,9 @@
threadCount=1 threadCount=1
#爬取优先级 1评分优先 2更新时间优先 #爬取优先级 1评分优先 2更新时间优先
priority=1 priority=1
#小说最低评分0表示不限制 #小说最低评分
lowestScore=0 lowestScore=0
#小说最小更新时间 #小说最小更新时间
minUptTime=2000-01-01 00:00:00 minUptTime=2000-01-01
#爬取最大条数,0表示不限制 #爬取最大条数
maxNumber=0 maxNumber=100000

View File

@ -163,7 +163,7 @@ function edit(){
console.log('打开配置页面'); console.log('打开配置页面');
layer.open({ layer.open({
type : 2, type : 2,
title : '增加', title : '爬虫配置修改',
maxmin : true, maxmin : true,
shadeClose : false, shadeClose : false,
area : [ '800px', '520px' ], area : [ '800px', '520px' ],

View File

@ -9,41 +9,57 @@
<div class="ibox float-e-margins"> <div class="ibox float-e-margins">
<div class="ibox-content"> <div class="ibox-content">
<form class="form-horizontal m-t" id="signupForm"> <form class="form-horizontal m-t" id="signupForm">
<input id="id" name="id" th:value="${bookCrawl.id}" <div class="form-group">
type="hidden"> <label class="col-sm-3 control-label">爬虫线程数: </label>
<div class="form-group"> <div class="col-sm-8">
<label class="col-sm-3 control-label"></label> <label class="radio-inline" > <input th:field="*{property.threadCount}"
<div class="col-sm-8"> type="radio" name="threadCount" th:value="1" />单线程
<input id="crawlWebName" name="crawlWebName" class="form-control" </label>
th:value="${bookCrawl.crawlWebName}" <label class="radio-inline" > <input th:field="*{property.threadCount}"
type="text"> type="radio" name="threadCount" th:value="3" />3线程
</div> </label>
<label class="radio-inline" > <input th:field="*{property.threadCount}"
type="radio" name="threadCount" th:value="7" />7线程
</label>
</div>
</div>
<div class="form-group">
<label class="col-sm-3 control-label">爬取优先级:</label>
<div class="col-sm-8">
<div class="col-sm-8">
<label class="radio-inline" > <input th:field="*{property.priority}"
type="radio" name="priority" th:value="1" />评分优先
</label>
<label class="radio-inline" > <input th:field="*{property.priority}"
type="radio" name="priority" th:value="2" />更新时间优先
</label>
</div> </div>
<div class="form-group"> </div>
<label class="col-sm-3 control-label"></label> </div>
<div class="col-sm-8"> <div class="form-group">
<input id="crawlWebUrl" name="crawlWebUrl" class="form-control" <label class="col-sm-3 control-label">小说最低评分:</label>
th:value="${bookCrawl.crawlWebUrl}" <div class="col-sm-8">
type="text"> <input id="lowestScore" name="lowestScore" class="form-control"
</div> th:value="${property.lowestScore}"
</div> type="number" max="10" min="0">
<div class="form-group"> </div>
<label class="col-sm-3 control-label"></label> </div>
<div class="col-sm-8"> <div class="form-group">
<input id="crawlWebCode" name="crawlWebCode" class="form-control" <label class="col-sm-3 control-label">小说最小更新时间:</label>
th:value="${bookCrawl.crawlWebCode}" <div class="col-sm-8">
<!-- property.minUptTime is already a 'yyyy-MM-dd' string, so it is rendered as-is;
     #dates.format expects a java.util.Date and must not be applied to it. -->
<input type="text" class="laydate-icon layer-date form-control" id="minUptTime" name="minUptTime" th:value="${property.minUptTime}" placeholder="请选择小说最小更新时间"
       onclick="laydate({istime: true, format: 'YYYY-MM-DD'})" style="background-color: #fff;" readonly="readonly"/>
</div> </div>
<div class="form-group"> </div>
<label class="col-sm-3 control-label"></label> <!-- <div class="form-group">
<div class="col-sm-8"> <label class="col-sm-3 control-label">爬取最大条数:</label>
<input id="status" name="status" class="form-control" <div class="col-sm-8">
th:value="${bookCrawl.status}" <input id="maxNumber" name="maxNumber" class="form-control"
type="text"> th:value="${property.maxNumber}"
</div> type="number" max="100000" min="1">
</div> </div>
<div class="form-group"> </div>-->
<div class="form-group">
<div class="col-sm-8 col-sm-offset-3"> <div class="col-sm-8 col-sm-offset-3">
<button type="submit" class="btn btn-primary">提交</button> <button type="submit" class="btn btn-primary">提交</button>
</div> </div>