增加爬虫智能化配置

This commit is contained in:
xiongxiaoyang 2019-11-15 21:32:12 +08:00
parent 95149646f6
commit bebba0da31
8 changed files with 174 additions and 97 deletions

View File

@ -0,0 +1,19 @@
package com.java2nb.books.config;
import lombok.Data;
import org.springframework.boot.context.properties.ConfigurationProperties;
import org.springframework.stereotype.Component;
import java.util.Date;
/**
 * Crawler tuning properties bound from the {@code crawl.*} keys of the
 * application configuration (see application-crawl.yml).
 */
@Data
@Component
@ConfigurationProperties(prefix="crawl")
public class CrawlConfig {
// Number of crawler worker threads used when splitting category crawls.
private Integer threadCount;
// Crawl priority: 1 = score first (ranking list URLs), 2 = update time first.
private Integer priority;
// Skip novels whose score is below this threshold.
private Float lowestScore;
// Skip novels last updated before this date; parsed with pattern "yyyy-MM-dd".
private String minUptTime;
// Maximum number of entries to crawl — NOTE(review): not referenced in the
// visible crawler code; confirm it is actually enforced somewhere.
private Integer maxNumber;
}

View File

@ -1,8 +1,13 @@
package com.java2nb.books.controller;
import java.util.List;
import java.util.Map;
import java.util.*;
import com.java2nb.books.config.CrawlConfig;
import com.java2nb.common.utils.GenUtils;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.configuration.Configuration;
import org.apache.commons.configuration.ConfigurationException;
import org.apache.commons.configuration.PropertiesConfiguration;
import org.apache.shiro.authz.annotation.RequiresPermissions;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Controller;
@ -30,12 +35,16 @@ import com.java2nb.common.utils.R;
* @date 2019-11-15 03:42:54
*/
@Slf4j
@Controller
@RequestMapping("/books/bookCrawl")
public class BookCrawlController {
@Autowired
private BookCrawlService bookCrawlService;
@Autowired
private CrawlConfig crawlConfig;
@GetMapping()
@RequiresPermissions("books:bookCrawl:bookCrawl")
String BookCrawl() {
@ -63,11 +72,9 @@ public class BookCrawlController {
}
@ApiOperation(value = "修改页面", notes = "修改页面")
@GetMapping("/edit/{id}")
@RequiresPermissions("books:bookCrawl:edit")
String edit(@PathVariable("id") Long id, Model model) {
BookCrawlDO bookCrawl = bookCrawlService.get(id);
model.addAttribute("bookCrawl", bookCrawl);
// Renders the crawler-configuration edit page. The current CrawlConfig bean
// is exposed to the template under the model attribute "property".
// NOTE(review): the previous version of this endpoint was guarded by
// @RequiresPermissions("books:bookCrawl:edit"); confirm that dropping the
// permission check here is intentional.
@GetMapping("/edit")
String edit( Model model) throws Exception {
model.addAttribute("property", crawlConfig);
return "books/bookCrawl/edit";
}
@ -100,9 +107,8 @@ public class BookCrawlController {
@ApiOperation(value = "修改", notes = "修改")
@ResponseBody
@RequestMapping("/update")
@RequiresPermissions("books:bookCrawl:edit")
public R update( BookCrawlDO bookCrawl) {
bookCrawlService.update(bookCrawl);
/**
 * Applies the submitted crawler configuration at runtime.
 *
 * Copies the submitted values onto the managed singleton instead of
 * rebinding the local field: other beans (e.g. BookCrawlServiceImpl) were
 * injected with a reference to the original @ConfigurationProperties bean,
 * so replacing this controller's reference would leave them reading the
 * stale configuration forever.
 *
 * NOTE(review): the previous version was guarded by
 * @RequiresPermissions("books:bookCrawl:edit"); confirm the permission
 * check was dropped intentionally.
 *
 * @param config form-bound configuration values submitted from the edit page
 * @return R.ok() on success
 */
public R update(CrawlConfig config) {
crawlConfig.setThreadCount(config.getThreadCount());
crawlConfig.setPriority(config.getPriority());
crawlConfig.setLowestScore(config.getLowestScore());
crawlConfig.setMinUptTime(config.getMinUptTime());
crawlConfig.setMaxNumber(config.getMaxNumber());
return R.ok();
}

View File

@ -1,5 +1,6 @@
package com.java2nb.books.service.impl;
import com.java2nb.books.config.CrawlConfig;
import com.java2nb.books.dao.BookContentDao;
import com.java2nb.books.dao.BookDao;
import com.java2nb.books.dao.BookIndexDao;
@ -7,6 +8,7 @@ import com.java2nb.books.domain.BookContentDO;
import com.java2nb.books.domain.BookDO;
import com.java2nb.books.domain.BookIndexDO;
import com.java2nb.books.util.RestTemplateUtil;
import com.java2nb.common.utils.DateUtils;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.http.HttpStatus;
import org.springframework.http.ResponseEntity;
@ -26,6 +28,9 @@ import org.springframework.web.client.RestTemplate;
@Service
public class BookCrawlServiceImpl implements BookCrawlService {
@Autowired
private CrawlConfig crawlConfig;
private boolean isInteruptBiquDaoCrawl;//是否中断笔趣岛爬虫程序
private boolean isInteruptBiquTaCrawl;//是否中断笔趣塔爬虫程序
@ -103,12 +108,18 @@ public class BookCrawlServiceImpl implements BookCrawlService {
private void crawlBook(BookCrawlDO bookCrawl) {
for (int i = 1; i <= 7; i++) {
int finalI = i;
int threadCount = crawlConfig.getThreadCount();
int step = 7 / threadCount;
int pos = step;
int i = 1;
while (i <= 7) {
final int fPos = pos;
final int fI = i;
i = pos + 1;
new Thread(
() -> {
int j = fI;
for (; j <= fPos; j++) {
try {
switch (bookCrawl.getCrawlWebCode()) {
@ -117,7 +128,7 @@ public class BookCrawlServiceImpl implements BookCrawlService {
if (isInteruptBiquDaoCrawl) {
return;
}
crawBiqudaoBooks(finalI);
crawBiqudaoBooks(j);
Thread.sleep(1000 * 60 * 60 * 24);
}
}
@ -126,7 +137,7 @@ public class BookCrawlServiceImpl implements BookCrawlService {
if (isInteruptBiquTaCrawl) {
return;
}
crawBiquTaBooks(finalI);
crawBiquTaBooks(j);
Thread.sleep(1000 * 60 * 60 * 24);
}
}
@ -134,28 +145,29 @@ public class BookCrawlServiceImpl implements BookCrawlService {
}
} catch (Exception e) {
e.printStackTrace();
bookCrawl.setStatus(0);
bookCrawlDao.update(bookCrawl);
}
}
}
).start();
pos += step;
if (7 - pos < step) {
pos = 7;
}
}
for (int j = 21; j <= 29; j++) {
int finalJ = j;
new Thread(() -> {
for (int j = 21; j <= 29; j++) {
for (int i = 1; i <= 499; i++) {
if(isInteruptBiquTaCrawl || isInteruptBiquDaoCrawl){
for (int k = 1; k <= 499; k++) {
if (isInteruptBiquTaCrawl || isInteruptBiquDaoCrawl) {
return;
}
System.out.println("==============分类============" + finalJ);
System.out.println("==============页码============" + i);
int catId = finalJ;
int page = i;
System.out.println("==============分类============" + j);
System.out.println("==============页码============" + k);
int catId = j;
int page = k;
String bookListUrl = "http://book.sfacg.com/List/default.aspx?&tid=" + catId + "&if=1&PageIndex=" + page;
@ -168,7 +180,7 @@ public class BookCrawlServiceImpl implements BookCrawlService {
while (isFindBook) {
try {
if(isInteruptBiquTaCrawl || isInteruptBiquDaoCrawl){
if (isInteruptBiquTaCrawl || isInteruptBiquDaoCrawl) {
return;
}
long bookNum = Long.parseLong(bookMatcher.group(1));
@ -320,7 +332,7 @@ public class BookCrawlServiceImpl implements BookCrawlService {
List<Integer> hasIndexNum = queryIndexCountByBookNameAndBAuthor(bookName, author);
while (isFindIndex) {
if(isInteruptBiquTaCrawl || isInteruptBiquDaoCrawl){
if (isInteruptBiquTaCrawl || isInteruptBiquDaoCrawl) {
return;
}
if (!hasIndexNum.contains(indexNum)) {
@ -378,16 +390,18 @@ public class BookCrawlServiceImpl implements BookCrawlService {
}
}
}).start();
}
}).start();
}
private void crawBiquTaBooks(int i) {
String baseUrl = "https://m.biquta.com";
String catBookListUrlBase = baseUrl + "/class/";
if (crawlConfig.getPriority() == 1) {
catBookListUrlBase = baseUrl + "/lhb/";
}
//拼接分类URL
int page = 1;//起始页码
int totalPage = page;
@ -457,10 +471,9 @@ public class BookCrawlServiceImpl implements BookCrawlService {
try {
Float score = Float.parseFloat(scoreMatch.group(1));
/*if (score < lowestScore) {//数据库空间有限暂时爬取8.0分以上的小说
// Thread.sleep(1000 * 60 * 60 * 24);//因为爬的是龙虎榜所以遇到第一个8分以下的之后的都是8分以下的
if (score < crawlConfig.getLowestScore()) {//数据库空间有限暂时爬取8.0分以上的小说
continue;
}*/
}
String bookName = bookNameMatch.group(1);
String author = authoreMatch.group(1);
@ -487,6 +500,9 @@ public class BookCrawlServiceImpl implements BookCrawlService {
String updateTimeStr = updateTimeMatch.group(1);
SimpleDateFormat format = new SimpleDateFormat("yy-MM-dd HH:mm:ss");
Date updateTime = format.parse(updateTimeStr);
if (updateTime.getTime() < new SimpleDateFormat("yyyy-MM-dd").parse(crawlConfig.getMinUptTime()).getTime()) {
continue;
}
Pattern picPatten = Pattern.compile("<img src=\"([^>]+)\"\\s+onerror=\"this.src=");
Matcher picMather = picPatten.matcher(body);
if (picMather.find()) {
@ -605,7 +621,11 @@ public class BookCrawlServiceImpl implements BookCrawlService {
private void crawBiqudaoBooks(final int i) {
String baseUrl = "https://m.biqudao.com";
String catBookListUrlBase = baseUrl + "/bqgelhb/";
String catBookListUrlBase = baseUrl + "/bqgeclass/";
if (crawlConfig.getPriority() == 1) {
catBookListUrlBase = baseUrl + "/bqgelhb/";
}
//拼接分类URL
int page = 1;//起始页码
int totalPage = page;
@ -680,10 +700,9 @@ public class BookCrawlServiceImpl implements BookCrawlService {
Float score = Float.parseFloat(scoreMatch.group(1));
/*if (score < lowestScore) {//数据库空间有限暂时爬取8.0分以上的小说
Thread.sleep(1000 * 60 * 60 * 24);//因为爬的是龙虎榜所以遇到第一个8分以下的之后的都是8分以下的
if (score < crawlConfig.getLowestScore()) {//数据库空间有限暂时爬取8.0分以上的小说
continue;
}*/
}
String bookName = bookNameMatch.group(1);
String author = authoreMatch.group(1);
@ -710,6 +729,9 @@ public class BookCrawlServiceImpl implements BookCrawlService {
String updateTimeStr = updateTimeMatch.group(1);
SimpleDateFormat format = new SimpleDateFormat("yy-MM-dd HH:mm:ss");
Date updateTime = format.parse(updateTimeStr);
if (updateTime.getTime() < new SimpleDateFormat("yyyy-MM-dd").parse(crawlConfig.getMinUptTime()).getTime()) {
continue;
}
Pattern picPatten = Pattern.compile("<img src=\"([^>]+)\"\\s+onerror=\"this.src=");
Matcher picMather = picPatten.matcher(body);
if (picMather.find()) {

View File

@ -0,0 +1,11 @@
crawl:
#Number of crawler threads
threadCount: 1
#Crawl priority: 1 = score first, 2 = update time first
priority: 1
#Minimum novel score
lowestScore: 7.8
#Earliest novel update time (yyyy-MM-dd)
minUptTime: 2000-01-01
#Maximum number of entries to crawl
maxNumber: 100000

View File

@ -31,6 +31,7 @@ spring:
type: redis
datasource:
type: com.alibaba.druid.pool.DruidDataSource
driverClassName: com.mysql.jdbc.Driver
@ -76,6 +77,8 @@ spring:
max-active: 100
# 连接池最大阻塞等待时间(使用负值表示没有限制)
max-wait: -1
profiles:
include: crawl
mybatis:
configuration:
#自动将数据库带下划线的表字段值映射到Java类的驼峰字段上

View File

@ -2,9 +2,9 @@
threadCount=1
#爬取优先级 1评分优先 2更新时间优先
priority=1
#小说最低评分0表示不限制
#小说最低评分
lowestScore=0
#小说最小更新时间
minUptTime=2000-01-01 00:00:00
#爬取最大条数,0表示不限制
maxNumber=0
minUptTime=2000-01-01
#爬取最大条数
maxNumber=100000

View File

@ -163,7 +163,7 @@ function edit(){
console.log('打开配置页面');
layer.open({
type : 2,
title : '增加',
title : '爬虫配置修改',
maxmin : true,
shadeClose : false,
area : [ '800px', '520px' ],

View File

@ -9,40 +9,56 @@
<div class="ibox float-e-margins">
<div class="ibox-content">
<form class="form-horizontal m-t" id="signupForm">
<input id="id" name="id" th:value="${bookCrawl.id}"
type="hidden">
<div class="form-group">
<label class="col-sm-3 control-label"></label>
<label class="col-sm-3 control-label">爬虫线程数: </label>
<div class="col-sm-8">
<input id="crawlWebName" name="crawlWebName" class="form-control"
th:value="${bookCrawl.crawlWebName}"
type="text">
<label class="radio-inline" > <input th:field="*{property.threadCount}"
type="radio" name="threadCount" th:value="1" />单线程
</label>
<label class="radio-inline" > <input th:field="*{property.threadCount}"
type="radio" name="threadCount" th:value="3" />3线程
</label>
<label class="radio-inline" > <input th:field="*{property.threadCount}"
type="radio" name="threadCount" th:value="7" />7线程
</label>
</div>
</div>
<div class="form-group">
<label class="col-sm-3 control-label"></label>
<label class="col-sm-3 control-label">爬取优先级</label>
<div class="col-sm-8">
<input id="crawlWebUrl" name="crawlWebUrl" class="form-control"
th:value="${bookCrawl.crawlWebUrl}"
type="text">
<div class="col-sm-8">
<label class="radio-inline" > <input th:field="*{property.priority}"
type="radio" name="priority" th:value="1" />评分优先
</label>
<label class="radio-inline" > <input th:field="*{property.priority}"
type="radio" name="priority" th:value="2" />更新时间优先
</label>
</div>
</div>
</div>
<div class="form-group">
<label class="col-sm-3 control-label"></label>
<label class="col-sm-3 control-label">小说最低评分</label>
<div class="col-sm-8">
<input id="crawlWebCode" name="crawlWebCode" class="form-control"
th:value="${bookCrawl.crawlWebCode}"
type="text">
<input id="lowestScore" name="lowestScore" class="form-control"
th:value="${property.lowestScore}"
type="number" max="10" min="0">
</div>
</div>
<div class="form-group">
<label class="col-sm-3 control-label"></label>
<label class="col-sm-3 control-label">小说最小更新时间</label>
<div class="col-sm-8">
<input id="status" name="status" class="form-control"
th:value="${bookCrawl.status}"
type="text">
<input type="text" class="laydate-icon layer-date form-control" id="minUptTime" name="minUptTime" th:value="${property.minUptTime}==null?null:${#dates.format(property.minUptTime,'yyyy-MM-dd')}" placeholder="请选择小说最小更新时间"
onclick="laydate({istime: true, format: 'YYYY-MM-DD'})" style="background-color: #fff;" readonly="readonly"/>
</div>
</div>
<!-- <div class="form-group">
<label class="col-sm-3 control-label">爬取最大条数:</label>
<div class="col-sm-8">
<input id="maxNumber" name="maxNumber" class="form-control"
th:value="${property.maxNumber}"
type="number" max="100000" min="1">
</div>
</div>-->
<div class="form-group">
<div class="col-sm-8 col-sm-offset-3">
<button type="submit" class="btn btn-primary">提交</button>