mirror of
https://github.com/201206030/novel.git
synced 2025-04-27 07:30:50 +00:00
增加爬虫智能化配置
This commit is contained in:
parent
95149646f6
commit
bebba0da31
@ -0,0 +1,19 @@
|
||||
package com.java2nb.books.config;
|
||||
|
||||
import lombok.Data;
|
||||
import org.springframework.boot.context.properties.ConfigurationProperties;
|
||||
import org.springframework.stereotype.Component;
|
||||
|
||||
import java.util.Date;
|
||||
|
||||
// Crawler settings bound from the configuration properties under the "crawl" prefix.
// Registered as a Spring component so other beans can have it injected; Lombok's @Data
// supplies the getters and setters the rest of the code calls.
@Data
|
||||
@Component
|
||||
@ConfigurationProperties(prefix="crawl")
|
||||
public class CrawlConfig {
|
||||
|
||||
// Number of crawler threads.
private Integer threadCount;
|
||||
// Crawl priority: 1 = score first, 2 = update time first.
private Integer priority;
|
||||
// Minimum novel score; the crawl service skips books that score below it.
private Float lowestScore;
|
||||
// Minimum novel update time, kept as text; the crawl service parses it with the pattern "yyyy-MM-dd".
private String minUptTime;
|
||||
// Maximum number of items to crawl.
private Integer maxNumber;
|
||||
}
|
@ -1,8 +1,13 @@
|
||||
package com.java2nb.books.controller;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.*;
|
||||
|
||||
import com.java2nb.books.config.CrawlConfig;
|
||||
import com.java2nb.common.utils.GenUtils;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.apache.commons.configuration.Configuration;
|
||||
import org.apache.commons.configuration.ConfigurationException;
|
||||
import org.apache.commons.configuration.PropertiesConfiguration;
|
||||
import org.apache.shiro.authz.annotation.RequiresPermissions;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.stereotype.Controller;
|
||||
@ -30,12 +35,16 @@ import com.java2nb.common.utils.R;
|
||||
* @date 2019-11-15 03:42:54
|
||||
*/
|
||||
|
||||
@Slf4j
|
||||
@Controller
|
||||
@RequestMapping("/books/bookCrawl")
|
||||
public class BookCrawlController {
|
||||
@Autowired
|
||||
private BookCrawlService bookCrawlService;
|
||||
|
||||
@Autowired
|
||||
private CrawlConfig crawlConfig;
|
||||
|
||||
@GetMapping()
|
||||
@RequiresPermissions("books:bookCrawl:bookCrawl")
|
||||
String BookCrawl() {
|
||||
@ -63,11 +72,9 @@ public class BookCrawlController {
|
||||
}
|
||||
|
||||
@ApiOperation(value = "修改页面", notes = "修改页面")
|
||||
@GetMapping("/edit/{id}")
|
||||
@RequiresPermissions("books:bookCrawl:edit")
|
||||
String edit(@PathVariable("id") Long id, Model model) {
|
||||
BookCrawlDO bookCrawl = bookCrawlService.get(id);
|
||||
model.addAttribute("bookCrawl", bookCrawl);
|
||||
// Serves the crawler configuration page for GET /edit (relative to this controller's mapping).
// Exposes the injected CrawlConfig to the template under the model key "property" and
// returns the view name of the edit page.
@GetMapping("/edit")
|
||||
String edit( Model model) throws Exception {
|
||||
// The template reads the current settings through the "property" attribute.
model.addAttribute("property", crawlConfig);
|
||||
return "books/bookCrawl/edit";
|
||||
}
|
||||
|
||||
@ -100,9 +107,8 @@ public class BookCrawlController {
|
||||
@ApiOperation(value = "修改", notes = "修改")
|
||||
@ResponseBody
|
||||
@RequestMapping("/update")
|
||||
@RequiresPermissions("books:bookCrawl:edit")
|
||||
public R update( BookCrawlDO bookCrawl) {
|
||||
bookCrawlService.update(bookCrawl);
|
||||
public R update(CrawlConfig config) {
|
||||
crawlConfig = config;
|
||||
return R.ok();
|
||||
}
|
||||
|
||||
|
@ -1,5 +1,6 @@
|
||||
package com.java2nb.books.service.impl;
|
||||
|
||||
import com.java2nb.books.config.CrawlConfig;
|
||||
import com.java2nb.books.dao.BookContentDao;
|
||||
import com.java2nb.books.dao.BookDao;
|
||||
import com.java2nb.books.dao.BookIndexDao;
|
||||
@ -7,6 +8,7 @@ import com.java2nb.books.domain.BookContentDO;
|
||||
import com.java2nb.books.domain.BookDO;
|
||||
import com.java2nb.books.domain.BookIndexDO;
|
||||
import com.java2nb.books.util.RestTemplateUtil;
|
||||
import com.java2nb.common.utils.DateUtils;
|
||||
import org.springframework.beans.factory.annotation.Autowired;
|
||||
import org.springframework.http.HttpStatus;
|
||||
import org.springframework.http.ResponseEntity;
|
||||
@ -26,6 +28,9 @@ import org.springframework.web.client.RestTemplate;
|
||||
@Service
|
||||
public class BookCrawlServiceImpl implements BookCrawlService {
|
||||
|
||||
@Autowired
|
||||
private CrawlConfig crawlConfig;
|
||||
|
||||
private boolean isInteruptBiquDaoCrawl;//是否中断笔趣岛爬虫程序
|
||||
|
||||
private boolean isInteruptBiquTaCrawl;//是否中断笔趣塔爬虫程序
|
||||
@ -103,59 +108,66 @@ public class BookCrawlServiceImpl implements BookCrawlService {
|
||||
|
||||
|
||||
private void crawlBook(BookCrawlDO bookCrawl) {
|
||||
for (int i = 1; i <= 7; i++) {
|
||||
|
||||
int finalI = i;
|
||||
int threadCount = crawlConfig.getThreadCount();
|
||||
int step = 7 / threadCount;
|
||||
int pos = step;
|
||||
int i = 1;
|
||||
while (i <= 7) {
|
||||
final int fPos = pos;
|
||||
final int fI = i;
|
||||
i = pos + 1;
|
||||
new Thread(
|
||||
() -> {
|
||||
int j = fI;
|
||||
for (; j <= fPos; j++) {
|
||||
try {
|
||||
|
||||
try {
|
||||
|
||||
switch (bookCrawl.getCrawlWebCode()) {
|
||||
case 1: {
|
||||
while (true) {
|
||||
if (isInteruptBiquDaoCrawl) {
|
||||
return;
|
||||
switch (bookCrawl.getCrawlWebCode()) {
|
||||
case 1: {
|
||||
while (true) {
|
||||
if (isInteruptBiquDaoCrawl) {
|
||||
return;
|
||||
}
|
||||
crawBiqudaoBooks(j);
|
||||
Thread.sleep(1000 * 60 * 60 * 24);
|
||||
}
|
||||
crawBiqudaoBooks(finalI);
|
||||
Thread.sleep(1000 * 60 * 60 * 24);
|
||||
}
|
||||
}
|
||||
case 2: {
|
||||
while (true) {
|
||||
if (isInteruptBiquTaCrawl) {
|
||||
return;
|
||||
case 2: {
|
||||
while (true) {
|
||||
if (isInteruptBiquTaCrawl) {
|
||||
return;
|
||||
}
|
||||
crawBiquTaBooks(j);
|
||||
Thread.sleep(1000 * 60 * 60 * 24);
|
||||
}
|
||||
crawBiquTaBooks(finalI);
|
||||
Thread.sleep(1000 * 60 * 60 * 24);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
} catch (Exception e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
} catch (Exception e) {
|
||||
e.printStackTrace();
|
||||
bookCrawl.setStatus(0);
|
||||
bookCrawlDao.update(bookCrawl);
|
||||
}
|
||||
|
||||
}
|
||||
).start();
|
||||
pos += step;
|
||||
if (7 - pos < step) {
|
||||
pos = 7;
|
||||
}
|
||||
}
|
||||
|
||||
new Thread(() -> {
|
||||
for (int j = 21; j <= 29; j++) {
|
||||
|
||||
|
||||
for (int j = 21; j <= 29; j++) {
|
||||
int finalJ = j;
|
||||
new Thread(() -> {
|
||||
|
||||
for (int i = 1; i <= 499; i++) {
|
||||
if(isInteruptBiquTaCrawl || isInteruptBiquDaoCrawl){
|
||||
for (int k = 1; k <= 499; k++) {
|
||||
if (isInteruptBiquTaCrawl || isInteruptBiquDaoCrawl) {
|
||||
return;
|
||||
}
|
||||
System.out.println("==============分类============:" + finalJ);
|
||||
System.out.println("==============页码============:" + i);
|
||||
int catId = finalJ;
|
||||
int page = i;
|
||||
System.out.println("==============分类============:" + j);
|
||||
System.out.println("==============页码============:" + k);
|
||||
int catId = j;
|
||||
int page = k;
|
||||
|
||||
String bookListUrl = "http://book.sfacg.com/List/default.aspx?&tid=" + catId + "&if=1&PageIndex=" + page;
|
||||
|
||||
@ -168,7 +180,7 @@ public class BookCrawlServiceImpl implements BookCrawlService {
|
||||
|
||||
while (isFindBook) {
|
||||
try {
|
||||
if(isInteruptBiquTaCrawl || isInteruptBiquDaoCrawl){
|
||||
if (isInteruptBiquTaCrawl || isInteruptBiquDaoCrawl) {
|
||||
return;
|
||||
}
|
||||
long bookNum = Long.parseLong(bookMatcher.group(1));
|
||||
@ -320,7 +332,7 @@ public class BookCrawlServiceImpl implements BookCrawlService {
|
||||
List<Integer> hasIndexNum = queryIndexCountByBookNameAndBAuthor(bookName, author);
|
||||
|
||||
while (isFindIndex) {
|
||||
if(isInteruptBiquTaCrawl || isInteruptBiquDaoCrawl){
|
||||
if (isInteruptBiquTaCrawl || isInteruptBiquDaoCrawl) {
|
||||
return;
|
||||
}
|
||||
if (!hasIndexNum.contains(indexNum)) {
|
||||
@ -378,16 +390,18 @@ public class BookCrawlServiceImpl implements BookCrawlService {
|
||||
}
|
||||
}
|
||||
|
||||
}).start();
|
||||
|
||||
|
||||
}
|
||||
}
|
||||
}).start();
|
||||
|
||||
}
|
||||
|
||||
private void crawBiquTaBooks(int i) {
|
||||
String baseUrl = "https://m.biquta.com";
|
||||
String catBookListUrlBase = baseUrl + "/class/";
|
||||
if (crawlConfig.getPriority() == 1) {
|
||||
catBookListUrlBase = baseUrl + "/lhb/";
|
||||
}
|
||||
//拼接分类URL
|
||||
int page = 1;//起始页码
|
||||
int totalPage = page;
|
||||
@ -457,10 +471,9 @@ public class BookCrawlServiceImpl implements BookCrawlService {
|
||||
try {
|
||||
Float score = Float.parseFloat(scoreMatch.group(1));
|
||||
|
||||
/*if (score < lowestScore) {//数据库空间有限,暂时爬取8.0分以上的小说
|
||||
// Thread.sleep(1000 * 60 * 60 * 24);//因为爬的是龙虎榜,所以遇到第一个8分以下的,之后的都是8分以下的
|
||||
continue;
|
||||
}*/
|
||||
if (score < crawlConfig.getLowestScore()) {//数据库空间有限,暂时爬取8.0分以上的小说
|
||||
continue;
|
||||
}
|
||||
|
||||
String bookName = bookNameMatch.group(1);
|
||||
String author = authoreMatch.group(1);
|
||||
@ -487,6 +500,9 @@ public class BookCrawlServiceImpl implements BookCrawlService {
|
||||
String updateTimeStr = updateTimeMatch.group(1);
|
||||
SimpleDateFormat format = new SimpleDateFormat("yy-MM-dd HH:mm:ss");
|
||||
Date updateTime = format.parse(updateTimeStr);
|
||||
if (updateTime.getTime() < new SimpleDateFormat("yyyy-MM-dd").parse(crawlConfig.getMinUptTime()).getTime()) {
|
||||
continue;
|
||||
}
|
||||
Pattern picPatten = Pattern.compile("<img src=\"([^>]+)\"\\s+onerror=\"this.src=");
|
||||
Matcher picMather = picPatten.matcher(body);
|
||||
if (picMather.find()) {
|
||||
@ -605,7 +621,11 @@ public class BookCrawlServiceImpl implements BookCrawlService {
|
||||
|
||||
private void crawBiqudaoBooks(final int i) {
|
||||
String baseUrl = "https://m.biqudao.com";
|
||||
String catBookListUrlBase = baseUrl + "/bqgelhb/";
|
||||
String catBookListUrlBase = baseUrl + "/bqgeclass/";
|
||||
if (crawlConfig.getPriority() == 1) {
|
||||
|
||||
catBookListUrlBase = baseUrl + "/bqgelhb/";
|
||||
}
|
||||
//拼接分类URL
|
||||
int page = 1;//起始页码
|
||||
int totalPage = page;
|
||||
@ -680,10 +700,9 @@ public class BookCrawlServiceImpl implements BookCrawlService {
|
||||
|
||||
Float score = Float.parseFloat(scoreMatch.group(1));
|
||||
|
||||
/*if (score < lowestScore) {//数据库空间有限,暂时爬取8.0分以上的小说
|
||||
Thread.sleep(1000 * 60 * 60 * 24);//因为爬的是龙虎榜,所以遇到第一个8分以下的,之后的都是8分以下的
|
||||
continue;
|
||||
}*/
|
||||
if (score < crawlConfig.getLowestScore()) {//数据库空间有限,暂时爬取8.0分以上的小说
|
||||
continue;
|
||||
}
|
||||
|
||||
String bookName = bookNameMatch.group(1);
|
||||
String author = authoreMatch.group(1);
|
||||
@ -710,6 +729,9 @@ public class BookCrawlServiceImpl implements BookCrawlService {
|
||||
String updateTimeStr = updateTimeMatch.group(1);
|
||||
SimpleDateFormat format = new SimpleDateFormat("yy-MM-dd HH:mm:ss");
|
||||
Date updateTime = format.parse(updateTimeStr);
|
||||
if (updateTime.getTime() < new SimpleDateFormat("yyyy-MM-dd").parse(crawlConfig.getMinUptTime()).getTime()) {
|
||||
continue;
|
||||
}
|
||||
Pattern picPatten = Pattern.compile("<img src=\"([^>]+)\"\\s+onerror=\"this.src=");
|
||||
Matcher picMather = picPatten.matcher(body);
|
||||
if (picMather.find()) {
|
||||
|
11
novel-admin/src/main/resources/application-crawl.yml
Normal file
11
novel-admin/src/main/resources/application-crawl.yml
Normal file
@ -0,0 +1,11 @@
|
||||
crawl:
|
||||
#爬虫线程数
|
||||
threadCount: 1
|
||||
#爬取优先级 1:评分优先 2:更新时间优先
|
||||
priority: 1
|
||||
#小说最低评分
|
||||
lowestScore: 7.8
|
||||
#小说最小更新时间
|
||||
minUptTime: 2000-01-01
|
||||
#爬取最大条数
|
||||
maxNumber: 100000
|
@ -31,6 +31,7 @@ spring:
|
||||
type: redis
|
||||
|
||||
|
||||
|
||||
datasource:
|
||||
type: com.alibaba.druid.pool.DruidDataSource
|
||||
driverClassName: com.mysql.jdbc.Driver
|
||||
@ -76,6 +77,8 @@ spring:
|
||||
max-active: 100
|
||||
# 连接池最大阻塞等待时间(使用负值表示没有限制)
|
||||
max-wait: -1
|
||||
profiles:
|
||||
include: crawl
|
||||
mybatis:
|
||||
configuration:
|
||||
#自动将数据库带下划线的表字段值映射到Java类的驼峰字段上
|
||||
|
@ -2,9 +2,9 @@
|
||||
threadCount=1
|
||||
#爬取优先级 1:评分优先 2:更新时间优先
|
||||
priority=1
|
||||
#小说最低评分,0表示不限制
|
||||
#小说最低评分
|
||||
lowestScore=0
|
||||
#小说最小更新时间
|
||||
minUptTime=2000-01-01 00:00:00
|
||||
#爬取最大条数,0表示不限制
|
||||
maxNumber=0
|
||||
minUptTime=2000-01-01
|
||||
#爬取最大条数
|
||||
maxNumber=100000
|
||||
|
@ -163,7 +163,7 @@ function edit(){
|
||||
console.log('打开配置页面');
|
||||
layer.open({
|
||||
type : 2,
|
||||
title : '增加',
|
||||
title : '爬虫配置修改',
|
||||
maxmin : true,
|
||||
shadeClose : false,
|
||||
area : [ '800px', '520px' ],
|
||||
|
@ -9,41 +9,57 @@
|
||||
<div class="ibox float-e-margins">
|
||||
<div class="ibox-content">
|
||||
<form class="form-horizontal m-t" id="signupForm">
|
||||
<input id="id" name="id" th:value="${bookCrawl.id}"
|
||||
type="hidden">
|
||||
<div class="form-group">
|
||||
<label class="col-sm-3 control-label">:</label>
|
||||
<div class="col-sm-8">
|
||||
<input id="crawlWebName" name="crawlWebName" class="form-control"
|
||||
th:value="${bookCrawl.crawlWebName}"
|
||||
type="text">
|
||||
</div>
|
||||
<div class="form-group">
|
||||
<label class="col-sm-3 control-label">爬虫线程数: </label>
|
||||
<div class="col-sm-8">
|
||||
<label class="radio-inline" > <input th:field="*{property.threadCount}"
|
||||
type="radio" name="threadCount" th:value="1" />单线程
|
||||
</label>
|
||||
<label class="radio-inline" > <input th:field="*{property.threadCount}"
|
||||
type="radio" name="threadCount" th:value="3" />3线程
|
||||
</label>
|
||||
<label class="radio-inline" > <input th:field="*{property.threadCount}"
|
||||
type="radio" name="threadCount" th:value="7" />7线程
|
||||
</label>
|
||||
</div>
|
||||
</div>
|
||||
<div class="form-group">
|
||||
<label class="col-sm-3 control-label">爬取优先级:</label>
|
||||
<div class="col-sm-8">
|
||||
<div class="col-sm-8">
|
||||
<label class="radio-inline" > <input th:field="*{property.priority}"
|
||||
type="radio" name="priority" th:value="1" />评分优先
|
||||
</label>
|
||||
<label class="radio-inline" > <input th:field="*{property.priority}"
|
||||
type="radio" name="priority" th:value="2" />更新时间优先
|
||||
</label>
|
||||
</div>
|
||||
<div class="form-group">
|
||||
<label class="col-sm-3 control-label">:</label>
|
||||
<div class="col-sm-8">
|
||||
<input id="crawlWebUrl" name="crawlWebUrl" class="form-control"
|
||||
th:value="${bookCrawl.crawlWebUrl}"
|
||||
type="text">
|
||||
</div>
|
||||
</div>
|
||||
<div class="form-group">
|
||||
<label class="col-sm-3 control-label">:</label>
|
||||
<div class="col-sm-8">
|
||||
<input id="crawlWebCode" name="crawlWebCode" class="form-control"
|
||||
th:value="${bookCrawl.crawlWebCode}"
|
||||
type="text">
|
||||
</div>
|
||||
</div>
|
||||
<div class="form-group">
|
||||
<label class="col-sm-3 control-label">:</label>
|
||||
<div class="col-sm-8">
|
||||
<input id="status" name="status" class="form-control"
|
||||
th:value="${bookCrawl.status}"
|
||||
type="text">
|
||||
</div>
|
||||
</div>
|
||||
<div class="form-group">
|
||||
</div>
|
||||
</div>
|
||||
<div class="form-group">
|
||||
<label class="col-sm-3 control-label">小说最低评分:</label>
|
||||
<div class="col-sm-8">
|
||||
<input id="lowestScore" name="lowestScore" class="form-control"
|
||||
th:value="${property.lowestScore}"
|
||||
type="number" max="10" min="0">
|
||||
</div>
|
||||
</div>
|
||||
<div class="form-group">
|
||||
<label class="col-sm-3 control-label">小说最小更新时间:</label>
|
||||
<div class="col-sm-8">
|
||||
<input type="text" class="laydate-icon layer-date form-control" id="minUptTime" name="minUptTime" th:value="${property.minUptTime}==null?null:${#dates.format(property.minUptTime,'yyyy-MM-dd')}" placeholder="请选择小说最小更新时间"
|
||||
onclick="laydate({istime: true, format: 'YYYY-MM-DD'})" style="background-color: #fff;" readonly="readonly"/>
|
||||
</div>
|
||||
</div>
|
||||
<!-- <div class="form-group">
|
||||
<label class="col-sm-3 control-label">爬取最大条数:</label>
|
||||
<div class="col-sm-8">
|
||||
<input id="maxNumber" name="maxNumber" class="form-control"
|
||||
th:value="${property.maxNumber}"
|
||||
type="number" max="100000" min="1">
|
||||
</div>
|
||||
</div>-->
|
||||
<div class="form-group">
|
||||
<div class="col-sm-8 col-sm-offset-3">
|
||||
<button type="submit" class="btn btn-primary">提交</button>
|
||||
</div>
|
||||
|
Loading…
x
Reference in New Issue
Block a user