mirror of
https://github.com/201206030/novel-plus.git
synced 2025-07-01 23:26:38 +00:00
Compare commits
17 Commits
Author | SHA1 | Date | |
---|---|---|---|
efb136e3be | |||
7955db0e3c | |||
60dc28c5ed | |||
1534220f0c | |||
0830f6ffeb | |||
adc83db64e | |||
9c11f22816 | |||
24abe7714f | |||
a9fc80eba1 | |||
32541a7cb6 | |||
42bcecc304 | |||
a07643bde0 | |||
1f53b56bd6 | |||
2c86cb9a7d | |||
a4d6272a4f | |||
55d5deea74 | |||
4f474b91a8 |
@ -1,5 +1,4 @@
|
|||||||
<p align="center">
|
<p align="center">
|
||||||
<a href="https://www.swiftproxy.net/?code=T2WV1VT50"><img src="https://xxyopen.com/images/ad1.png" alt="AD" ></a>
|
|
||||||
<a href="https://cloud.tencent.com/act/cps/redirect?redirect=2446&cps_key=736e609d66e0ac4e57813316cec6fd0b&from=console"><img src="https://youdoc.github.io/img/tencent.jpg" alt="AD" ></a>
|
<a href="https://cloud.tencent.com/act/cps/redirect?redirect=2446&cps_key=736e609d66e0ac4e57813316cec6fd0b&from=console"><img src="https://youdoc.github.io/img/tencent.jpg" alt="AD" ></a>
|
||||||
</p>
|
</p>
|
||||||
<p align="center">
|
<p align="center">
|
||||||
@ -10,7 +9,7 @@
|
|||||||
</p>
|
</p>
|
||||||
|
|
||||||
<p align="center">
|
<p align="center">
|
||||||
👉 <a href='https://novel.xxyopen.com'>官网</a> | 👉 <a href='https://www.bilibili.com/video/BV1Zo4y187Mi'>项目演示</a> | 👉 <a href='https://docs.xxyopen.com/course/novelplus/1.html'>安装教程</a>
|
👉 <a href='https://novel.xxyopen.com'>官网</a> | 👉 <a href='http://117.72.165.13:8888'>演示站点</a> | 👉 <a href='https://docs.xxyopen.com/course/novelplus/1.html'>安装教程</a>
|
||||||
</p>
|
</p>
|
||||||
|
|
||||||
## 项目介绍
|
## 项目介绍
|
||||||
@ -22,8 +21,8 @@ TXT 文本存储)、阅读主题切换、多爬虫源自动采集和更新数
|
|||||||
## 项目地址
|
## 项目地址
|
||||||
|
|
||||||
- 学习版:[GitHub](https://github.com/201206030/novel) | [码云](https://gitee.com/novel_dev_team/novel)
|
- 学习版:[GitHub](https://github.com/201206030/novel) | [码云](https://gitee.com/novel_dev_team/novel)
|
||||||
| [保姆级教程](https://docs.xxyopen.com)
|
| [保姆级教程](https://docs.xxyopen.com)
|
||||||
- **应用版**:[GitHub](https://github.com/201206030/novel-plus) | [码云](https://gitee.com/novel_dev_team/novel-plus)
|
- **应用版**:[GitHub](https://github.com/201206030/novel-plus) | [码云](https://gitee.com/novel_dev_team/novel-plus) | [演示站点](http://117.72.165.13:8888)
|
||||||
- 微服务版:[GitHub](https://github.com/201206030/novel-cloud) | [码云](https://gitee.com/novel_dev_team/novel-cloud)
|
- 微服务版:[GitHub](https://github.com/201206030/novel-cloud) | [码云](https://gitee.com/novel_dev_team/novel-cloud)
|
||||||
|
|
||||||
## 项目结构
|
## 项目结构
|
||||||
|
@ -5,7 +5,7 @@
|
|||||||
|
|
||||||
<groupId>com.java2nb</groupId>
|
<groupId>com.java2nb</groupId>
|
||||||
<artifactId>novel-admin</artifactId>
|
<artifactId>novel-admin</artifactId>
|
||||||
<version>5.1.1</version>
|
<version>5.1.5</version>
|
||||||
<packaging>jar</packaging>
|
<packaging>jar</packaging>
|
||||||
|
|
||||||
<name>novel-admin</name>
|
<name>novel-admin</name>
|
||||||
|
@ -5,7 +5,7 @@
|
|||||||
<parent>
|
<parent>
|
||||||
<artifactId>novel</artifactId>
|
<artifactId>novel</artifactId>
|
||||||
<groupId>com.java2nb</groupId>
|
<groupId>com.java2nb</groupId>
|
||||||
<version>5.1.1</version>
|
<version>5.1.5</version>
|
||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
@ -41,11 +41,6 @@ public interface CacheKey {
|
|||||||
* */
|
* */
|
||||||
String TEMPLATE_DIR_KEY = "templateDirKey";;
|
String TEMPLATE_DIR_KEY = "templateDirKey";;
|
||||||
|
|
||||||
/**
|
|
||||||
* 正在运行的爬虫线程存储KEY前缀
|
|
||||||
* */
|
|
||||||
String RUNNING_CRAWL_THREAD_KEY_PREFIX = "runningCrawlTreadDataKeyPrefix";
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 上一次搜索引擎更新的时间
|
* 上一次搜索引擎更新的时间
|
||||||
* */
|
* */
|
||||||
|
@ -5,8 +5,8 @@ import org.springframework.http.*;
|
|||||||
import org.springframework.web.client.RestTemplate;
|
import org.springframework.web.client.RestTemplate;
|
||||||
|
|
||||||
import java.nio.charset.Charset;
|
import java.nio.charset.Charset;
|
||||||
import java.util.HashMap;
|
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
import java.util.concurrent.ConcurrentHashMap;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @author Administrator
|
* @author Administrator
|
||||||
@ -16,7 +16,7 @@ public class HttpUtil {
|
|||||||
|
|
||||||
private static final String DEFAULT_CHARSET = "utf-8";
|
private static final String DEFAULT_CHARSET = "utf-8";
|
||||||
|
|
||||||
private static final Map<String, RestTemplate> REST_TEMPLATE_MAP = new HashMap<>();
|
private static final Map<String, RestTemplate> REST_TEMPLATE_MAP = new ConcurrentHashMap<>();
|
||||||
|
|
||||||
public static String getByHttpClientWithChrome(String url, String charset) {
|
public static String getByHttpClientWithChrome(String url, String charset) {
|
||||||
log.debug("Get url:{}", url);
|
log.debug("Get url:{}", url);
|
||||||
|
@ -5,7 +5,7 @@
|
|||||||
<parent>
|
<parent>
|
||||||
<artifactId>novel</artifactId>
|
<artifactId>novel</artifactId>
|
||||||
<groupId>com.java2nb</groupId>
|
<groupId>com.java2nb</groupId>
|
||||||
<version>5.1.1</version>
|
<version>5.1.5</version>
|
||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
@ -7,6 +7,6 @@ import com.java2nb.novel.entity.Book;
|
|||||||
* */
|
* */
|
||||||
public interface CrawlBookHandler {
|
public interface CrawlBookHandler {
|
||||||
|
|
||||||
void handle(Book book);
|
void handle(Book book) throws InterruptedException;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -10,9 +10,11 @@ import com.java2nb.novel.utils.CrawlHttpClient;
|
|||||||
import io.github.xxyopen.util.IdWorker;
|
import io.github.xxyopen.util.IdWorker;
|
||||||
import lombok.RequiredArgsConstructor;
|
import lombok.RequiredArgsConstructor;
|
||||||
import lombok.SneakyThrows;
|
import lombok.SneakyThrows;
|
||||||
|
import lombok.extern.slf4j.Slf4j;
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
import org.springframework.stereotype.Component;
|
import org.springframework.stereotype.Component;
|
||||||
|
|
||||||
|
import java.text.ParseException;
|
||||||
import java.text.SimpleDateFormat;
|
import java.text.SimpleDateFormat;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Date;
|
import java.util.Date;
|
||||||
@ -26,6 +28,7 @@ import java.util.regex.Pattern;
|
|||||||
*
|
*
|
||||||
* @author Administrator
|
* @author Administrator
|
||||||
*/
|
*/
|
||||||
|
@Slf4j
|
||||||
@Component
|
@Component
|
||||||
@RequiredArgsConstructor
|
@RequiredArgsConstructor
|
||||||
public class CrawlParser {
|
public class CrawlParser {
|
||||||
@ -34,8 +37,8 @@ public class CrawlParser {
|
|||||||
|
|
||||||
private final CrawlHttpClient crawlHttpClient;
|
private final CrawlHttpClient crawlHttpClient;
|
||||||
|
|
||||||
@SneakyThrows
|
public void parseBook(RuleBean ruleBean, String bookId, CrawlBookHandler handler)
|
||||||
public void parseBook(RuleBean ruleBean, String bookId, CrawlBookHandler handler) {
|
throws InterruptedException {
|
||||||
Book book = new Book();
|
Book book = new Book();
|
||||||
String bookDetailUrl = ruleBean.getBookDetailUrl().replace("{bookId}", bookId);
|
String bookDetailUrl = ruleBean.getBookDetailUrl().replace("{bookId}", bookId);
|
||||||
String bookDetailHtml = crawlHttpClient.get(bookDetailUrl, ruleBean.getCharset());
|
String bookDetailHtml = crawlHttpClient.get(bookDetailUrl, ruleBean.getCharset());
|
||||||
@ -97,6 +100,22 @@ public class CrawlParser {
|
|||||||
.replaceAll("<p>\\s*</p>", "")
|
.replaceAll("<p>\\s*</p>", "")
|
||||||
.replaceAll("<p>", "")
|
.replaceAll("<p>", "")
|
||||||
.replaceAll("</p>", "<br/>");
|
.replaceAll("</p>", "<br/>");
|
||||||
|
// 小说简介过滤
|
||||||
|
String filterDesc = ruleBean.getFilterDesc();
|
||||||
|
if (StringUtils.isNotBlank(filterDesc)) {
|
||||||
|
String[] filterRules = filterDesc.replace("\r\n", "\n").split("\n");
|
||||||
|
for (String filterRule : filterRules) {
|
||||||
|
if (StringUtils.isNotBlank(filterRule)) {
|
||||||
|
desc = desc.replaceAll(filterRule, "");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// 去除小说简介前后空格
|
||||||
|
desc = desc.trim();
|
||||||
|
// 去除小说简介末尾冗余的小说名
|
||||||
|
if (desc.endsWith(bookName)) {
|
||||||
|
desc = desc.substring(0, desc.length() - bookName.length());
|
||||||
|
}
|
||||||
//设置书籍简介
|
//设置书籍简介
|
||||||
book.setBookDesc(desc);
|
book.setBookDesc(desc);
|
||||||
if (StringUtils.isNotBlank(ruleBean.getStatusPatten())) {
|
if (StringUtils.isNotBlank(ruleBean.getStatusPatten())) {
|
||||||
@ -120,8 +139,12 @@ public class CrawlParser {
|
|||||||
if (isFindUpdateTime) {
|
if (isFindUpdateTime) {
|
||||||
String updateTime = updateTimeMatch.group(1);
|
String updateTime = updateTimeMatch.group(1);
|
||||||
//设置更新时间
|
//设置更新时间
|
||||||
book.setLastIndexUpdateTime(
|
try {
|
||||||
new SimpleDateFormat(ruleBean.getUpadateTimeFormatPatten()).parse(updateTime));
|
book.setLastIndexUpdateTime(
|
||||||
|
new SimpleDateFormat(ruleBean.getUpadateTimeFormatPatten()).parse(updateTime));
|
||||||
|
} catch (ParseException e) {
|
||||||
|
log.error("解析最新章节更新时间出错", e);
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -144,7 +167,7 @@ public class CrawlParser {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public boolean parseBookIndexAndContent(String sourceBookId, Book book, RuleBean ruleBean,
|
public boolean parseBookIndexAndContent(String sourceBookId, Book book, RuleBean ruleBean,
|
||||||
Map<Integer, BookIndex> existBookIndexMap, CrawlBookChapterHandler handler) {
|
Map<Integer, BookIndex> existBookIndexMap, CrawlBookChapterHandler handler) throws InterruptedException {
|
||||||
|
|
||||||
Date currentDate = new Date();
|
Date currentDate = new Date();
|
||||||
|
|
||||||
@ -231,6 +254,8 @@ public class CrawlParser {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// 去除小说内容末尾的所有换行
|
||||||
|
content = removeTrailingBrTags(content);
|
||||||
//插入章节目录和章节内容
|
//插入章节目录和章节内容
|
||||||
BookIndex bookIndex = new BookIndex();
|
BookIndex bookIndex = new BookIndex();
|
||||||
bookIndex.setIndexName(indexName);
|
bookIndex.setIndexName(indexName);
|
||||||
@ -307,4 +332,12 @@ public class CrawlParser {
|
|||||||
return false;
|
return false;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* 删除字符串末尾的所有 <br> 类似标签(允许各种空格)
|
||||||
|
*/
|
||||||
|
public static String removeTrailingBrTags(String str) {
|
||||||
|
return str.replaceAll("(?i)(?:\\s*<\\s*br\\s*/?\\s*>)++(?:\\s|\\u3000)*$", "");
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -45,6 +45,7 @@ public class RuleBean {
|
|||||||
private String visitCountPatten;
|
private String visitCountPatten;
|
||||||
private String descStart;
|
private String descStart;
|
||||||
private String descEnd;
|
private String descEnd;
|
||||||
|
private String filterDesc;
|
||||||
private String upadateTimePatten;
|
private String upadateTimePatten;
|
||||||
private String upadateTimeFormatPatten;
|
private String upadateTimeFormatPatten;
|
||||||
private String bookIndexUrl;
|
private String bookIndexUrl;
|
||||||
|
@ -74,10 +74,8 @@ public class StarterListener implements ServletContextInitializer {
|
|||||||
needUpdateBook.getId());
|
needUpdateBook.getId());
|
||||||
//解析章节目录
|
//解析章节目录
|
||||||
crawlParser.parseBookIndexAndContent(needUpdateBook.getCrawlBookId(), book,
|
crawlParser.parseBookIndexAndContent(needUpdateBook.getCrawlBookId(), book,
|
||||||
ruleBean, existBookIndexMap, chapter -> {
|
ruleBean, existBookIndexMap, chapter -> bookService.updateBookAndIndexAndContent(book, chapter.getBookIndexList(),
|
||||||
bookService.updateBookAndIndexAndContent(book, chapter.getBookIndexList(),
|
chapter.getBookContentList(), existBookIndexMap));
|
||||||
chapter.getBookContentList(), existBookIndexMap);
|
|
||||||
});
|
|
||||||
});
|
});
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
log.error(e.getMessage(), e);
|
log.error(e.getMessage(), e);
|
||||||
|
@ -1,61 +0,0 @@
|
|||||||
package com.java2nb.novel.core.schedule;
|
|
||||||
|
|
||||||
|
|
||||||
import com.java2nb.novel.core.cache.CacheKey;
|
|
||||||
import com.java2nb.novel.core.cache.CacheService;
|
|
||||||
import com.java2nb.novel.entity.CrawlSource;
|
|
||||||
import com.java2nb.novel.service.CrawlService;
|
|
||||||
import io.github.xxyopen.util.ThreadUtil;
|
|
||||||
import lombok.RequiredArgsConstructor;
|
|
||||||
import lombok.extern.slf4j.Slf4j;
|
|
||||||
import org.springframework.scheduling.annotation.Scheduled;
|
|
||||||
import org.springframework.stereotype.Service;
|
|
||||||
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* 爬虫线程监控器,监控执行完成的爬虫源,并修改状态
|
|
||||||
*
|
|
||||||
* @author Administrator
|
|
||||||
*/
|
|
||||||
@Service
|
|
||||||
@RequiredArgsConstructor
|
|
||||||
@Slf4j
|
|
||||||
public class CrawlThreadMonitor {
|
|
||||||
|
|
||||||
private final CacheService cacheService;
|
|
||||||
|
|
||||||
private final CrawlService crawlService;
|
|
||||||
|
|
||||||
@Scheduled(fixedRate = 1000 * 60 * 5)
|
|
||||||
public void monitor() {
|
|
||||||
|
|
||||||
//查询需要监控的正在运行的爬虫源
|
|
||||||
List<CrawlSource> sources = crawlService.queryCrawlSourceByStatus((byte) 1);
|
|
||||||
|
|
||||||
for (CrawlSource source : sources) {
|
|
||||||
Set<Long> runningCrawlThreadIds = (Set<Long>) cacheService.getObject(CacheKey.RUNNING_CRAWL_THREAD_KEY_PREFIX + source.getId());
|
|
||||||
boolean sourceStop = true;
|
|
||||||
if (runningCrawlThreadIds != null) {
|
|
||||||
for (Long threadId : runningCrawlThreadIds) {
|
|
||||||
Thread thread = ThreadUtil.findThread(threadId);
|
|
||||||
|
|
||||||
if (thread != null && thread.isAlive()) {
|
|
||||||
//有活跃线程,说明该爬虫源正在运行,数据库中状态正确,不需要修改
|
|
||||||
sourceStop = false;
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (sourceStop) {
|
|
||||||
crawlService.updateCrawlSourceStatus(source.getId(), (byte) 0);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
}
|
|
@ -53,7 +53,7 @@ public interface CrawlService {
|
|||||||
* @param ruleBean 采集规则\
|
* @param ruleBean 采集规则\
|
||||||
* @return true:成功,false:失败
|
* @return true:成功,false:失败
|
||||||
* */
|
* */
|
||||||
boolean parseBookAndSave(int catId, RuleBean ruleBean, Integer sourceId, String bookId);
|
boolean parseBookAndSave(int catId, RuleBean ruleBean, Integer sourceId, String bookId) throws InterruptedException;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* 根据爬虫状态查询爬虫源集合
|
* 根据爬虫状态查询爬虫源集合
|
||||||
|
@ -34,6 +34,7 @@ import org.mybatis.dynamic.sql.render.RenderingStrategies;
|
|||||||
import org.mybatis.dynamic.sql.select.render.SelectStatementProvider;
|
import org.mybatis.dynamic.sql.select.render.SelectStatementProvider;
|
||||||
import org.springframework.stereotype.Service;
|
import org.springframework.stereotype.Service;
|
||||||
|
|
||||||
|
import java.time.Duration;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
import java.util.concurrent.atomic.AtomicBoolean;
|
import java.util.concurrent.atomic.AtomicBoolean;
|
||||||
import java.util.regex.Matcher;
|
import java.util.regex.Matcher;
|
||||||
@ -65,6 +66,10 @@ public class CrawlServiceImpl implements CrawlService {
|
|||||||
|
|
||||||
private final CrawlHttpClient crawlHttpClient;
|
private final CrawlHttpClient crawlHttpClient;
|
||||||
|
|
||||||
|
private final Map<Integer, Byte> crawlSourceStatusMap = new HashMap<>();
|
||||||
|
|
||||||
|
private final Map<Integer, Set<Long>> runningCrawlThread = new HashMap<>();
|
||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void addCrawlSource(CrawlSource source) {
|
public void addCrawlSource(CrawlSource source) {
|
||||||
@ -103,6 +108,8 @@ public class CrawlServiceImpl implements CrawlService {
|
|||||||
.build()
|
.build()
|
||||||
.render(RenderingStrategies.MYBATIS3);
|
.render(RenderingStrategies.MYBATIS3);
|
||||||
List<CrawlSource> crawlSources = crawlSourceMapper.selectMany(render);
|
List<CrawlSource> crawlSources = crawlSourceMapper.selectMany(render);
|
||||||
|
crawlSources.forEach(crawlSource -> crawlSource.setSourceStatus(
|
||||||
|
Optional.ofNullable(crawlSourceStatusMap.get(crawlSource.getId())).orElse((byte) 0)));
|
||||||
PageBean<CrawlSource> pageBean = PageBuilder.build(crawlSources);
|
PageBean<CrawlSource> pageBean = PageBuilder.build(crawlSources);
|
||||||
pageBean.setList(BeanUtil.copyList(crawlSources, CrawlSourceVO.class));
|
pageBean.setList(BeanUtil.copyList(crawlSources, CrawlSourceVO.class));
|
||||||
return pageBean;
|
return pageBean;
|
||||||
@ -112,14 +119,13 @@ public class CrawlServiceImpl implements CrawlService {
|
|||||||
@Override
|
@Override
|
||||||
public void openOrCloseCrawl(Integer sourceId, Byte sourceStatus) {
|
public void openOrCloseCrawl(Integer sourceId, Byte sourceStatus) {
|
||||||
|
|
||||||
//判断是开启还是关闭,如果是关闭,则修改数据库状态后获取该爬虫正在运行的线程集合并全部停止
|
// 判断是开启还是关闭,如果是关闭,则获取该爬虫源正在运行的线程集合并全部中断
|
||||||
//如果是开启,先查询数据库中状态,判断该爬虫源是否还在运行,如果在运行,则忽略,
|
// 如果是开启,先判断该爬虫源是否还在运行,如果在运行,则忽略,如果没有运行则启动线程爬取小说数据并加入到runningCrawlThread中
|
||||||
// 如果没有则修改数据库状态,并启动线程爬取小说数据加入到runningCrawlThread中
|
// 最后,保存爬虫源状态
|
||||||
if (sourceStatus == (byte) 0) {
|
if (sourceStatus == (byte) 0) {
|
||||||
//关闭,直接修改数据库状态,并直接修改数据库状态后获取该爬虫正在运行的线程集合全部停止
|
// 关闭
|
||||||
SpringUtil.getBean(CrawlService.class).updateCrawlSourceStatus(sourceId, sourceStatus);
|
// 将该爬虫源正在运行的线程集合全部停止
|
||||||
Set<Long> runningCrawlThreadId = (Set<Long>) cacheService.getObject(
|
Set<Long> runningCrawlThreadId = runningCrawlThread.get(sourceId);
|
||||||
CacheKey.RUNNING_CRAWL_THREAD_KEY_PREFIX + sourceId);
|
|
||||||
if (runningCrawlThreadId != null) {
|
if (runningCrawlThreadId != null) {
|
||||||
for (Long ThreadId : runningCrawlThreadId) {
|
for (Long ThreadId : runningCrawlThreadId) {
|
||||||
Thread thread = ThreadUtil.findThread(ThreadId);
|
Thread thread = ThreadUtil.findThread(ThreadId);
|
||||||
@ -131,16 +137,13 @@ public class CrawlServiceImpl implements CrawlService {
|
|||||||
|
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
//开启
|
// 开启
|
||||||
//查询爬虫源状态和规则
|
Byte realSourceStatus = Optional.ofNullable(crawlSourceStatusMap.get(sourceId)).orElse((byte) 0);
|
||||||
CrawlSource source = queryCrawlSource(sourceId);
|
|
||||||
Byte realSourceStatus = source.getSourceStatus();
|
|
||||||
|
|
||||||
if (realSourceStatus == (byte) 0) {
|
if (realSourceStatus == (byte) 0) {
|
||||||
//该爬虫源已经停止运行了,修改数据库状态,并启动线程爬取小说数据加入到runningCrawlThread中
|
// 查询爬虫源规则
|
||||||
SpringUtil.getBean(CrawlService.class).updateCrawlSourceStatus(sourceId, sourceStatus);
|
CrawlSource source = queryCrawlSource(sourceId);
|
||||||
|
//该爬虫源已经停止运行了,启动线程爬取小说数据并将线程加入到runningCrawlThread中
|
||||||
RuleBean ruleBean = new ObjectMapper().readValue(source.getCrawlRule(), RuleBean.class);
|
RuleBean ruleBean = new ObjectMapper().readValue(source.getCrawlRule(), RuleBean.class);
|
||||||
|
|
||||||
Set<Long> threadIds = new HashSet<>();
|
Set<Long> threadIds = new HashSet<>();
|
||||||
//按分类开始爬虫解析任务
|
//按分类开始爬虫解析任务
|
||||||
for (int i = 1; i < 8; i++) {
|
for (int i = 1; i < 8; i++) {
|
||||||
@ -149,16 +152,15 @@ public class CrawlServiceImpl implements CrawlService {
|
|||||||
thread.start();
|
thread.start();
|
||||||
//thread加入到监控缓存中
|
//thread加入到监控缓存中
|
||||||
threadIds.add(thread.getId());
|
threadIds.add(thread.getId());
|
||||||
|
|
||||||
}
|
}
|
||||||
cacheService.setObject(CacheKey.RUNNING_CRAWL_THREAD_KEY_PREFIX + sourceId, threadIds);
|
runningCrawlThread.put(sourceId, threadIds);
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// 保存爬虫源状态
|
||||||
|
crawlSourceStatusMap.put(sourceId, sourceStatus);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
@ -248,6 +250,11 @@ public class CrawlServiceImpl implements CrawlService {
|
|||||||
@Override
|
@Override
|
||||||
public void parseBookList(int catId, RuleBean ruleBean, Integer sourceId) {
|
public void parseBookList(int catId, RuleBean ruleBean, Integer sourceId) {
|
||||||
|
|
||||||
|
String catIdRule = ruleBean.getCatIdRule().get("catId" + catId);
|
||||||
|
if (StringUtils.isBlank(catIdRule)) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
//当前页码1
|
//当前页码1
|
||||||
int page = 1;
|
int page = 1;
|
||||||
int totalPage = page;
|
int totalPage = page;
|
||||||
@ -255,73 +262,91 @@ public class CrawlServiceImpl implements CrawlService {
|
|||||||
while (page <= totalPage) {
|
while (page <= totalPage) {
|
||||||
|
|
||||||
try {
|
try {
|
||||||
String catIdRule = ruleBean.getCatIdRule().get("catId" + catId);
|
String catBookListUrl;
|
||||||
if (StringUtils.isNotBlank(catIdRule)) {
|
if (StringUtils.isNotBlank(ruleBean.getBookListUrl())) {
|
||||||
String catBookListUrl = "";
|
// 兼容老规则
|
||||||
if (StringUtils.isNotBlank(ruleBean.getBookListUrl())) {
|
// 拼接分类URL
|
||||||
// 兼容老规则
|
catBookListUrl = ruleBean.getBookListUrl()
|
||||||
// 拼接分类URL
|
.replace("{catId}", catIdRule)
|
||||||
catBookListUrl = ruleBean.getBookListUrl()
|
.replace("{page}", page + "");
|
||||||
.replace("{catId}", catIdRule)
|
} else {
|
||||||
.replace("{page}", page + "");
|
// 新规则
|
||||||
} else {
|
// 拼接分类URL
|
||||||
// 新规则
|
catBookListUrl = catIdRule.replace("{page}", page + "");
|
||||||
// 拼接分类URL
|
}
|
||||||
catBookListUrl = catIdRule.replace("{page}", page + "");
|
log.info("catBookListUrl:{}", catBookListUrl);
|
||||||
}
|
|
||||||
log.info("catBookListUrl:{}", catBookListUrl);
|
|
||||||
|
|
||||||
String bookListHtml = crawlHttpClient.get(catBookListUrl, ruleBean.getCharset());
|
String bookListHtml = crawlHttpClient.get(catBookListUrl, ruleBean.getCharset());
|
||||||
if (bookListHtml != null) {
|
if (bookListHtml != null) {
|
||||||
Pattern bookIdPatten = Pattern.compile(ruleBean.getBookIdPatten());
|
Pattern bookIdPatten = Pattern.compile(ruleBean.getBookIdPatten());
|
||||||
Matcher bookIdMatcher = bookIdPatten.matcher(bookListHtml);
|
Matcher bookIdMatcher = bookIdPatten.matcher(bookListHtml);
|
||||||
boolean isFindBookId = bookIdMatcher.find();
|
boolean isFindBookId = bookIdMatcher.find();
|
||||||
while (isFindBookId) {
|
while (isFindBookId) {
|
||||||
try {
|
try {
|
||||||
//1.阻塞过程(使用了 sleep,同步锁的 wait,socket 中的 receiver,accept 等方法时)
|
//1.阻塞过程(使用了 sleep,同步锁的 wait,socket 中的 receiver,accept 等方法时)
|
||||||
//捕获中断异常InterruptedException来退出线程。
|
//捕获中断异常InterruptedException来退出线程。
|
||||||
//2.非阻塞过程中通过判断中断标志来退出线程。
|
//2.非阻塞过程中通过判断中断标志来退出线程。
|
||||||
if (Thread.currentThread().isInterrupted()) {
|
if (Thread.currentThread().isInterrupted()) {
|
||||||
return;
|
return;
|
||||||
}
|
|
||||||
|
|
||||||
String bookId = bookIdMatcher.group(1);
|
|
||||||
parseBookAndSave(catId, ruleBean, sourceId, bookId);
|
|
||||||
} catch (Exception e) {
|
|
||||||
log.error(e.getMessage(), e);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
isFindBookId = bookIdMatcher.find();
|
String bookId = bookIdMatcher.group(1);
|
||||||
|
parseBookAndSave(catId, ruleBean, sourceId, bookId);
|
||||||
|
} catch (InterruptedException e) {
|
||||||
|
log.error(e.getMessage(), e);
|
||||||
|
//1.阻塞过程(使用了 sleep,同步锁的 wait,socket 中的 receiver,accept 等方法时)
|
||||||
|
//捕获中断异常InterruptedException来退出线程。
|
||||||
|
//2.非阻塞过程中通过判断中断标志来退出线程。
|
||||||
|
return;
|
||||||
|
} catch (Exception e) {
|
||||||
|
log.error(e.getMessage(), e);
|
||||||
}
|
}
|
||||||
|
|
||||||
Pattern totalPagePatten = Pattern.compile(ruleBean.getTotalPagePatten());
|
isFindBookId = bookIdMatcher.find();
|
||||||
Matcher totalPageMatcher = totalPagePatten.matcher(bookListHtml);
|
}
|
||||||
boolean isFindTotalPage = totalPageMatcher.find();
|
|
||||||
if (isFindTotalPage) {
|
|
||||||
|
|
||||||
totalPage = Integer.parseInt(totalPageMatcher.group(1));
|
Pattern totalPagePatten = Pattern.compile(ruleBean.getTotalPagePatten());
|
||||||
|
Matcher totalPageMatcher = totalPagePatten.matcher(bookListHtml);
|
||||||
}
|
boolean isFindTotalPage = totalPageMatcher.find();
|
||||||
|
if (isFindTotalPage) {
|
||||||
|
|
||||||
|
totalPage = Integer.parseInt(totalPageMatcher.group(1));
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
} catch (InterruptedException e) {
|
||||||
|
log.error(e.getMessage(), e);
|
||||||
|
//1.阻塞过程(使用了 sleep,同步锁的 wait,socket 中的 receiver,accept 等方法时)
|
||||||
|
//捕获中断异常InterruptedException来退出线程。
|
||||||
|
//2.非阻塞过程中通过判断中断标志来退出线程。
|
||||||
|
return;
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
log.error(e.getMessage(), e);
|
log.error(e.getMessage(), e);
|
||||||
}
|
}
|
||||||
if (page == totalPage) {
|
if (page >= totalPage) {
|
||||||
// 第一遍采集完成,翻到第一页,继续第二次采集,适用于分页数比较少的最近更新列表
|
// 第一遍采集完成,翻到第一页,继续第二次采集,适用于分页数比较少的最近更新列表
|
||||||
page = 0;
|
page = 1;
|
||||||
|
try {
|
||||||
|
// 第一遍采集完成,休眠1分钟
|
||||||
|
Thread.sleep(Duration.ofMinutes(1));
|
||||||
|
} catch (InterruptedException e) {
|
||||||
|
log.error(e.getMessage(), e);
|
||||||
|
//1.阻塞过程(使用了 sleep,同步锁的 wait,socket 中的 receiver,accept 等方法时)
|
||||||
|
//捕获中断异常InterruptedException来退出线程。
|
||||||
|
//2.非阻塞过程中通过判断中断标志来退出线程。
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
page += 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
page += 1;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public boolean parseBookAndSave(int catId, RuleBean ruleBean, Integer sourceId, String bookId) {
|
public boolean parseBookAndSave(int catId, RuleBean ruleBean, Integer sourceId, String bookId)
|
||||||
|
throws InterruptedException {
|
||||||
|
|
||||||
final AtomicBoolean parseResult = new AtomicBoolean(false);
|
final AtomicBoolean parseResult = new AtomicBoolean(false);
|
||||||
|
|
||||||
@ -385,4 +410,5 @@ public class CrawlServiceImpl implements CrawlService {
|
|||||||
.render(RenderingStrategies.MYBATIS3);
|
.render(RenderingStrategies.MYBATIS3);
|
||||||
return crawlSourceMapper.selectMany(render);
|
return crawlSourceMapper.selectMany(render);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -25,13 +25,9 @@ public class CrawlHttpClient {
|
|||||||
|
|
||||||
private static final ThreadLocal<Integer> RETRY_COUNT = new ThreadLocal<>();
|
private static final ThreadLocal<Integer> RETRY_COUNT = new ThreadLocal<>();
|
||||||
|
|
||||||
public String get(String url, String charset) {
|
public String get(String url, String charset) throws InterruptedException {
|
||||||
if (Objects.nonNull(intervalMin) && Objects.nonNull(intervalMax) && intervalMax > intervalMin) {
|
if (Objects.nonNull(intervalMin) && Objects.nonNull(intervalMax) && intervalMax > intervalMin) {
|
||||||
try {
|
Thread.sleep(random.nextInt(intervalMax - intervalMin + 1) + intervalMin);
|
||||||
Thread.sleep(random.nextInt(intervalMax - intervalMin + 1) + intervalMin);
|
|
||||||
} catch (InterruptedException e) {
|
|
||||||
log.error(e.getMessage(), e);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
String body = HttpUtil.getByHttpClientWithChrome(url, charset);
|
String body = HttpUtil.getByHttpClientWithChrome(url, charset);
|
||||||
if (Objects.isNull(body) || body.length() < Constants.INVALID_HTML_LENGTH) {
|
if (Objects.isNull(body) || body.length() < Constants.INVALID_HTML_LENGTH) {
|
||||||
@ -41,7 +37,7 @@ public class CrawlHttpClient {
|
|||||||
return body;
|
return body;
|
||||||
}
|
}
|
||||||
|
|
||||||
private String processErrorHttpResult(String url, String charset) {
|
private String processErrorHttpResult(String url, String charset) throws InterruptedException{
|
||||||
Integer count = RETRY_COUNT.get();
|
Integer count = RETRY_COUNT.get();
|
||||||
if (count == null) {
|
if (count == null) {
|
||||||
count = 0;
|
count = 0;
|
||||||
|
@ -118,6 +118,9 @@
|
|||||||
示例:<b></p></b>
|
示例:<b></p></b>
|
||||||
<li><input type="text" id="descEnd" class="s_input icon_key" placeholder="小说简介结束截取字符串:">
|
<li><input type="text" id="descEnd" class="s_input icon_key" placeholder="小说简介结束截取字符串:">
|
||||||
</li>
|
</li>
|
||||||
|
示例:<b><span\s+class="allshow">([^/]+)</span></b>
|
||||||
|
<li><textarea id="filterDesc"
|
||||||
|
placeholder="过滤简介(多个内容换行)" rows="5" cols="52"></textarea></li>
|
||||||
示例:<b>更新:(\d+-\d+-\d+\s\d+:\d+:\d+)</a></b>
|
示例:<b>更新:(\d+-\d+-\d+\s\d+:\d+:\d+)</a></b>
|
||||||
<li><input type="text" id="upadateTimePatten" class="s_input icon_key"
|
<li><input type="text" id="upadateTimePatten" class="s_input icon_key"
|
||||||
placeholder="小说更新时间的正则表达式:"></li>
|
placeholder="小说更新时间的正则表达式:"></li>
|
||||||
@ -338,6 +341,9 @@
|
|||||||
|
|
||||||
crawlRule.descEnd = descEnd;
|
crawlRule.descEnd = descEnd;
|
||||||
|
|
||||||
|
var filterDesc = $("#filterDesc").val();
|
||||||
|
crawlRule.filterDesc = filterDesc;
|
||||||
|
|
||||||
var upadateTimePatten = $("#upadateTimePatten").val();
|
var upadateTimePatten = $("#upadateTimePatten").val();
|
||||||
|
|
||||||
if (upadateTimePatten.length > 0) {
|
if (upadateTimePatten.length > 0) {
|
||||||
|
@ -119,6 +119,9 @@
|
|||||||
示例:<b></p></b>
|
示例:<b></p></b>
|
||||||
<li><input type="text" id="descEnd" class="s_input icon_key" placeholder="小说简介结束截取字符串:">
|
<li><input type="text" id="descEnd" class="s_input icon_key" placeholder="小说简介结束截取字符串:">
|
||||||
</li>
|
</li>
|
||||||
|
示例:<b><span\s+class="allshow">([^/]+)</span></b>
|
||||||
|
<li><textarea id="filterDesc"
|
||||||
|
placeholder="过滤简介(多个内容换行)" rows="5" cols="52"></textarea></li>
|
||||||
示例:<b>更新:(\d+-\d+-\d+\s\d+:\d+:\d+)</a></b>
|
示例:<b>更新:(\d+-\d+-\d+\s\d+:\d+:\d+)</a></b>
|
||||||
<li><input type="text" id="upadateTimePatten" class="s_input icon_key"
|
<li><input type="text" id="upadateTimePatten" class="s_input icon_key"
|
||||||
placeholder="小说更新时间的正则表达式:"></li>
|
placeholder="小说更新时间的正则表达式:"></li>
|
||||||
@ -266,6 +269,7 @@
|
|||||||
$("#visitCountPatten").val(crawlRule.visitCountPatten);
|
$("#visitCountPatten").val(crawlRule.visitCountPatten);
|
||||||
$("#descStart").val(crawlRule.descStart);
|
$("#descStart").val(crawlRule.descStart);
|
||||||
$("#descEnd").val(crawlRule.descEnd);
|
$("#descEnd").val(crawlRule.descEnd);
|
||||||
|
$("#filterDesc").val(crawlRule.filterDesc);
|
||||||
$("#upadateTimePatten").val(crawlRule.upadateTimePatten);
|
$("#upadateTimePatten").val(crawlRule.upadateTimePatten);
|
||||||
$("#upadateTimeFormatPatten").val(crawlRule.upadateTimeFormatPatten);
|
$("#upadateTimeFormatPatten").val(crawlRule.upadateTimeFormatPatten);
|
||||||
$("#bookIndexUrl").val(crawlRule.bookIndexUrl);
|
$("#bookIndexUrl").val(crawlRule.bookIndexUrl);
|
||||||
@ -424,6 +428,9 @@
|
|||||||
|
|
||||||
crawlRule.descEnd = descEnd;
|
crawlRule.descEnd = descEnd;
|
||||||
|
|
||||||
|
var filterDesc = $("#filterDesc").val();
|
||||||
|
crawlRule.filterDesc = filterDesc;
|
||||||
|
|
||||||
var upadateTimePatten = $("#upadateTimePatten").val();
|
var upadateTimePatten = $("#upadateTimePatten").val();
|
||||||
|
|
||||||
if (upadateTimePatten.length > 0) {
|
if (upadateTimePatten.length > 0) {
|
||||||
|
@ -5,7 +5,7 @@
|
|||||||
<parent>
|
<parent>
|
||||||
<artifactId>novel</artifactId>
|
<artifactId>novel</artifactId>
|
||||||
<groupId>com.java2nb</groupId>
|
<groupId>com.java2nb</groupId>
|
||||||
<version>5.1.1</version>
|
<version>5.1.5</version>
|
||||||
</parent>
|
</parent>
|
||||||
<modelVersion>4.0.0</modelVersion>
|
<modelVersion>4.0.0</modelVersion>
|
||||||
|
|
||||||
|
2
pom.xml
2
pom.xml
@ -5,7 +5,7 @@
|
|||||||
|
|
||||||
<groupId>com.java2nb</groupId>
|
<groupId>com.java2nb</groupId>
|
||||||
<artifactId>novel</artifactId>
|
<artifactId>novel</artifactId>
|
||||||
<version>5.1.1</version>
|
<version>5.1.5</version>
|
||||||
<modules>
|
<modules>
|
||||||
<module>novel-common</module>
|
<module>novel-common</module>
|
||||||
<module>novel-front</module>
|
<module>novel-front</module>
|
||||||
|
Reference in New Issue
Block a user