feat(novel-crawl): add monitoring of the number of chapters collected by each crawl source

Makes it possible to monitor whether a crawl source is still usable in the current environment.
xiongxiaoyang
2025-07-16 19:52:07 +08:00
parent 3d41cf3ebb
commit 54bd194b98
5 changed files with 58 additions and 15 deletions
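
At its core, the feature keeps one Redis counter per crawl source: CrawlParser increments it for every chapter it collects, and the crawl-source list page reads it back so an administrator can see whether a source is still producing chapters in the current environment. Below is a minimal, self-contained sketch of that counting pattern, assuming a Spring-managed StringRedisTemplate; the CrawlSourceChapterCounter class and method names are illustrative and not part of this commit.

import java.util.Optional;

import org.springframework.data.redis.core.StringRedisTemplate;
import org.springframework.stereotype.Component;

// Illustrative helper wrapping the per-source chapter counter (names are hypothetical).
@Component
public class CrawlSourceChapterCounter {

    // Same key prefix that the commit adds to CrawlParser.
    private static final String KEY_PREFIX = "crawlSource:chapterCount:";

    private final StringRedisTemplate stringRedisTemplate;

    public CrawlSourceChapterCounter(StringRedisTemplate stringRedisTemplate) {
        this.stringRedisTemplate = stringRedisTemplate;
    }

    // Called once for each chapter collected from the given source.
    public void recordChapter(Integer sourceId) {
        stringRedisTemplate.opsForValue().increment(KEY_PREFIX + sourceId);
    }

    // Reads the counter for display; missing or malformed values fall back to 0.
    public Long chapterCount(Integer sourceId) {
        return Optional.ofNullable(stringRedisTemplate.opsForValue().get(KEY_PREFIX + sourceId))
                .map(value -> {
                    try {
                        return Long.parseLong(value);
                    } catch (NumberFormatException e) {
                        return 0L;
                    }
                })
                .orElse(0L);
    }
}

Since the list page re-runs search() every 10 seconds, a counter that stops growing is a quick signal that the corresponding source is blocked or broken.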

CrawlParser.java

@@ -12,6 +12,8 @@ import io.github.xxyopen.util.IdWorker;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.springframework.data.redis.core.RedisTemplate;
import org.springframework.data.redis.core.StringRedisTemplate;
import org.springframework.stereotype.Component;
import java.text.ParseException;
@@ -34,6 +36,13 @@ public class CrawlParser {
private final CrawlHttpClient crawlHttpClient;
private final StringRedisTemplate stringRedisTemplate;
/**
* Cache key for the number of chapters collected by a crawl source
*/
private static final String CRAWL_SOURCE_CHAPTER_COUNT_CACHE_KEY = "crawlSource:chapterCount:";
/**
* Crawl task progress
*/
@@ -53,6 +62,20 @@ public class CrawlParser {
crawlTaskProgress.remove(taskId);
}
/**
* Get the number of chapters collected by a crawl source
*/
public Long getCrawlSourceChapterCount(Integer sourceId) {
return Optional.ofNullable(
stringRedisTemplate.opsForValue().get(CRAWL_SOURCE_CHAPTER_COUNT_CACHE_KEY + sourceId)).map(v -> {
try {
return Long.parseLong(v);
} catch (NumberFormatException e) {
return 0L;
}
}).orElse(0L);
}
public void parseBook(RuleBean ruleBean, String bookId, CrawlBookHandler handler)
throws InterruptedException {
Book book = new Book();
@@ -182,7 +205,7 @@
handler.handle(book);
}
public boolean parseBookIndexAndContent(String sourceBookId, Book book, RuleBean ruleBean,
public boolean parseBookIndexAndContent(String sourceBookId, Book book, RuleBean ruleBean, Integer sourceId,
Map<Integer, BookIndex> existBookIndexMap, CrawlBookChapterHandler handler, CrawlSingleTask task)
throws InterruptedException {
@@ -314,10 +337,12 @@
bookIndex.setUpdateTime(currentDate);
if (task != null) {
// Update crawl progress
// Update the crawl progress of the single-book task
crawlTaskProgress.put(task.getId(), indexList.size());
}
// Update the number of chapters collected by the crawl source
stringRedisTemplate.opsForValue().increment(CRAWL_SOURCE_CHAPTER_COUNT_CACHE_KEY + sourceId);
}

StarterListener.java

@@ -74,7 +74,7 @@ public class StarterListener implements ServletContextInitializer {
needUpdateBook.getId());
// Parse the chapter index
crawlParser.parseBookIndexAndContent(needUpdateBook.getCrawlBookId(), book,
ruleBean, existBookIndexMap,
ruleBean, needUpdateBook.getCrawlSourceId(), existBookIndexMap,
chapter -> bookService.updateBookAndIndexAndContent(book,
chapter.getBookIndexList(),
chapter.getBookContentList(), existBookIndexMap), null);

CrawlServiceImpl.java

@@ -104,10 +104,15 @@ public class CrawlServiceImpl implements CrawlService {
.build()
.render(RenderingStrategies.MYBATIS3);
List<CrawlSource> crawlSources = crawlSourceMapper.selectMany(render);
crawlSources.forEach(crawlSource -> crawlSource.setSourceStatus(
Optional.ofNullable(crawlSourceStatusMap.get(crawlSource.getId())).orElse((byte) 0)));
PageBean<CrawlSource> pageBean = PageBuilder.build(crawlSources);
pageBean.setList(BeanUtil.copyList(crawlSources, CrawlSourceVO.class));
List<CrawlSourceVO> crawlSourceVOS = BeanUtil.copyList(crawlSources, CrawlSourceVO.class);
crawlSourceVOS.forEach(crawlSource -> {
crawlSource.setSourceStatus(
Optional.ofNullable(crawlSourceStatusMap.get(crawlSource.getId())).orElse((byte) 0));
crawlSource.setChapterCount(crawlParser.getCrawlSourceChapterCount(crawlSource.getId()));
}
);
pageBean.setList(crawlSourceVOS);
return pageBean;
}
@@ -386,7 +391,7 @@
book.setCrawlLastTime(new Date());
book.setId(idWorker.nextId());
// Parse the chapter index
boolean parseIndexContentResult = crawlParser.parseBookIndexAndContent(bookId, book, ruleBean,
boolean parseIndexContentResult = crawlParser.parseBookIndexAndContent(bookId, book, ruleBean, sourceId,
new HashMap<>(0), chapter -> {
bookService.saveBookAndIndexAndContent(book, chapter.getBookIndexList(),
chapter.getBookContentList());

CrawlSourceVO.java

@@ -20,7 +20,7 @@ public class CrawlSourceVO extends CrawlSource{
@JsonFormat(timezone = "GMT+8", pattern = "yyyy-MM-dd HH:mm")
private Date updateTime;
private Long chapterCount;
@Override
public String toString() {

Crawl source list page (HTML template)

@@ -43,7 +43,7 @@
<th class="style">
序号
</th>
<th class="chapter">
<th class="name">
爬虫源
</th>
<th class="name">
@@ -52,6 +52,9 @@
<th class="name">
更新时间
</th>
<th class="goread">
采集数量
</th>
<th class="goread">
状态
</th>
@@ -111,11 +114,17 @@
<script src="/javascript/header.js" type="text/javascript"></script>
<script src="/javascript/user.js" type="text/javascript"></script>
<script language="javascript" type="text/javascript">
search(1, 10);
let curr = 1;
let limit = 10;
search();
setInterval(function(){
search();
}, 10000);
var pageCrawlSourceList = null;
function search(curr, limit) {
function search() {
$.ajax({
type: "get",
@@ -134,13 +143,15 @@
" <td class=\"style bookclass\">\n" +
" [" + (i + 1) + "]\n" +
" </td>\n" +
" <td class=\"chapter\">\n" +
" <td class=\"name\">\n" +
" " + crawlSource.sourceName + "</td>\n" +
" <td class=\"name\" valsc=\"291|2037554|1\">"
+ crawlSource.createTime + "</td>\n" +
" <td class=\"name\">\n" +
" " + crawlSource.updateTime + "\n" +
" </td>\n" +
" <td class=\"goread\">\n" +
" " + crawlSource.chapterCount + "章</td>\n" +
" <td class=\"goread\" id='sourceStatus" + crawlSource.id + "'>" + (crawlSource.sourceStatus == 0 ? '停止运行' : '正在运行') +
" </td>\n" +
@@ -169,7 +180,9 @@
// Not executed on the first render
if (!first) {
search(obj.curr, obj.limit);
curr = obj.curr;
limit = obj.limit;
search();
} else {
}
@@ -216,11 +229,11 @@
if (status == 0) {
// Start
$("#sourceStatus" + sourceId).html("正在运行");
$("#opt" + sourceId).html("<a href='javascript:openOrStopCrawl(" + sourceId + "," + 1 + ")'>关闭</a>");
$("#opt" + sourceId).html("<a href='javascript:openOrStopCrawl(" + sourceId + "," + 1 + ")'>关闭 </a>"+"<a href='javascript:updateCrawlSource(" + sourceId + ")'>修改 </a>");
} else {
// Stop
$("#sourceStatus" + sourceId).html("停止运行");
$("#opt" + sourceId).html("<a href='javascript:openOrStopCrawl(" + sourceId + "," + 0 + ")'>开启</a>");
$("#opt" + sourceId).html("<a href='javascript:openOrStopCrawl(" + sourceId + "," + 0 + ")'>开启 </a>"+"<a href='javascript:updateCrawlSource(" + sourceId + ")'>修改 </a>");
}