feat(novel-crawl): add monitoring of the number of chapters collected by each crawl source

Makes it possible to monitor whether a crawl source is still usable in the current environment.
xiongxiaoyang
2025-07-16 19:52:07 +08:00
parent 3d41cf3ebb
commit 54bd194b98
5 changed files with 58 additions and 15 deletions
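
At its core, the feature keeps one Redis counter per crawl source: CrawlParser increments it for every chapter it collects, and the crawl-source list page reads it back so an administrator can see whether a source is still producing chapters in the current environment. Below is a minimal, self-contained sketch of that counting pattern, assuming a Spring-managed StringRedisTemplate; the CrawlSourceChapterCounter class and method names are illustrative and not part of this commit.

import java.util.Optional;

import org.springframework.data.redis.core.StringRedisTemplate;
import org.springframework.stereotype.Component;

// Illustrative helper wrapping the per-source chapter counter (names are hypothetical).
@Component
public class CrawlSourceChapterCounter {

    // Same key prefix that the commit adds to CrawlParser.
    private static final String KEY_PREFIX = "crawlSource:chapterCount:";

    private final StringRedisTemplate stringRedisTemplate;

    public CrawlSourceChapterCounter(StringRedisTemplate stringRedisTemplate) {
        this.stringRedisTemplate = stringRedisTemplate;
    }

    // Called once for each chapter collected from the given source.
    public void recordChapter(Integer sourceId) {
        stringRedisTemplate.opsForValue().increment(KEY_PREFIX + sourceId);
    }

    // Reads the counter for display; missing or malformed values fall back to 0.
    public Long chapterCount(Integer sourceId) {
        return Optional.ofNullable(stringRedisTemplate.opsForValue().get(KEY_PREFIX + sourceId))
                .map(value -> {
                    try {
                        return Long.parseLong(value);
                    } catch (NumberFormatException e) {
                        return 0L;
                    }
                })
                .orElse(0L);
    }
}

Since the list page re-runs search() every 10 seconds, a counter that stops growing is a quick signal that the corresponding source is blocked or broken.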

CrawlParser.java

@@ -12,6 +12,8 @@ import io.github.xxyopen.util.IdWorker;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.springframework.data.redis.core.RedisTemplate;
import org.springframework.data.redis.core.StringRedisTemplate;
import org.springframework.stereotype.Component;
import java.text.ParseException;
@@ -34,6 +36,13 @@ public class CrawlParser {
private final CrawlHttpClient crawlHttpClient;
private final StringRedisTemplate stringRedisTemplate;
/**
* Cache key for the number of chapters collected by a crawl source
*/
private static final String CRAWL_SOURCE_CHAPTER_COUNT_CACHE_KEY = "crawlSource:chapterCount:";
/**
* Crawl task progress
*/
@@ -53,6 +62,20 @@ public class CrawlParser {
crawlTaskProgress.remove(taskId);
}
/**
* Get the number of chapters collected by a crawl source
*/
public Long getCrawlSourceChapterCount(Integer sourceId) {
return Optional.ofNullable(
stringRedisTemplate.opsForValue().get(CRAWL_SOURCE_CHAPTER_COUNT_CACHE_KEY + sourceId)).map(v -> {
try {
return Long.parseLong(v);
} catch (NumberFormatException e) {
return 0L;
}
}).orElse(0L);
}
public void parseBook(RuleBean ruleBean, String bookId, CrawlBookHandler handler)
throws InterruptedException {
Book book = new Book();
@@ -182,7 +205,7 @@
handler.handle(book);
}
public boolean parseBookIndexAndContent(String sourceBookId, Book book, RuleBean ruleBean,
public boolean parseBookIndexAndContent(String sourceBookId, Book book, RuleBean ruleBean, Integer sourceId,
Map<Integer, BookIndex> existBookIndexMap, CrawlBookChapterHandler handler, CrawlSingleTask task)
throws InterruptedException {
@@ -314,10 +337,12 @@
bookIndex.setUpdateTime(currentDate);
if (task != null) {
// Update crawl progress
// Update the crawl progress of the single-book task
crawlTaskProgress.put(task.getId(), indexList.size());
}
// Update the number of chapters collected by the crawl source
stringRedisTemplate.opsForValue().increment(CRAWL_SOURCE_CHAPTER_COUNT_CACHE_KEY + sourceId);
}

StarterListener.java

@@ -74,7 +74,7 @@ public class StarterListener implements ServletContextInitializer {
needUpdateBook.getId());
// Parse the chapter index
crawlParser.parseBookIndexAndContent(needUpdateBook.getCrawlBookId(), book,
ruleBean, existBookIndexMap,
ruleBean, needUpdateBook.getCrawlSourceId(), existBookIndexMap,
chapter -> bookService.updateBookAndIndexAndContent(book,
chapter.getBookIndexList(),
chapter.getBookContentList(), existBookIndexMap), null);

CrawlServiceImpl.java

@@ -104,10 +104,15 @@ public class CrawlServiceImpl implements CrawlService {
.build()
.render(RenderingStrategies.MYBATIS3);
List<CrawlSource> crawlSources = crawlSourceMapper.selectMany(render);
crawlSources.forEach(crawlSource -> crawlSource.setSourceStatus(
Optional.ofNullable(crawlSourceStatusMap.get(crawlSource.getId())).orElse((byte) 0)));
PageBean<CrawlSource> pageBean = PageBuilder.build(crawlSources);
pageBean.setList(BeanUtil.copyList(crawlSources, CrawlSourceVO.class));
List<CrawlSourceVO> crawlSourceVOS = BeanUtil.copyList(crawlSources, CrawlSourceVO.class);
crawlSourceVOS.forEach(crawlSource -> {
crawlSource.setSourceStatus(
Optional.ofNullable(crawlSourceStatusMap.get(crawlSource.getId())).orElse((byte) 0));
crawlSource.setChapterCount(crawlParser.getCrawlSourceChapterCount(crawlSource.getId()));
}
);
pageBean.setList(crawlSourceVOS);
return pageBean;
}
@@ -386,7 +391,7 @@
book.setCrawlLastTime(new Date());
book.setId(idWorker.nextId());
// Parse the chapter index
boolean parseIndexContentResult = crawlParser.parseBookIndexAndContent(bookId, book, ruleBean,
boolean parseIndexContentResult = crawlParser.parseBookIndexAndContent(bookId, book, ruleBean, sourceId,
new HashMap<>(0), chapter -> {
bookService.saveBookAndIndexAndContent(book, chapter.getBookIndexList(),
chapter.getBookContentList());

CrawlSourceVO.java

@@ -20,7 +20,7 @@ public class CrawlSourceVO extends CrawlSource{
@JsonFormat(timezone = "GMT+8", pattern = "yyyy-MM-dd HH:mm")
private Date updateTime;
private Long chapterCount;
@Override
public String toString() {

Crawl source list page (HTML template)

@@ -43,7 +43,7 @@
<th class="style">
序号
</th>
<th class="chapter">
<th class="name">
爬虫源
</th>
<th class="name">
@@ -52,6 +52,9 @@
<th class="name">
更新时间
</th>
<th class="goread">
采集数量
</th>
<th class="goread">
状态
</th>
@@ -111,11 +114,17 @@
<script src="/javascript/header.js" type="text/javascript"></script>
<script src="/javascript/user.js" type="text/javascript"></script>
<script language="javascript" type="text/javascript">
search(1, 10);
let curr = 1;
let limit = 10;
search();
setInterval(function(){
search();
}, 10000);
var pageCrawlSourceList = null;
function search(curr, limit) {
function search() {
$.ajax({
type: "get",
@@ -134,13 +143,15 @@
" <td class=\"style bookclass\">\n" +
" [" + (i + 1) + "]\n" +
" </td>\n" +
" <td class=\"chapter\">\n" +
" <td class=\"name\">\n" +
" " + crawlSource.sourceName + "</td>\n" +
" <td class=\"name\" valsc=\"291|2037554|1\">"
+ crawlSource.createTime + "</td>\n" +
" <td class=\"name\">\n" +
" " + crawlSource.updateTime + "\n" +
" </td>\n" +
" <td class=\"goread\">\n" +
" " + crawlSource.chapterCount + "章</td>\n" +
" <td class=\"goread\" id='sourceStatus" + crawlSource.id + "'>" + (crawlSource.sourceStatus == 0 ? '停止运行' : '正在运行') +
" </td>\n" +
@@ -169,7 +180,9 @@
// Not executed on the first render
if (!first) {
search(obj.curr, obj.limit);
curr = obj.curr;
limit = obj.limit;
search();
} else {
}
@@ -216,11 +229,11 @@
if (status == 0) {
// Start
$("#sourceStatus" + sourceId).html("正在运行");
$("#opt" + sourceId).html("<a href='javascript:openOrStopCrawl(" + sourceId + "," + 1 + ")'>关闭</a>");
$("#opt" + sourceId).html("<a href='javascript:openOrStopCrawl(" + sourceId + "," + 1 + ")'>关闭 </a>"+"<a href='javascript:updateCrawlSource(" + sourceId + ")'>修改 </a>");
} else {
// Stop
$("#sourceStatus" + sourceId).html("停止运行");
$("#opt" + sourceId).html("<a href='javascript:openOrStopCrawl(" + sourceId + "," + 0 + ")'>开启</a>");
$("#opt" + sourceId).html("<a href='javascript:openOrStopCrawl(" + sourceId + "," + 0 + ")'>开启 </a>"+"<a href='javascript:updateCrawlSource(" + sourceId + ")'>修改 </a>");
}