From 54bd194b98f91d395cf2b8fb96f3b56cd9b0ea0c Mon Sep 17 00:00:00 2001 From: xiongxiaoyang <1179705413@qq.com> Date: Wed, 16 Jul 2025 19:52:07 +0800 Subject: [PATCH] =?UTF-8?q?feat(novel-crawl):=20=E5=A2=9E=E5=8A=A0?= =?UTF-8?q?=E7=88=AC=E8=99=AB=E6=BA=90=E9=87=87=E9=9B=86=E7=AB=A0=E8=8A=82?= =?UTF-8?q?=E6=95=B0=E9=87=8F=E7=9B=91=E6=8E=A7=E5=8A=9F=E8=83=BD?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 可以监测到爬虫源在当前环境下是否可用 --- .../java2nb/novel/core/crawl/CrawlParser.java | 29 +++++++++++++++++-- .../novel/core/listener/StarterListener.java | 2 +- .../novel/service/impl/CrawlServiceImpl.java | 13 ++++++--- .../com/java2nb/novel/vo/CrawlSourceVO.java | 2 +- .../templates/crawl/crawlSource_list.html | 27 ++++++++++++----- 5 files changed, 58 insertions(+), 15 deletions(-) diff --git a/novel-crawl/src/main/java/com/java2nb/novel/core/crawl/CrawlParser.java b/novel-crawl/src/main/java/com/java2nb/novel/core/crawl/CrawlParser.java index a08fd3d..1cb6aaf 100644 --- a/novel-crawl/src/main/java/com/java2nb/novel/core/crawl/CrawlParser.java +++ b/novel-crawl/src/main/java/com/java2nb/novel/core/crawl/CrawlParser.java @@ -12,6 +12,8 @@ import io.github.xxyopen.util.IdWorker; import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; import org.apache.commons.lang3.StringUtils; +import org.springframework.data.redis.core.RedisTemplate; +import org.springframework.data.redis.core.StringRedisTemplate; import org.springframework.stereotype.Component; import java.text.ParseException; @@ -34,6 +36,13 @@ public class CrawlParser { private final CrawlHttpClient crawlHttpClient; + private final StringRedisTemplate stringRedisTemplate; + + /** + * 爬虫源采集章节数量缓存key + */ + private static final String CRAWL_SOURCE_CHAPTER_COUNT_CACHE_KEY = "crawlSource:chapterCount:"; + /** * 爬虫任务进度 */ @@ -53,6 +62,20 @@ public class CrawlParser { crawlTaskProgress.remove(taskId); } + /** + * 获取爬虫源采集的章节数量 + */ + public Long getCrawlSourceChapterCount(Integer sourceId) { + return Optional.ofNullable( + stringRedisTemplate.opsForValue().get(CRAWL_SOURCE_CHAPTER_COUNT_CACHE_KEY + sourceId)).map(v -> { + try { + return Long.parseLong(v); + } catch (NumberFormatException e) { + return 0L; + } + }).orElse(0L); + } + public void parseBook(RuleBean ruleBean, String bookId, CrawlBookHandler handler) throws InterruptedException { Book book = new Book(); @@ -182,7 +205,7 @@ public class CrawlParser { handler.handle(book); } - public boolean parseBookIndexAndContent(String sourceBookId, Book book, RuleBean ruleBean, + public boolean parseBookIndexAndContent(String sourceBookId, Book book, RuleBean ruleBean, Integer sourceId, Map existBookIndexMap, CrawlBookChapterHandler handler, CrawlSingleTask task) throws InterruptedException { @@ -314,10 +337,12 @@ public class CrawlParser { bookIndex.setUpdateTime(currentDate); if (task != null) { - // 更新采集进度 + // 更新单本任务采集进度 crawlTaskProgress.put(task.getId(), indexList.size()); } + // 更新爬虫源采集章节数量 + stringRedisTemplate.opsForValue().increment(CRAWL_SOURCE_CHAPTER_COUNT_CACHE_KEY + sourceId); } diff --git a/novel-crawl/src/main/java/com/java2nb/novel/core/listener/StarterListener.java b/novel-crawl/src/main/java/com/java2nb/novel/core/listener/StarterListener.java index 4ca21cd..cfad5ee 100644 --- a/novel-crawl/src/main/java/com/java2nb/novel/core/listener/StarterListener.java +++ b/novel-crawl/src/main/java/com/java2nb/novel/core/listener/StarterListener.java @@ -74,7 +74,7 @@ public class StarterListener implements ServletContextInitializer { needUpdateBook.getId()); //解析章节目录 crawlParser.parseBookIndexAndContent(needUpdateBook.getCrawlBookId(), book, - ruleBean, existBookIndexMap, + ruleBean, needUpdateBook.getCrawlSourceId(), existBookIndexMap, chapter -> bookService.updateBookAndIndexAndContent(book, chapter.getBookIndexList(), chapter.getBookContentList(), existBookIndexMap), null); diff --git a/novel-crawl/src/main/java/com/java2nb/novel/service/impl/CrawlServiceImpl.java b/novel-crawl/src/main/java/com/java2nb/novel/service/impl/CrawlServiceImpl.java index 305191c..e65d7e9 100644 --- a/novel-crawl/src/main/java/com/java2nb/novel/service/impl/CrawlServiceImpl.java +++ b/novel-crawl/src/main/java/com/java2nb/novel/service/impl/CrawlServiceImpl.java @@ -104,10 +104,15 @@ public class CrawlServiceImpl implements CrawlService { .build() .render(RenderingStrategies.MYBATIS3); List crawlSources = crawlSourceMapper.selectMany(render); - crawlSources.forEach(crawlSource -> crawlSource.setSourceStatus( - Optional.ofNullable(crawlSourceStatusMap.get(crawlSource.getId())).orElse((byte) 0))); PageBean pageBean = PageBuilder.build(crawlSources); - pageBean.setList(BeanUtil.copyList(crawlSources, CrawlSourceVO.class)); + List crawlSourceVOS = BeanUtil.copyList(crawlSources, CrawlSourceVO.class); + crawlSourceVOS.forEach(crawlSource -> { + crawlSource.setSourceStatus( + Optional.ofNullable(crawlSourceStatusMap.get(crawlSource.getId())).orElse((byte) 0)); + crawlSource.setChapterCount(crawlParser.getCrawlSourceChapterCount(crawlSource.getId())); + } + ); + pageBean.setList(crawlSourceVOS); return pageBean; } @@ -386,7 +391,7 @@ public class CrawlServiceImpl implements CrawlService { book.setCrawlLastTime(new Date()); book.setId(idWorker.nextId()); //解析章节目录 - boolean parseIndexContentResult = crawlParser.parseBookIndexAndContent(bookId, book, ruleBean, + boolean parseIndexContentResult = crawlParser.parseBookIndexAndContent(bookId, book, ruleBean, sourceId, new HashMap<>(0), chapter -> { bookService.saveBookAndIndexAndContent(book, chapter.getBookIndexList(), chapter.getBookContentList()); diff --git a/novel-crawl/src/main/java/com/java2nb/novel/vo/CrawlSourceVO.java b/novel-crawl/src/main/java/com/java2nb/novel/vo/CrawlSourceVO.java index 45860fb..d15b636 100644 --- a/novel-crawl/src/main/java/com/java2nb/novel/vo/CrawlSourceVO.java +++ b/novel-crawl/src/main/java/com/java2nb/novel/vo/CrawlSourceVO.java @@ -20,7 +20,7 @@ public class CrawlSourceVO extends CrawlSource{ @JsonFormat(timezone = "GMT+8", pattern = "yyyy-MM-dd HH:mm") private Date updateTime; - + private Long chapterCount; @Override public String toString() { diff --git a/novel-crawl/src/main/resources/templates/crawl/crawlSource_list.html b/novel-crawl/src/main/resources/templates/crawl/crawlSource_list.html index 35ebdbb..6d68121 100644 --- a/novel-crawl/src/main/resources/templates/crawl/crawlSource_list.html +++ b/novel-crawl/src/main/resources/templates/crawl/crawlSource_list.html @@ -43,7 +43,7 @@ 序号 - + 爬虫源 @@ -52,6 +52,9 @@ 更新时间 + + 采集数量 + 状态 @@ -111,11 +114,17 @@