爬虫部分代码重构，准备适配TXT文本存储方案

2025-07-13 04:36:40 +00:00 · 2021-08-17 11:08:51 +08:00
parent 7f0331e095
commit cbfd0b049f
5 changed files with 89 additions and 55 deletions
--- a/novel-crawl/src/main/java/com/java2nb/novel/core/crawl/CrawlBookChapterHandler.java
+++ b/novel-crawl/src/main/java/com/java2nb/novel/core/crawl/CrawlBookChapterHandler.java
@@ -0,0 +1,12 @@
+package com.java2nb.novel.core.crawl;
+
+import com.java2nb.novel.entity.Book;
+
+/**
+ * Handler for the chapter content of a crawled novel; {@code handle} receives the parsed {@link ChapterBean}.
+ */
+public interface CrawlBookChapterHandler {
+
+    void handle(ChapterBean chapterBean);
+
+}
--- a/novel-crawl/src/main/java/com/java2nb/novel/core/crawl/CrawlBookHandler.java
+++ b/novel-crawl/src/main/java/com/java2nb/novel/core/crawl/CrawlBookHandler.java
@@ -0,0 +1,12 @@
+package com.java2nb.novel.core.crawl;
+
+import com.java2nb.novel.entity.Book;
+
+/**
+ * Handler for a crawled novel; {@code handle} receives the parsed {@link Book}.
+ */
+public interface CrawlBookHandler {
+
+    void handle(Book book);
+
+}
--- a/novel-crawl/src/main/java/com/java2nb/novel/core/crawl/CrawlParser.java
+++ b/novel-crawl/src/main/java/com/java2nb/novel/core/crawl/CrawlParser.java
@@ -33,7 +33,7 @@ public class CrawlParser {
    private static final ThreadLocal<Integer> retryCount = new ThreadLocal<>();

    @SneakyThrows
-    public static Book parseBook(RuleBean ruleBean, String bookId) {
+    public static void parseBook(RuleBean ruleBean, String bookId, CrawlBookHandler handler) {
        Book book = new Book();
        String bookDetailUrl = ruleBean.getBookDetailUrl().replace("{bookId}", bookId);
        String bookDetailHtml = getByHttpClientWithChrome(bookDetailUrl);
@@ -135,10 +135,10 @@ public class CrawlParser {
                }
            }
        }
-        return book;
+        handler.handle(book);
    }

-    public static ChapterBean parseBookIndexAndContent(String sourceBookId, Book book, RuleBean ruleBean, Map<Integer, BookIndex> existBookIndexMap) {
+    public static void parseBookIndexAndContent(String sourceBookId, Book book, RuleBean ruleBean, Map<Integer, BookIndex> existBookIndexMap, CrawlBookChapterHandler handler) {

        Date currentDate = new Date();

@@ -228,7 +228,7 @@ public class CrawlParser {
                            bookContent.setIndexId(hasIndex.getId());

                            //计算总字数
-                            totalWordCount = (totalWordCount+wordCount-hasIndex.getWordCount());
+                            totalWordCount = (totalWordCount + wordCount - hasIndex.getWordCount());
                        } else {
                            //章节插入
                            //设置目录和章节内容
@@ -246,7 +246,6 @@ public class CrawlParser {
                        bookIndex.setUpdateTime(currentDate);


-
                    }


@@ -259,7 +258,7 @@ public class CrawlParser {
            if (indexList.size() > 0) {
                //如果有爬到最新章节，则设置小说主表的最新章节信息
                //获取爬取到的最新章节
-                BookIndex lastIndex = indexList.get(indexList.size()-1);
+                BookIndex lastIndex = indexList.get(indexList.size() - 1);
                book.setLastIndexId(lastIndex.getId());
                book.setLastIndexName(lastIndex.getIndexName());
                book.setLastIndexUpdateTime(currentDate);
@@ -270,20 +269,22 @@ public class CrawlParser {

            if (indexList.size() == contentList.size() && indexList.size() > 0) {

-                return new ChapterBean(){{
+                handler.handle(new ChapterBean() {{
                    setBookIndexList(indexList);
                    setBookContentList(contentList);
-                }};
+                }});
+
+                return;

            }

        }

-
-        return new ChapterBean(){{
+        handler.handle(new ChapterBean() {{
            setBookIndexList(new ArrayList<>(0));
            setBookContentList(new ArrayList<>(0));
-        }};
+        }});
+
    }


--- a/novel-crawl/src/main/java/com/java2nb/novel/core/listener/StarterListener.java
+++ b/novel-crawl/src/main/java/com/java2nb/novel/core/listener/StarterListener.java
@@ -56,19 +56,21 @@ public class StarterListener implements ServletContextListener {
                                CrawlSource source = crawlService.queryCrawlSource(needUpdateBook.getCrawlSourceId());
                                RuleBean ruleBean = new ObjectMapper().readValue(source.getCrawlRule(), RuleBean.class);
                                //解析小说基本信息
-                                Book book = CrawlParser.parseBook(ruleBean, needUpdateBook.getCrawlBookId());
-                                //这里只做老书更新
-                                book.setId(needUpdateBook.getId());
-                                book.setWordCount(needUpdateBook.getWordCount());
-                                if (needUpdateBook.getPicUrl() != null && needUpdateBook.getPicUrl().contains(Constants.LOCAL_PIC_PREFIX)) {
-                                    //本地图片则不更新
-                                    book.setPicUrl(null);
-                                }
-                                //查询已存在的章节
-                                Map<Integer, BookIndex> existBookIndexMap = bookService.queryExistBookIndexMap(needUpdateBook.getId());
-                                //解析章节目录
-                                ChapterBean chapter = CrawlParser.parseBookIndexAndContent(needUpdateBook.getCrawlBookId(), book, ruleBean, existBookIndexMap);
-                                bookService.updateBookAndIndexAndContent(book, chapter.getBookIndexList(), chapter.getBookContentList(), existBookIndexMap);
+                                CrawlParser.parseBook(ruleBean, needUpdateBook.getCrawlBookId(),book -> {
+                                    //这里只做老书更新
+                                    book.setId(needUpdateBook.getId());
+                                    book.setWordCount(needUpdateBook.getWordCount());
+                                    if (needUpdateBook.getPicUrl() != null && needUpdateBook.getPicUrl().contains(Constants.LOCAL_PIC_PREFIX)) {
+                                        //本地图片则不更新
+                                        book.setPicUrl(null);
+                                    }
+                                    //查询已存在的章节
+                                    Map<Integer, BookIndex> existBookIndexMap = bookService.queryExistBookIndexMap(needUpdateBook.getId());
+                                    //解析章节目录
+                                    CrawlParser.parseBookIndexAndContent(needUpdateBook.getCrawlBookId(), book, ruleBean, existBookIndexMap,chapter -> {
+                                        bookService.updateBookAndIndexAndContent(book, chapter.getBookIndexList(), chapter.getBookContentList(), existBookIndexMap);
+                                    });
+                                });
                            } catch (Exception e) {
                                log.error(e.getMessage(), e);
                            }
--- a/novel-crawl/src/main/java/com/java2nb/novel/service/impl/CrawlServiceImpl.java
+++ b/novel-crawl/src/main/java/com/java2nb/novel/service/impl/CrawlServiceImpl.java
@@ -5,7 +5,6 @@ import com.github.pagehelper.PageHelper;
 import com.java2nb.novel.core.bean.PageBean;
 import com.java2nb.novel.core.cache.CacheKey;
 import com.java2nb.novel.core.cache.CacheService;
-import com.java2nb.novel.core.crawl.ChapterBean;
 import com.java2nb.novel.core.crawl.CrawlParser;
 import com.java2nb.novel.core.crawl.RuleBean;
 import com.java2nb.novel.core.enums.ResponseStatus;
@@ -34,6 +33,7 @@ import org.mybatis.dynamic.sql.select.render.SelectStatementProvider;
 import org.springframework.stereotype.Service;

 import java.util.*;
+import java.util.concurrent.atomic.AtomicBoolean;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;

@@ -274,39 +274,46 @@ public class CrawlServiceImpl implements CrawlService {

    @Override
    public boolean parseBookAndSave(int catId, RuleBean ruleBean, Integer sourceId, String bookId) {
-        Book book = CrawlParser.parseBook(ruleBean, bookId);
-        if(book.getBookName() == null || book.getAuthorName() == null){
-            return false;
-        }
-        //这里只做新书入库，查询是否存在这本书
-        Book existBook = bookService.queryBookByBookNameAndAuthorName(book.getBookName(), book.getAuthorName());
-        //如果该小说不存在，则可以解析入库，但是标记该小说正在入库，30分钟之后才允许再次入库
-        if (existBook == null) {
-            //没有该书，可以入库
-            book.setCatId(catId);
-            //根据分类ID查询分类
-            book.setCatName(bookService.queryCatNameByCatId(catId));
-            if (catId == 7) {
-                //女频
-                book.setWorkDirection((byte) 1);
-            } else {
-                //男频
-                book.setWorkDirection((byte) 0);
+
+        final AtomicBoolean parseResult = new AtomicBoolean(false);
+
+        CrawlParser.parseBook(ruleBean, bookId, book -> {
+            if(book.getBookName() == null || book.getAuthorName() == null){
+                return;
            }
-            book.setCrawlBookId(bookId);
-            book.setCrawlSourceId(sourceId);
-            book.setCrawlLastTime(new Date());
-            book.setId(new IdWorker().nextId());
-            //解析章节目录
-            ChapterBean chapter = CrawlParser.parseBookIndexAndContent(bookId, book, ruleBean, new HashMap<>(0));
+            //这里只做新书入库，查询是否存在这本书
+            Book existBook = bookService.queryBookByBookNameAndAuthorName(book.getBookName(), book.getAuthorName());
+            //如果该小说不存在，则可以解析入库，但是标记该小说正在入库，30分钟之后才允许再次入库
+            if (existBook == null) {
+                //没有该书，可以入库
+                book.setCatId(catId);
+                //根据分类ID查询分类
+                book.setCatName(bookService.queryCatNameByCatId(catId));
+                if (catId == 7) {
+                    //女频
+                    book.setWorkDirection((byte) 1);
+                } else {
+                    //男频
+                    book.setWorkDirection((byte) 0);
+                }
+                book.setCrawlBookId(bookId);
+                book.setCrawlSourceId(sourceId);
+                book.setCrawlLastTime(new Date());
+                book.setId(new IdWorker().nextId());
+                //解析章节目录
+                CrawlParser.parseBookIndexAndContent(bookId, book, ruleBean, new HashMap<>(0),chapter -> {
+                    bookService.saveBookAndIndexAndContent(book, chapter.getBookIndexList(), chapter.getBookContentList());
+                });

-            bookService.saveBookAndIndexAndContent(book, chapter.getBookIndexList(), chapter.getBookContentList());
+            } else {
+                //只更新书籍的爬虫相关字段
+                bookService.updateCrawlProperties(existBook.getId(), sourceId, bookId);
+            }
+            parseResult.set(true);
+        });
+
+        return parseResult.get();

-        } else {
-            //只更新书籍的爬虫相关字段
-            bookService.updateCrawlProperties(existBook.getId(), sourceId, bookId);
-        }
-        return true;
    }

    @Override