diff --git a/novel-crawl/src/main/java/com/java2nb/novel/core/crawl/CrawlBookChapterHandler.java b/novel-crawl/src/main/java/com/java2nb/novel/core/crawl/CrawlBookChapterHandler.java new file mode 100644 index 0000000..e0a4eb6 --- /dev/null +++ b/novel-crawl/src/main/java/com/java2nb/novel/core/crawl/CrawlBookChapterHandler.java @@ -0,0 +1,12 @@ +package com.java2nb.novel.core.crawl; + +import com.java2nb.novel.entity.Book; + +/** + * 爬虫小说章节内容处理器 + * */ +public interface CrawlBookChapterHandler { + + void handle(ChapterBean chapterBean); + +} diff --git a/novel-crawl/src/main/java/com/java2nb/novel/core/crawl/CrawlBookHandler.java b/novel-crawl/src/main/java/com/java2nb/novel/core/crawl/CrawlBookHandler.java new file mode 100644 index 0000000..7d6a8c0 --- /dev/null +++ b/novel-crawl/src/main/java/com/java2nb/novel/core/crawl/CrawlBookHandler.java @@ -0,0 +1,12 @@ +package com.java2nb.novel.core.crawl; + +import com.java2nb.novel.entity.Book; + +/** + * 爬虫小说处理器 + * */ +public interface CrawlBookHandler { + + void handle(Book book); + +} diff --git a/novel-crawl/src/main/java/com/java2nb/novel/core/crawl/CrawlParser.java b/novel-crawl/src/main/java/com/java2nb/novel/core/crawl/CrawlParser.java index 4f3723c..3fd3d91 100644 --- a/novel-crawl/src/main/java/com/java2nb/novel/core/crawl/CrawlParser.java +++ b/novel-crawl/src/main/java/com/java2nb/novel/core/crawl/CrawlParser.java @@ -33,7 +33,7 @@ public class CrawlParser { private static final ThreadLocal retryCount = new ThreadLocal<>(); @SneakyThrows - public static Book parseBook(RuleBean ruleBean, String bookId) { + public static void parseBook(RuleBean ruleBean, String bookId, CrawlBookHandler handler) { Book book = new Book(); String bookDetailUrl = ruleBean.getBookDetailUrl().replace("{bookId}", bookId); String bookDetailHtml = getByHttpClientWithChrome(bookDetailUrl); @@ -135,10 +135,10 @@ public class CrawlParser { } } } - return book; + handler.handle(book); } - public static ChapterBean parseBookIndexAndContent(String sourceBookId, Book book, RuleBean ruleBean, Map existBookIndexMap) { + public static void parseBookIndexAndContent(String sourceBookId, Book book, RuleBean ruleBean, Map existBookIndexMap, CrawlBookChapterHandler handler) { Date currentDate = new Date(); @@ -228,7 +228,7 @@ public class CrawlParser { bookContent.setIndexId(hasIndex.getId()); //计算总字数 - totalWordCount = (totalWordCount+wordCount-hasIndex.getWordCount()); + totalWordCount = (totalWordCount + wordCount - hasIndex.getWordCount()); } else { //章节插入 //设置目录和章节内容 @@ -246,7 +246,6 @@ public class CrawlParser { bookIndex.setUpdateTime(currentDate); - } @@ -259,7 +258,7 @@ public class CrawlParser { if (indexList.size() > 0) { //如果有爬到最新章节,则设置小说主表的最新章节信息 //获取爬取到的最新章节 - BookIndex lastIndex = indexList.get(indexList.size()-1); + BookIndex lastIndex = indexList.get(indexList.size() - 1); book.setLastIndexId(lastIndex.getId()); book.setLastIndexName(lastIndex.getIndexName()); book.setLastIndexUpdateTime(currentDate); @@ -270,20 +269,22 @@ public class CrawlParser { if (indexList.size() == contentList.size() && indexList.size() > 0) { - return new ChapterBean(){{ + handler.handle(new ChapterBean() {{ setBookIndexList(indexList); setBookContentList(contentList); - }}; + }}); + + return; } } - - return new ChapterBean(){{ + handler.handle(new ChapterBean() {{ setBookIndexList(new ArrayList<>(0)); setBookContentList(new ArrayList<>(0)); - }}; + }}); + } diff --git a/novel-crawl/src/main/java/com/java2nb/novel/core/listener/StarterListener.java b/novel-crawl/src/main/java/com/java2nb/novel/core/listener/StarterListener.java index 2fb8a5a..ad57357 100644 --- a/novel-crawl/src/main/java/com/java2nb/novel/core/listener/StarterListener.java +++ b/novel-crawl/src/main/java/com/java2nb/novel/core/listener/StarterListener.java @@ -56,19 +56,21 @@ public class StarterListener implements ServletContextListener { CrawlSource source = crawlService.queryCrawlSource(needUpdateBook.getCrawlSourceId()); RuleBean ruleBean = new ObjectMapper().readValue(source.getCrawlRule(), RuleBean.class); //解析小说基本信息 - Book book = CrawlParser.parseBook(ruleBean, needUpdateBook.getCrawlBookId()); - //这里只做老书更新 - book.setId(needUpdateBook.getId()); - book.setWordCount(needUpdateBook.getWordCount()); - if (needUpdateBook.getPicUrl() != null && needUpdateBook.getPicUrl().contains(Constants.LOCAL_PIC_PREFIX)) { - //本地图片则不更新 - book.setPicUrl(null); - } - //查询已存在的章节 - Map existBookIndexMap = bookService.queryExistBookIndexMap(needUpdateBook.getId()); - //解析章节目录 - ChapterBean chapter = CrawlParser.parseBookIndexAndContent(needUpdateBook.getCrawlBookId(), book, ruleBean, existBookIndexMap); - bookService.updateBookAndIndexAndContent(book, chapter.getBookIndexList(), chapter.getBookContentList(), existBookIndexMap); + CrawlParser.parseBook(ruleBean, needUpdateBook.getCrawlBookId(),book -> { + //这里只做老书更新 + book.setId(needUpdateBook.getId()); + book.setWordCount(needUpdateBook.getWordCount()); + if (needUpdateBook.getPicUrl() != null && needUpdateBook.getPicUrl().contains(Constants.LOCAL_PIC_PREFIX)) { + //本地图片则不更新 + book.setPicUrl(null); + } + //查询已存在的章节 + Map existBookIndexMap = bookService.queryExistBookIndexMap(needUpdateBook.getId()); + //解析章节目录 + CrawlParser.parseBookIndexAndContent(needUpdateBook.getCrawlBookId(), book, ruleBean, existBookIndexMap,chapter -> { + bookService.updateBookAndIndexAndContent(book, chapter.getBookIndexList(), chapter.getBookContentList(), existBookIndexMap); + }); + }); } catch (Exception e) { log.error(e.getMessage(), e); } diff --git a/novel-crawl/src/main/java/com/java2nb/novel/service/impl/CrawlServiceImpl.java b/novel-crawl/src/main/java/com/java2nb/novel/service/impl/CrawlServiceImpl.java index db0a7cc..c9492cc 100644 --- a/novel-crawl/src/main/java/com/java2nb/novel/service/impl/CrawlServiceImpl.java +++ b/novel-crawl/src/main/java/com/java2nb/novel/service/impl/CrawlServiceImpl.java @@ -5,7 +5,6 @@ import com.github.pagehelper.PageHelper; import com.java2nb.novel.core.bean.PageBean; import com.java2nb.novel.core.cache.CacheKey; import com.java2nb.novel.core.cache.CacheService; -import com.java2nb.novel.core.crawl.ChapterBean; import com.java2nb.novel.core.crawl.CrawlParser; import com.java2nb.novel.core.crawl.RuleBean; import com.java2nb.novel.core.enums.ResponseStatus; @@ -34,6 +33,7 @@ import org.mybatis.dynamic.sql.select.render.SelectStatementProvider; import org.springframework.stereotype.Service; import java.util.*; +import java.util.concurrent.atomic.AtomicBoolean; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -274,39 +274,46 @@ public class CrawlServiceImpl implements CrawlService { @Override public boolean parseBookAndSave(int catId, RuleBean ruleBean, Integer sourceId, String bookId) { - Book book = CrawlParser.parseBook(ruleBean, bookId); - if(book.getBookName() == null || book.getAuthorName() == null){ - return false; - } - //这里只做新书入库,查询是否存在这本书 - Book existBook = bookService.queryBookByBookNameAndAuthorName(book.getBookName(), book.getAuthorName()); - //如果该小说不存在,则可以解析入库,但是标记该小说正在入库,30分钟之后才允许再次入库 - if (existBook == null) { - //没有该书,可以入库 - book.setCatId(catId); - //根据分类ID查询分类 - book.setCatName(bookService.queryCatNameByCatId(catId)); - if (catId == 7) { - //女频 - book.setWorkDirection((byte) 1); - } else { - //男频 - book.setWorkDirection((byte) 0); + + final AtomicBoolean parseResult = new AtomicBoolean(false); + + CrawlParser.parseBook(ruleBean, bookId, book -> { + if(book.getBookName() == null || book.getAuthorName() == null){ + return; } - book.setCrawlBookId(bookId); - book.setCrawlSourceId(sourceId); - book.setCrawlLastTime(new Date()); - book.setId(new IdWorker().nextId()); - //解析章节目录 - ChapterBean chapter = CrawlParser.parseBookIndexAndContent(bookId, book, ruleBean, new HashMap<>(0)); + //这里只做新书入库,查询是否存在这本书 + Book existBook = bookService.queryBookByBookNameAndAuthorName(book.getBookName(), book.getAuthorName()); + //如果该小说不存在,则可以解析入库,但是标记该小说正在入库,30分钟之后才允许再次入库 + if (existBook == null) { + //没有该书,可以入库 + book.setCatId(catId); + //根据分类ID查询分类 + book.setCatName(bookService.queryCatNameByCatId(catId)); + if (catId == 7) { + //女频 + book.setWorkDirection((byte) 1); + } else { + //男频 + book.setWorkDirection((byte) 0); + } + book.setCrawlBookId(bookId); + book.setCrawlSourceId(sourceId); + book.setCrawlLastTime(new Date()); + book.setId(new IdWorker().nextId()); + //解析章节目录 + CrawlParser.parseBookIndexAndContent(bookId, book, ruleBean, new HashMap<>(0),chapter -> { + bookService.saveBookAndIndexAndContent(book, chapter.getBookIndexList(), chapter.getBookContentList()); + }); - bookService.saveBookAndIndexAndContent(book, chapter.getBookIndexList(), chapter.getBookContentList()); + } else { + //只更新书籍的爬虫相关字段 + bookService.updateCrawlProperties(existBook.getId(), sourceId, bookId); + } + parseResult.set(true); + }); + + return parseResult.get(); - } else { - //只更新书籍的爬虫相关字段 - bookService.updateCrawlProperties(existBook.getId(), sourceId, bookId); - } - return true; } @Override