From 4fe36a8f4fedaf5d04a4e386a5772b64adc98a46 Mon Sep 17 00:00:00 2001 From: xiaoyang Date: Sat, 24 Jul 2021 15:51:54 +0800 Subject: [PATCH] =?UTF-8?q?=E9=83=A8=E5=88=86=E7=88=AC=E8=99=AB=E4=BB=A3?= =?UTF-8?q?=E7=A0=81=E4=BC=98=E5=8C=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../java2nb/novel/core/crawl/ChapterBean.java | 25 ++++++++++++ .../java2nb/novel/core/crawl/CrawlParser.java | 39 +++++++++---------- .../java2nb/novel/core/crawl/RuleBean.java | 4 +- .../novel/core/listener/StarterListener.java | 14 ++++--- .../novel/service/impl/CrawlServiceImpl.java | 5 ++- 5 files changed, 57 insertions(+), 30 deletions(-) create mode 100644 novel-crawl/src/main/java/com/java2nb/novel/core/crawl/ChapterBean.java diff --git a/novel-crawl/src/main/java/com/java2nb/novel/core/crawl/ChapterBean.java b/novel-crawl/src/main/java/com/java2nb/novel/core/crawl/ChapterBean.java new file mode 100644 index 0000000..6eaaa9b --- /dev/null +++ b/novel-crawl/src/main/java/com/java2nb/novel/core/crawl/ChapterBean.java @@ -0,0 +1,25 @@ +package com.java2nb.novel.core.crawl; + +import com.java2nb.novel.entity.BookContent; +import com.java2nb.novel.entity.BookIndex; +import lombok.Data; + +import java.util.List; + +/** + * 章节数据封装bean + * @author Administrator + */ +@Data +public class ChapterBean { + + /** + * 章节索引集合 + * */ + List bookIndexList; + + /** + * 章节内容集合 + * */ + List bookContentList; +} diff --git a/novel-crawl/src/main/java/com/java2nb/novel/core/crawl/CrawlParser.java b/novel-crawl/src/main/java/com/java2nb/novel/core/crawl/CrawlParser.java index 505fd6b..4cce9d0 100644 --- a/novel-crawl/src/main/java/com/java2nb/novel/core/crawl/CrawlParser.java +++ b/novel-crawl/src/main/java/com/java2nb/novel/core/crawl/CrawlParser.java @@ -26,15 +26,11 @@ import static java.util.regex.Pattern.compile; @Slf4j public class CrawlParser { - private static IdWorker idWorker = new IdWorker(); + private static final IdWorker idWorker = new IdWorker(); - public static final Integer BOOK_INDEX_LIST_KEY = 1; + private static final RestTemplate restTemplate = RestTemplateUtil.getInstance("utf-8"); - public static final Integer BOOK_CONTENT_LIST_KEY = 2; - - private static RestTemplate restTemplate = RestTemplateUtil.getInstance("utf-8"); - - private static ThreadLocal retryCount = new ThreadLocal<>(); + private static final ThreadLocal retryCount = new ThreadLocal<>(); @SneakyThrows public static Book parseBook(RuleBean ruleBean, String bookId) { @@ -113,14 +109,14 @@ public class CrawlParser { } } - if (StringUtils.isNotBlank(ruleBean.getUpadateTimePatten()) && StringUtils.isNotBlank(ruleBean.getUpadateTimeFormatPatten())) { - Pattern updateTimePatten = compile(ruleBean.getUpadateTimePatten()); + if (StringUtils.isNotBlank(ruleBean.getUpdateTimePatten()) && StringUtils.isNotBlank(ruleBean.getUpdateTimeFormatPatten())) { + Pattern updateTimePatten = compile(ruleBean.getUpdateTimePatten()); Matcher updateTimeMatch = updateTimePatten.matcher(bookDetailHtml); boolean isFindUpdateTime = updateTimeMatch.find(); if (isFindUpdateTime) { String updateTime = updateTimeMatch.group(1); //设置更新时间 - book.setLastIndexUpdateTime(new SimpleDateFormat(ruleBean.getUpadateTimeFormatPatten()).parse(updateTime)); + book.setLastIndexUpdateTime(new SimpleDateFormat(ruleBean.getUpdateTimeFormatPatten()).parse(updateTime)); } } @@ -142,10 +138,7 @@ public class CrawlParser { return book; } - public static Map parseBookIndexAndContent(String sourceBookId, Book book, RuleBean ruleBean, Map hasIndexs) { - Map result = new HashMap<>(2); - result.put(BOOK_INDEX_LIST_KEY, new ArrayList(0)); - result.put(BOOK_CONTENT_LIST_KEY, new ArrayList(0)); + public static ChapterBean parseBookIndexAndContent(String sourceBookId, Book book, RuleBean ruleBean, Map existBookIndexMap) { Date currentDate = new Date(); @@ -171,11 +164,11 @@ public class CrawlParser { int indexNum = 0; //总字数 - Integer totalWordCount = book.getWordCount() == null ? 0 : book.getWordCount(); + int totalWordCount = book.getWordCount() == null ? 0 : book.getWordCount(); while (isFindIndex) { - BookIndex hasIndex = hasIndexs.get(indexNum); + BookIndex hasIndex = existBookIndexMap.get(indexNum); String indexName = indexNameMatch.group(1); if (hasIndex == null || !StringUtils.deleteWhitespace(hasIndex.getIndexName()).equals(StringUtils.deleteWhitespace(indexName))) { @@ -221,7 +214,7 @@ public class CrawlParser { BookIndex bookIndex = new BookIndex(); bookIndex.setIndexName(indexName); bookIndex.setIndexNum(indexNum); - Integer wordCount = StringUtil.getStrValidWordCount(content); + int wordCount = StringUtil.getStrValidWordCount(content); bookIndex.setWordCount(wordCount); indexList.add(bookIndex); @@ -277,15 +270,20 @@ public class CrawlParser { if (indexList.size() == contentList.size() && indexList.size() > 0) { - result.put(BOOK_INDEX_LIST_KEY, indexList); - result.put(BOOK_CONTENT_LIST_KEY, contentList); + return new ChapterBean(){{ + setBookIndexList(indexList); + setBookContentList(contentList); + }}; } } - return result; + return new ChapterBean(){{ + setBookIndexList(new ArrayList<>(0)); + setBookContentList(new ArrayList<>(0)); + }}; } @@ -294,6 +292,7 @@ public class CrawlParser { ResponseEntity forEntity = restTemplate.getForEntity(url, String.class); if (forEntity.getStatusCode() == HttpStatus.OK) { String body = forEntity.getBody(); + assert body != null; if (body.length() < Constants.INVALID_HTML_LENGTH) { return processErrorHttpResult(url); } diff --git a/novel-crawl/src/main/java/com/java2nb/novel/core/crawl/RuleBean.java b/novel-crawl/src/main/java/com/java2nb/novel/core/crawl/RuleBean.java index 5a8d319..1f519ac 100644 --- a/novel-crawl/src/main/java/com/java2nb/novel/core/crawl/RuleBean.java +++ b/novel-crawl/src/main/java/com/java2nb/novel/core/crawl/RuleBean.java @@ -37,8 +37,8 @@ public class RuleBean { private String visitCountPatten; private String descStart;; private String descEnd; - private String upadateTimePatten; - private String upadateTimeFormatPatten; + private String updateTimePatten; + private String updateTimeFormatPatten; private String bookIndexUrl; private String indexIdPatten; private String indexNamePatten; diff --git a/novel-crawl/src/main/java/com/java2nb/novel/core/listener/StarterListener.java b/novel-crawl/src/main/java/com/java2nb/novel/core/listener/StarterListener.java index d673dfd..2fb8a5a 100644 --- a/novel-crawl/src/main/java/com/java2nb/novel/core/listener/StarterListener.java +++ b/novel-crawl/src/main/java/com/java2nb/novel/core/listener/StarterListener.java @@ -1,6 +1,7 @@ package com.java2nb.novel.core.listener; import com.fasterxml.jackson.databind.ObjectMapper; +import com.java2nb.novel.core.crawl.ChapterBean; import com.java2nb.novel.core.crawl.CrawlParser; import com.java2nb.novel.core.crawl.RuleBean; import com.java2nb.novel.entity.*; @@ -16,9 +17,9 @@ import javax.servlet.ServletContextEvent; import javax.servlet.ServletContextListener; import javax.servlet.annotation.WebListener; import java.util.Date; -import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.concurrent.TimeUnit; /** * @author Administrator @@ -66,15 +67,15 @@ public class StarterListener implements ServletContextListener { //查询已存在的章节 Map existBookIndexMap = bookService.queryExistBookIndexMap(needUpdateBook.getId()); //解析章节目录 - Map indexAndContentList = CrawlParser.parseBookIndexAndContent(needUpdateBook.getCrawlBookId(), book, ruleBean, existBookIndexMap); - bookService.updateBookAndIndexAndContent(book, (List) indexAndContentList.get(CrawlParser.BOOK_INDEX_LIST_KEY), (List) indexAndContentList.get(CrawlParser.BOOK_CONTENT_LIST_KEY), existBookIndexMap); + ChapterBean chapter = CrawlParser.parseBookIndexAndContent(needUpdateBook.getCrawlBookId(), book, ruleBean, existBookIndexMap); + bookService.updateBookAndIndexAndContent(book, chapter.getBookIndexList(), chapter.getBookContentList(), existBookIndexMap); } catch (Exception e) { log.error(e.getMessage(), e); } } - - Thread.sleep(1000 * 60 * 10); + // 休眠10分钟 + TimeUnit.MINUTES.sleep(10); } catch (Exception e) { log.error(e.getMessage(), e); } @@ -107,7 +108,8 @@ public class StarterListener implements ServletContextListener { } - Thread.sleep(1000 * 60); + //休眠1分钟 + TimeUnit.MINUTES.sleep(1); } catch (Exception e) { log.error(e.getMessage(), e); diff --git a/novel-crawl/src/main/java/com/java2nb/novel/service/impl/CrawlServiceImpl.java b/novel-crawl/src/main/java/com/java2nb/novel/service/impl/CrawlServiceImpl.java index 3a757d5..0686c44 100644 --- a/novel-crawl/src/main/java/com/java2nb/novel/service/impl/CrawlServiceImpl.java +++ b/novel-crawl/src/main/java/com/java2nb/novel/service/impl/CrawlServiceImpl.java @@ -7,6 +7,7 @@ import com.github.pagehelper.PageInfo; import com.java2nb.novel.core.bean.PageBean; import com.java2nb.novel.core.cache.CacheKey; import com.java2nb.novel.core.cache.CacheService; +import com.java2nb.novel.core.crawl.ChapterBean; import com.java2nb.novel.core.crawl.CrawlParser; import com.java2nb.novel.core.crawl.RuleBean; import com.java2nb.novel.core.enums.ResponseStatus; @@ -303,9 +304,9 @@ public class CrawlServiceImpl implements CrawlService { book.setCrawlLastTime(new Date()); book.setId(new IdWorker().nextId()); //解析章节目录 - Map indexAndContentList = CrawlParser.parseBookIndexAndContent(bookId, book, ruleBean, new HashMap<>(0)); + ChapterBean chapter = CrawlParser.parseBookIndexAndContent(bookId, book, ruleBean, new HashMap<>(0)); - bookService.saveBookAndIndexAndContent(book, (List) indexAndContentList.get(CrawlParser.BOOK_INDEX_LIST_KEY), (List) indexAndContentList.get(CrawlParser.BOOK_CONTENT_LIST_KEY)); + bookService.saveBookAndIndexAndContent(book, chapter.getBookIndexList(), chapter.getBookContentList()); } else { //只更新书籍的爬虫相关字段