From 0c6c7ba8f335d9324501d8d33311eeb85089f5b5 Mon Sep 17 00:00:00 2001 From: xiongxiaoyang <773861846@qq.com> Date: Mon, 13 Jan 2020 11:28:59 +0800 Subject: [PATCH] =?UTF-8?q?=E6=9B=B4=E6=96=B0=E7=AD=96=E7=95=A5=E8=B0=83?= =?UTF-8?q?=E6=95=B4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../books/core/crawl/BaseCrawlSource.java | 6 + .../books/core/crawl/BiquCrawlSource.java | 340 +++++++----- .../books/core/utils/Constants.java | 5 + .../books/mapper/BookParseLogMapper.java | 30 + .../zinglizingli/books/po/BookParseLog.java | 55 ++ .../books/po/BookParseLogExample.java | 521 ++++++++++++++++++ .../books/service/BookService.java | 46 +- .../books/web/IndexController.java | 2 +- .../resources/mybatis/generatorConfig.xml | 8 +- .../mybatis/mapping/BookParseLogMapper.xml | 211 +++++++ 10 files changed, 1063 insertions(+), 161 deletions(-) create mode 100644 novel-front/src/main/java/xyz/zinglizingli/books/mapper/BookParseLogMapper.java create mode 100644 novel-front/src/main/java/xyz/zinglizingli/books/po/BookParseLog.java create mode 100644 novel-front/src/main/java/xyz/zinglizingli/books/po/BookParseLogExample.java create mode 100644 novel-front/src/main/resources/mybatis/mapping/BookParseLogMapper.xml diff --git a/novel-front/src/main/java/xyz/zinglizingli/books/core/crawl/BaseCrawlSource.java b/novel-front/src/main/java/xyz/zinglizingli/books/core/crawl/BaseCrawlSource.java index 3096606..c6c92a3 100644 --- a/novel-front/src/main/java/xyz/zinglizingli/books/core/crawl/BaseCrawlSource.java +++ b/novel-front/src/main/java/xyz/zinglizingli/books/core/crawl/BaseCrawlSource.java @@ -17,4 +17,10 @@ public abstract class BaseCrawlSource { * 解析数据 * */ public abstract void parse(); + + + /** + * 更新书籍 + * */ + public abstract void update(); } diff --git a/novel-front/src/main/java/xyz/zinglizingli/books/core/crawl/BiquCrawlSource.java b/novel-front/src/main/java/xyz/zinglizingli/books/core/crawl/BiquCrawlSource.java index 78ba6e5..95b7272 100644 --- a/novel-front/src/main/java/xyz/zinglizingli/books/core/crawl/BiquCrawlSource.java +++ b/novel-front/src/main/java/xyz/zinglizingli/books/core/crawl/BiquCrawlSource.java @@ -3,9 +3,12 @@ package xyz.zinglizingli.books.core.crawl; import lombok.extern.slf4j.Slf4j; import org.apache.commons.lang3.StringUtils; import org.springframework.beans.factory.annotation.Autowired; +import xyz.zinglizingli.books.core.utils.Constants; +import xyz.zinglizingli.books.mapper.BookParseLogMapper; import xyz.zinglizingli.books.po.Book; import xyz.zinglizingli.books.po.BookContent; import xyz.zinglizingli.books.po.BookIndex; +import xyz.zinglizingli.books.po.BookParseLog; import xyz.zinglizingli.books.service.BookService; import xyz.zinglizingli.books.core.utils.CatUtil; import xyz.zinglizingli.common.utils.ExcutorUtils; @@ -34,177 +37,208 @@ public class BiquCrawlSource extends BaseHtmlCrawlSource { @Override public void parse() { - String catBookListUrl = getListPageUrl().replace("{0}", "0").replace("{1}", "1"); - String forObject = RestTemplateUtil.getBodyByUtf8(catBookListUrl); - if (forObject != null) { - //解析第一页书籍的数据 - Pattern bookPatten = compile(getBookUrlPattern()); + for(int page = 1; page<= Constants.UPDATE_PAGES_ONCE; page++) { + String catBookListUrl = getListPageUrl().replace("{0}", "0").replace("{1}", page+""); + String forObject = RestTemplateUtil.getBodyByUtf8(catBookListUrl); + if (forObject != null) { + //解析第一页书籍的数据 + Pattern bookPatten = compile(getBookUrlPattern()); - Matcher bookMatcher = bookPatten.matcher(forObject); + Matcher bookMatcher = bookPatten.matcher(forObject); - boolean isFind = bookMatcher.find(); - Pattern scorePatten = compile(getScorePattern()); - Matcher scoreMatch = scorePatten.matcher(forObject); - boolean scoreFind = scoreMatch.find(); + boolean isFind = bookMatcher.find(); + Pattern scorePatten = compile(getScorePattern()); + Matcher scoreMatch = scorePatten.matcher(forObject); + boolean scoreFind = scoreMatch.find(); - Pattern bookNamePatten = compile(getBookNamePattern()); + Pattern bookNamePatten = compile(getBookNamePattern()); - Matcher bookNameMatch = bookNamePatten.matcher(forObject); + Matcher bookNameMatch = bookNamePatten.matcher(forObject); - boolean isBookNameMatch = bookNameMatch.find(); + boolean isBookNameMatch = bookNameMatch.find(); - while (isFind && scoreFind && isBookNameMatch) { - - try { - Float score = Float.parseFloat(scoreMatch.group(1)); - - if (score < getLowestScore()) { - continue; - } - - String bokNum = bookMatcher.group(1); - String bookUrl = getIndexUrl() + "/" + bokNum + "/"; - - String body = RestTemplateUtil.getBodyByUtf8(bookUrl); - if (body != null) { - - String bookName = bookNameMatch.group(1); - Pattern authorPatten = compile(getAuthorPattern()); - Matcher authoreMatch = authorPatten.matcher(body); - if (authoreMatch.find()) { - String author = authoreMatch.group(1); - - Pattern statusPatten = compile(getStatusPattern()); - Matcher statusMatch = statusPatten.matcher(body); - if (statusMatch.find()) { - String status = statusMatch.group(1); - - Pattern catPatten = compile(getCatPattern()); - Matcher catMatch = catPatten.matcher(body); - if (catMatch.find()) { - String catName = catMatch.group(1); - int catNum = CatUtil.getCatNum(catName); - - - Pattern updateTimePatten = compile(getUpdateTimePattern()); - Matcher updateTimeMatch = updateTimePatten.matcher(body); - if (updateTimeMatch.find()) { - String updateTimeStr = updateTimeMatch.group(1); - SimpleDateFormat format = new SimpleDateFormat("yy-MM-dd HH:mm:ss"); - Date updateTime = format.parse(updateTimeStr); - Pattern picPatten = compile(getPicPattern()); - Matcher picMather = picPatten.matcher(body); - if (picMather.find()) { - String picSrc = picMather.group(1); - String desc = body.substring(body.indexOf("
") + "
".length()); - desc = desc.substring(0, desc.indexOf("
")); - - - Book book = new Book(); - book.setAuthor(author); - book.setCatid(catNum); - book.setBookDesc(desc); - book.setBookName(bookName); - book.setScore(score > 10 ? 8.0f : score); - book.setPicUrl(picSrc); - book.setBookStatus(status); - book.setUpdateTime(updateTime); - - List