From eac0ce9302f31ebcf0ad33680b2cfbc47bd78951 Mon Sep 17 00:00:00 2001 From: xiongxiaoyang <773861846@qq.com> Date: Thu, 19 Dec 2019 11:26:56 +0800 Subject: [PATCH] =?UTF-8?q?=E7=88=AC=E8=99=AB=E8=87=AA=E5=8A=A8=E6=9B=B4?= =?UTF-8?q?=E6=96=B0=E7=A8=8B=E5=BA=8F=E4=BC=98=E5=8C=96=EF=BC=8C=E5=A2=9E?= =?UTF-8?q?=E5=8A=A0=E8=87=AA=E5=8A=A8=E4=BF=AE=E5=A4=8D=E9=94=99=E8=AF=AF?= =?UTF-8?q?=E7=AB=A0=E8=8A=82?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../service/impl/BookCrawlServiceImpl.java | 128 ++++++++-------- .../books/core/crawl/BiquCrawlSource.java | 144 +++++++++--------- .../books/service/BookService.java | 36 +++-- .../src/main/resources/application-crawl.yml | 2 +- script/crawlbook/application.yml | 2 +- script/crawlbook/crawl-book-1.0-SNAPSHOT.jar | Bin 24677825 -> 24677988 bytes 6 files changed, 158 insertions(+), 154 deletions(-) diff --git a/novel-admin/src/main/java/com/java2nb/books/service/impl/BookCrawlServiceImpl.java b/novel-admin/src/main/java/com/java2nb/books/service/impl/BookCrawlServiceImpl.java index 23bfee5..4c3a7a2 100644 --- a/novel-admin/src/main/java/com/java2nb/books/service/impl/BookCrawlServiceImpl.java +++ b/novel-admin/src/main/java/com/java2nb/books/service/impl/BookCrawlServiceImpl.java @@ -522,88 +522,85 @@ public class BookCrawlServiceImpl implements BookCrawlService { if (picMather.find()) { String picSrc = picMather.group(1); - Pattern descPatten = compile("class=\"review\">([^<]+)

"); - Matcher descMatch = descPatten.matcher(body); - if (descMatch.find()) { - String desc = descMatch.group(1); + String desc = body.substring(body.indexOf("

") + "

".length()); + desc = desc.substring(0, desc.indexOf("

")); - BookDO book = new BookDO(); - book.setAuthor(author); - book.setCatid(catNum); - book.setBookDesc(desc); - book.setBookName(bookName); - book.setScore(score > 10 ? 8.0f : score); - book.setPicUrl(picSrc); - book.setBookStatus(status); - book.setUpdateTime(updateTime); + BookDO book = new BookDO(); + book.setAuthor(author); + book.setCatid(catNum); + book.setBookDesc(desc); + book.setBookName(bookName); + book.setScore(score > 10 ? 8.0f : score); + book.setPicUrl(picSrc); + book.setBookStatus(status); + book.setUpdateTime(updateTime); - List indexList = new ArrayList<>(); - List contentList = new ArrayList<>(); + List indexList = new ArrayList<>(); + List contentList = new ArrayList<>(); - //读取目录 - Pattern indexPatten = compile("查看完整目录"); - Matcher indexMatch = indexPatten.matcher(body); - if (indexMatch.find()) { - String indexUrl = baseUrl + indexMatch.group(1); - String body2 = getByTemplate(indexUrl); - if (body2 != null) { - Pattern indexListPatten = compile("([^/]+)"); - Matcher indexListMatch = indexListPatten.matcher(body2); + //读取目录 + Pattern indexPatten = compile("查看完整目录"); + Matcher indexMatch = indexPatten.matcher(body); + if (indexMatch.find()) { + String indexUrl = baseUrl + indexMatch.group(1); + String body2 = getByTemplate(indexUrl); + if (body2 != null) { + Pattern indexListPatten = compile("([^/]+)"); + Matcher indexListMatch = indexListPatten.matcher(body2); - boolean isFindIndex = indexListMatch.find(); + boolean isFindIndex = indexListMatch.find(); - int indexNum = 0; - //查询该书籍已存在目录号 - List hasIndexNum = queryIndexCountByBookNameAndBAuthor(bookName, author); + int indexNum = 0; + //查询该书籍已存在目录号 + List hasIndexNum = queryIndexCountByBookNameAndBAuthor(bookName, author); - while (isFindIndex) { - if (isInteruptBiquTaCrawl) { - return; - } - - if (!hasIndexNum.contains(indexNum)) { - - String contentUrl = baseUrl + indexListMatch.group(1); - String indexName = indexListMatch.group(2); - - - //查询章节内容 - String body3 = getByTemplate(contentUrl.replace("//m.","//www.")); - if (body3 != null) { - String start = "id=\"content\">"; - String end = "