From 89992dc781d8189f25ec58bca0558e22ae3dafff Mon Sep 17 00:00:00 2001 From: xiongxiaoyang <1179705413@qq.com> Date: Sat, 1 Jun 2024 09:56:07 +0800 Subject: [PATCH] =?UTF-8?q?perf(novel-crawl):=20=E5=A2=9E=E5=8A=A0?= =?UTF-8?q?=E5=B0=8F=E8=AF=B4=E5=86=85=E5=AE=B9=E8=BF=87=E6=BB=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../java2nb/novel/core/crawl/CrawlParser.java | 46 +++++++++++++------ .../java2nb/novel/core/crawl/RuleBean.java | 11 +++-- .../templates/crawl/crawlSource_add.html | 6 +++ .../templates/crawl/crawlSource_update.html | 8 ++++ 4 files changed, 53 insertions(+), 18 deletions(-) diff --git a/novel-crawl/src/main/java/com/java2nb/novel/core/crawl/CrawlParser.java b/novel-crawl/src/main/java/com/java2nb/novel/core/crawl/CrawlParser.java index 192c4e4..7a76987 100644 --- a/novel-crawl/src/main/java/com/java2nb/novel/core/crawl/CrawlParser.java +++ b/novel-crawl/src/main/java/com/java2nb/novel/core/crawl/CrawlParser.java @@ -89,14 +89,15 @@ public class CrawlParser { } } - String desc = bookDetailHtml.substring(bookDetailHtml.indexOf(ruleBean.getDescStart()) + ruleBean.getDescStart().length()); + String desc = bookDetailHtml.substring( + bookDetailHtml.indexOf(ruleBean.getDescStart()) + ruleBean.getDescStart().length()); desc = desc.substring(0, desc.indexOf(ruleBean.getDescEnd())); //过滤掉简介中的特殊标签 desc = desc.replaceAll("", "") - .replaceAll("", "") - .replaceAll("

\\s*

", "") - .replaceAll("

", "") - .replaceAll("

", "
"); + .replaceAll("", "") + .replaceAll("

\\s*

", "") + .replaceAll("

", "") + .replaceAll("

", "
"); //设置书籍简介 book.setBookDesc(desc); if (StringUtils.isNotBlank(ruleBean.getStatusPatten())) { @@ -112,14 +113,16 @@ public class CrawlParser { } } - if (StringUtils.isNotBlank(ruleBean.getUpadateTimePatten()) && StringUtils.isNotBlank(ruleBean.getUpadateTimeFormatPatten())) { + if (StringUtils.isNotBlank(ruleBean.getUpadateTimePatten()) && StringUtils.isNotBlank( + ruleBean.getUpadateTimeFormatPatten())) { Pattern updateTimePatten = PatternFactory.getPattern(ruleBean.getUpadateTimePatten()); Matcher updateTimeMatch = updateTimePatten.matcher(bookDetailHtml); boolean isFindUpdateTime = updateTimeMatch.find(); if (isFindUpdateTime) { String updateTime = updateTimeMatch.group(1); //设置更新时间 - book.setLastIndexUpdateTime(new SimpleDateFormat(ruleBean.getUpadateTimeFormatPatten()).parse(updateTime)); + book.setLastIndexUpdateTime( + new SimpleDateFormat(ruleBean.getUpadateTimeFormatPatten()).parse(updateTime)); } } @@ -141,7 +144,8 @@ public class CrawlParser { handler.handle(book); } - public static boolean parseBookIndexAndContent(String sourceBookId, Book book, RuleBean ruleBean, Map existBookIndexMap, CrawlBookChapterHandler handler) { + public static boolean parseBookIndexAndContent(String sourceBookId, Book book, RuleBean ruleBean, + Map existBookIndexMap, CrawlBookChapterHandler handler) { Date currentDate = new Date(); @@ -153,7 +157,8 @@ public class CrawlParser { if (indexListHtml != null) { if (StringUtils.isNotBlank(ruleBean.getBookIndexStart())) { - indexListHtml = indexListHtml.substring(indexListHtml.indexOf(ruleBean.getBookIndexStart()) + ruleBean.getBookIndexStart().length()); + indexListHtml = indexListHtml.substring( + indexListHtml.indexOf(ruleBean.getBookIndexStart()) + ruleBean.getBookIndexStart().length()); } Pattern indexIdPatten = PatternFactory.getPattern(ruleBean.getIndexIdPatten()); @@ -174,14 +179,16 @@ public class CrawlParser { BookIndex hasIndex = existBookIndexMap.get(indexNum); String indexName = indexNameMatch.group(1); - if (hasIndex == null || !StringUtils.deleteWhitespace(hasIndex.getIndexName()).equals(StringUtils.deleteWhitespace(indexName))) { + if (hasIndex == null || !StringUtils.deleteWhitespace(hasIndex.getIndexName()) + .equals(StringUtils.deleteWhitespace(indexName))) { String sourceIndexId = indexIdMatch.group(1); String bookContentUrl = ruleBean.getBookContentUrl(); int calStart = bookContentUrl.indexOf("{cal_"); if (calStart != -1) { //内容页URL需要进行计算才能得到 - String calStr = bookContentUrl.substring(calStart, calStart + bookContentUrl.substring(calStart).indexOf("}")); + String calStr = bookContentUrl.substring(calStart, + calStart + bookContentUrl.substring(calStart).indexOf("}")); String[] calArr = calStr.split("_"); int calType = Integer.parseInt(calArr[1]); if (calType == 1) { @@ -206,13 +213,25 @@ public class CrawlParser { } - String contentUrl = bookContentUrl.replace("{bookId}", sourceBookId).replace("{indexId}", sourceIndexId); + String contentUrl = bookContentUrl.replace("{bookId}", sourceBookId) + .replace("{indexId}", sourceIndexId); //查询章节内容 String contentHtml = getByHttpClientWithChrome(contentUrl); if (contentHtml != null && !contentHtml.contains("正在手打中")) { - String content = contentHtml.substring(contentHtml.indexOf(ruleBean.getContentStart()) + ruleBean.getContentStart().length()); + String content = contentHtml.substring( + contentHtml.indexOf(ruleBean.getContentStart()) + ruleBean.getContentStart().length()); content = content.substring(0, content.indexOf(ruleBean.getContentEnd())); + // 小说内容过滤 + String filterContent = ruleBean.getFilterContent(); + if (StringUtils.isNotBlank(filterContent)) { + String[] filterRules = filterContent.replace("\r\n", "\n").split("\n"); + for (String filterRule : filterRules) { + if (StringUtils.isNotBlank(filterRule)) { + content = content.replaceAll(filterRule, ""); + } + } + } //插入章节目录和章节内容 BookIndex bookIndex = new BookIndex(); bookIndex.setIndexName(indexName); @@ -257,7 +276,6 @@ public class CrawlParser { isFindIndex = indexIdMatch.find() & indexNameMatch.find(); } - if (indexList.size() > 0) { //如果有爬到最新章节,则设置小说主表的最新章节信息 //获取爬取到的最新章节 diff --git a/novel-crawl/src/main/java/com/java2nb/novel/core/crawl/RuleBean.java b/novel-crawl/src/main/java/com/java2nb/novel/core/crawl/RuleBean.java index 6b42685..58a4efb 100644 --- a/novel-crawl/src/main/java/com/java2nb/novel/core/crawl/RuleBean.java +++ b/novel-crawl/src/main/java/com/java2nb/novel/core/crawl/RuleBean.java @@ -6,6 +6,7 @@ import java.util.Map; /** * 爬虫解析规则bean + * * @author Administrator */ @Data @@ -13,17 +14,17 @@ public class RuleBean { /** * 小说更新列表url - * */ + */ private String updateBookListUrl; /** * 分类列表页URL规则 - * */ + */ private String bookListUrl; - private Map catIdRule; + private Map catIdRule; - private Map bookStatusRule; + private Map bookStatusRule; private String bookIdPatten; private String pagePatten; @@ -51,5 +52,7 @@ public class RuleBean { private String bookIndexStart; + private String filterContent; + } diff --git a/novel-crawl/src/main/resources/templates/crawl/crawlSource_add.html b/novel-crawl/src/main/resources/templates/crawl/crawlSource_add.html index 2d7dd83..5162663 100644 --- a/novel-crawl/src/main/resources/templates/crawl/crawlSource_add.html +++ b/novel-crawl/src/main/resources/templates/crawl/crawlSource_add.html @@ -144,6 +144,9 @@ 示例:<script>
  • + 示例:<div\s+id="content_tip">\s*<b>([^/]+)</b>\s*</div> +
  • @@ -405,6 +408,9 @@ crawlRule.contentEnd = contentEnd; + var filterContent = $("#filterContent").val(); + crawlRule.filterContent = filterContent; + $.ajax({ type: "POST", diff --git a/novel-crawl/src/main/resources/templates/crawl/crawlSource_update.html b/novel-crawl/src/main/resources/templates/crawl/crawlSource_update.html index edf0587..08dd048 100644 --- a/novel-crawl/src/main/resources/templates/crawl/crawlSource_update.html +++ b/novel-crawl/src/main/resources/templates/crawl/crawlSource_update.html @@ -145,6 +145,10 @@ 示例:<script>
  • + 示例:<div\s+id="content_tip">\s*<b>([^/]+)</b>\s*</div> +
  • +
  • @@ -269,6 +273,7 @@ $("#bookContentUrl").val(crawlRule.bookContentUrl); $("#contentStart").val(crawlRule.contentStart); $("#contentEnd").val(crawlRule.contentEnd); + $("#filterContent").val(crawlRule.filterContent); } } @@ -488,6 +493,9 @@ crawlRule.contentEnd = contentEnd; + var filterContent = $("#filterContent").val(); + crawlRule.filterContent = filterContent; + $.ajax({ type: "POST",