From 465e03a17b23cf5066b878fc03d45778166ab38e Mon Sep 17 00:00:00 2001 From: xiongxiaoyang <773861846@qq.com> Date: Mon, 17 Jan 2022 20:58:57 +0800 Subject: [PATCH] =?UTF-8?q?perf:=20=E7=BC=93=E5=AD=98=E9=A2=84=E7=BC=96?= =?UTF-8?q?=E8=AF=91=E7=9A=84Pattern=E5=AF=B9=E8=B1=A1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../java2nb/novel/core/crawl/CrawlParser.java | 28 +++++++++-------- .../novel/core/crawl/PatternFactory.java | 30 +++++++++++++++++++ 2 files changed, 45 insertions(+), 13 deletions(-) create mode 100644 novel-crawl/src/main/java/com/java2nb/novel/core/crawl/PatternFactory.java diff --git a/novel-crawl/src/main/java/com/java2nb/novel/core/crawl/CrawlParser.java b/novel-crawl/src/main/java/com/java2nb/novel/core/crawl/CrawlParser.java index 9ec4628..192c4e4 100644 --- a/novel-crawl/src/main/java/com/java2nb/novel/core/crawl/CrawlParser.java +++ b/novel-crawl/src/main/java/com/java2nb/novel/core/crawl/CrawlParser.java @@ -1,6 +1,9 @@ package com.java2nb.novel.core.crawl; -import com.java2nb.novel.core.utils.*; +import com.java2nb.novel.core.utils.HttpUtil; +import com.java2nb.novel.core.utils.RandomBookInfoUtil; +import com.java2nb.novel.core.utils.RestTemplateUtil; +import com.java2nb.novel.core.utils.StringUtil; import com.java2nb.novel.entity.Book; import com.java2nb.novel.entity.BookContent; import com.java2nb.novel.entity.BookIndex; @@ -9,7 +12,8 @@ import io.github.xxyopen.util.IdWorker; import lombok.SneakyThrows; import lombok.extern.slf4j.Slf4j; import org.apache.commons.lang3.StringUtils; -import org.springframework.http.*; +import org.springframework.http.HttpStatus; +import org.springframework.http.ResponseEntity; import org.springframework.web.client.RestTemplate; import java.text.SimpleDateFormat; @@ -17,8 +21,6 @@ import java.util.*; import java.util.regex.Matcher; import java.util.regex.Pattern; -import static java.util.regex.Pattern.compile; - /** * 爬虫解析器 * @@ -39,14 +41,14 @@ public 
class CrawlParser { String bookDetailUrl = ruleBean.getBookDetailUrl().replace("{bookId}", bookId); String bookDetailHtml = getByHttpClientWithChrome(bookDetailUrl); if (bookDetailHtml != null) { - Pattern bookNamePatten = compile(ruleBean.getBookNamePatten()); + Pattern bookNamePatten = PatternFactory.getPattern(ruleBean.getBookNamePatten()); Matcher bookNameMatch = bookNamePatten.matcher(bookDetailHtml); boolean isFindBookName = bookNameMatch.find(); if (isFindBookName) { String bookName = bookNameMatch.group(1); //设置小说名 book.setBookName(bookName); - Pattern authorNamePatten = compile(ruleBean.getAuthorNamePatten()); + Pattern authorNamePatten = PatternFactory.getPattern(ruleBean.getAuthorNamePatten()); Matcher authorNameMatch = authorNamePatten.matcher(bookDetailHtml); boolean isFindAuthorName = authorNameMatch.find(); if (isFindAuthorName) { @@ -54,7 +56,7 @@ public class CrawlParser { //设置作者名 book.setAuthorName(authorName); if (StringUtils.isNotBlank(ruleBean.getPicUrlPatten())) { - Pattern picUrlPatten = compile(ruleBean.getPicUrlPatten()); + Pattern picUrlPatten = PatternFactory.getPattern(ruleBean.getPicUrlPatten()); Matcher picUrlMatch = picUrlPatten.matcher(bookDetailHtml); boolean isFindPicUrl = picUrlMatch.find(); if (isFindPicUrl) { @@ -67,7 +69,7 @@ public class CrawlParser { } } if (StringUtils.isNotBlank(ruleBean.getScorePatten())) { - Pattern scorePatten = compile(ruleBean.getScorePatten()); + Pattern scorePatten = PatternFactory.getPattern(ruleBean.getScorePatten()); Matcher scoreMatch = scorePatten.matcher(bookDetailHtml); boolean isFindScore = scoreMatch.find(); if (isFindScore) { @@ -77,7 +79,7 @@ public class CrawlParser { } } if (StringUtils.isNotBlank(ruleBean.getVisitCountPatten())) { - Pattern visitCountPatten = compile(ruleBean.getVisitCountPatten()); + Pattern visitCountPatten = PatternFactory.getPattern(ruleBean.getVisitCountPatten()); Matcher visitCountMatch = visitCountPatten.matcher(bookDetailHtml); boolean isFindVisitCount = 
visitCountMatch.find(); if (isFindVisitCount) { @@ -98,7 +100,7 @@ public class CrawlParser { //设置书籍简介 book.setBookDesc(desc); if (StringUtils.isNotBlank(ruleBean.getStatusPatten())) { - Pattern bookStatusPatten = compile(ruleBean.getStatusPatten()); + Pattern bookStatusPatten = PatternFactory.getPattern(ruleBean.getStatusPatten()); Matcher bookStatusMatch = bookStatusPatten.matcher(bookDetailHtml); boolean isFindBookStatus = bookStatusMatch.find(); if (isFindBookStatus) { @@ -111,7 +113,7 @@ public class CrawlParser { } if (StringUtils.isNotBlank(ruleBean.getUpadateTimePatten()) && StringUtils.isNotBlank(ruleBean.getUpadateTimeFormatPatten())) { - Pattern updateTimePatten = compile(ruleBean.getUpadateTimePatten()); + Pattern updateTimePatten = PatternFactory.getPattern(ruleBean.getUpadateTimePatten()); Matcher updateTimeMatch = updateTimePatten.matcher(bookDetailHtml); boolean isFindUpdateTime = updateTimeMatch.find(); if (isFindUpdateTime) { @@ -154,10 +156,10 @@ public class CrawlParser { indexListHtml = indexListHtml.substring(indexListHtml.indexOf(ruleBean.getBookIndexStart()) + ruleBean.getBookIndexStart().length()); } - Pattern indexIdPatten = compile(ruleBean.getIndexIdPatten()); + Pattern indexIdPatten = PatternFactory.getPattern(ruleBean.getIndexIdPatten()); Matcher indexIdMatch = indexIdPatten.matcher(indexListHtml); - Pattern indexNamePatten = compile(ruleBean.getIndexNamePatten()); + Pattern indexNamePatten = PatternFactory.getPattern(ruleBean.getIndexNamePatten()); Matcher indexNameMatch = indexNamePatten.matcher(indexListHtml); boolean isFindIndex = indexIdMatch.find() & indexNameMatch.find(); diff --git a/novel-crawl/src/main/java/com/java2nb/novel/core/crawl/PatternFactory.java b/novel-crawl/src/main/java/com/java2nb/novel/core/crawl/PatternFactory.java new file mode 100644 index 0000000..3328ec8 --- /dev/null +++ b/novel-crawl/src/main/java/com/java2nb/novel/core/crawl/PatternFactory.java @@ -0,0 +1,30 @@ +package com.java2nb.novel.core.crawl; + 
import java.util.HashMap;
import java.util.Map;
import java.util.Objects;
import java.util.concurrent.ConcurrentHashMap;
import java.util.regex.Pattern;

/**
 * Cache of compiled {@link Pattern} objects, keyed by the regular-expression source text.
 *
 * <p>Each distinct regex string is compiled once and the resulting {@code Pattern} is
 * reused for every later lookup. Entries are never removed, so the cache grows with the
 * number of distinct regex strings requested.
 *
 * <p>The cache is backed by a {@link ConcurrentHashMap}, so {@link #getPattern(String)}
 * may be called from multiple threads at the same time.
 *
 * @author xiongxiaoyang
 */
public class PatternFactory {

    /** Compiled patterns keyed by regex source; shared by all callers. */
    private static final Map<String, Pattern> PATTERN_CACHED_MAP = new ConcurrentHashMap<>();

    /**
     * Returns a precompiled {@link Pattern} for the given regular expression, compiling
     * and caching it on first use.
     *
     * @param regex the regular expression to compile; must not be {@code null}
     * @return the cached {@code Pattern} for {@code regex}; repeated calls with an equal
     *     string return the same instance
     * @throws NullPointerException if {@code regex} is {@code null}
     * @throws java.util.regex.PatternSyntaxException if {@code regex} is not a valid
     *     regular expression (nothing is cached in that case)
     */
    public static Pattern getPattern(String regex) {
        Objects.requireNonNull(regex, "regex must not be null");
        // computeIfAbsent performs the lookup-or-compile step atomically per key, so
        // concurrent callers asking for the same regex all receive one shared instance.
        return PATTERN_CACHED_MAP.computeIfAbsent(regex, Pattern::compile);
    }

}