perf: cache precompiled Pattern objects

xiongxiaoyang 2022-01-17 20:58:57 +08:00
parent 094ac95428
commit 465e03a17b
2 changed files with 45 additions and 13 deletions
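For context (not part of this commit): java.util.regex.Pattern.compile() re-parses the regular expression on every call, so recompiling the same crawl-rule regexes for every book and chapter wastes CPU, which is what the cache below avoids. A minimal, hypothetical micro-benchmark sketch of recompiling versus reusing a compiled Pattern (the class name PatternReuseDemo and the sample regex are made up for illustration):

import java.util.regex.Pattern;

public class PatternReuseDemo {

    private static final String HTML = "<h1>Some Book Title</h1>";
    private static final String REGEX = "<h1>(.+?)</h1>";

    public static void main(String[] args) {
        // Variant 1: recompile the regex on every use (old CrawlParser behaviour).
        long start = System.nanoTime();
        for (int i = 0; i < 100_000; i++) {
            Pattern.compile(REGEX).matcher(HTML).find();
        }
        long recompiled = System.nanoTime() - start;

        // Variant 2: compile once and reuse the Pattern (what caching enables).
        Pattern cached = Pattern.compile(REGEX);
        start = System.nanoTime();
        for (int i = 0; i < 100_000; i++) {
            cached.matcher(HTML).find();
        }
        long reused = System.nanoTime() - start;

        System.out.printf("recompile each time: %d ms, compile once: %d ms%n",
                recompiled / 1_000_000, reused / 1_000_000);
    }
}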

View File

@@ -1,6 +1,9 @@
package com.java2nb.novel.core.crawl;
import com.java2nb.novel.core.utils.*;
import com.java2nb.novel.core.utils.HttpUtil;
import com.java2nb.novel.core.utils.RandomBookInfoUtil;
import com.java2nb.novel.core.utils.RestTemplateUtil;
import com.java2nb.novel.core.utils.StringUtil;
import com.java2nb.novel.entity.Book;
import com.java2nb.novel.entity.BookContent;
import com.java2nb.novel.entity.BookIndex;
@@ -9,7 +12,8 @@ import io.github.xxyopen.util.IdWorker;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.springframework.http.*;
import org.springframework.http.HttpStatus;
import org.springframework.http.ResponseEntity;
import org.springframework.web.client.RestTemplate;
import java.text.SimpleDateFormat;
@@ -17,8 +21,6 @@ import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import static java.util.regex.Pattern.compile;
/**
* Crawler parser
*
@@ -39,14 +41,14 @@ public class CrawlParser {
String bookDetailUrl = ruleBean.getBookDetailUrl().replace("{bookId}", bookId);
String bookDetailHtml = getByHttpClientWithChrome(bookDetailUrl);
if (bookDetailHtml != null) {
Pattern bookNamePatten = compile(ruleBean.getBookNamePatten());
Pattern bookNamePatten = PatternFactory.getPattern(ruleBean.getBookNamePatten());
Matcher bookNameMatch = bookNamePatten.matcher(bookDetailHtml);
boolean isFindBookName = bookNameMatch.find();
if (isFindBookName) {
String bookName = bookNameMatch.group(1);
// Set the novel name
book.setBookName(bookName);
Pattern authorNamePatten = compile(ruleBean.getAuthorNamePatten());
Pattern authorNamePatten = PatternFactory.getPattern(ruleBean.getAuthorNamePatten());
Matcher authorNameMatch = authorNamePatten.matcher(bookDetailHtml);
boolean isFindAuthorName = authorNameMatch.find();
if (isFindAuthorName) {
@@ -54,7 +56,7 @@ public class CrawlParser {
// Set the author name
book.setAuthorName(authorName);
if (StringUtils.isNotBlank(ruleBean.getPicUrlPatten())) {
Pattern picUrlPatten = compile(ruleBean.getPicUrlPatten());
Pattern picUrlPatten = PatternFactory.getPattern(ruleBean.getPicUrlPatten());
Matcher picUrlMatch = picUrlPatten.matcher(bookDetailHtml);
boolean isFindPicUrl = picUrlMatch.find();
if (isFindPicUrl) {
@@ -67,7 +69,7 @@ public class CrawlParser {
}
}
if (StringUtils.isNotBlank(ruleBean.getScorePatten())) {
Pattern scorePatten = compile(ruleBean.getScorePatten());
Pattern scorePatten = PatternFactory.getPattern(ruleBean.getScorePatten());
Matcher scoreMatch = scorePatten.matcher(bookDetailHtml);
boolean isFindScore = scoreMatch.find();
if (isFindScore) {
@@ -77,7 +79,7 @@ public class CrawlParser {
}
}
if (StringUtils.isNotBlank(ruleBean.getVisitCountPatten())) {
Pattern visitCountPatten = compile(ruleBean.getVisitCountPatten());
Pattern visitCountPatten = PatternFactory.getPattern(ruleBean.getVisitCountPatten());
Matcher visitCountMatch = visitCountPatten.matcher(bookDetailHtml);
boolean isFindVisitCount = visitCountMatch.find();
if (isFindVisitCount) {
@@ -98,7 +100,7 @@ public class CrawlParser {
// Set the book description
book.setBookDesc(desc);
if (StringUtils.isNotBlank(ruleBean.getStatusPatten())) {
Pattern bookStatusPatten = compile(ruleBean.getStatusPatten());
Pattern bookStatusPatten = PatternFactory.getPattern(ruleBean.getStatusPatten());
Matcher bookStatusMatch = bookStatusPatten.matcher(bookDetailHtml);
boolean isFindBookStatus = bookStatusMatch.find();
if (isFindBookStatus) {
@@ -111,7 +113,7 @@ public class CrawlParser {
}
if (StringUtils.isNotBlank(ruleBean.getUpadateTimePatten()) && StringUtils.isNotBlank(ruleBean.getUpadateTimeFormatPatten())) {
Pattern updateTimePatten = compile(ruleBean.getUpadateTimePatten());
Pattern updateTimePatten = PatternFactory.getPattern(ruleBean.getUpadateTimePatten());
Matcher updateTimeMatch = updateTimePatten.matcher(bookDetailHtml);
boolean isFindUpdateTime = updateTimeMatch.find();
if (isFindUpdateTime) {
@@ -154,10 +156,10 @@ public class CrawlParser {
indexListHtml = indexListHtml.substring(indexListHtml.indexOf(ruleBean.getBookIndexStart()) + ruleBean.getBookIndexStart().length());
}
Pattern indexIdPatten = compile(ruleBean.getIndexIdPatten());
Pattern indexIdPatten = PatternFactory.getPattern(ruleBean.getIndexIdPatten());
Matcher indexIdMatch = indexIdPatten.matcher(indexListHtml);
Pattern indexNamePatten = compile(ruleBean.getIndexNamePatten());
Pattern indexNamePatten = PatternFactory.getPattern(ruleBean.getIndexNamePatten());
Matcher indexNameMatch = indexNamePatten.matcher(indexListHtml);
boolean isFindIndex = indexIdMatch.find() & indexNameMatch.find();

View File

@@ -0,0 +1,30 @@
package com.java2nb.novel.core.crawl;
import java.util.HashMap;
import java.util.Map;
import java.util.Objects;
import java.util.regex.Pattern;
/**
* @author xiongxiaoyang
*/
public class PatternFactory {
private static final Map<String, Pattern> PATTERN_CACHED_MAP = new HashMap<>();
/**
* Get a precompiled Pattern object for the given regular expression
*/
public static Pattern getPattern(String regex) {
Pattern pattern = PATTERN_CACHED_MAP.get(regex);
if (Objects.isNull(pattern)) {
pattern = Pattern.compile(regex);
PATTERN_CACHED_MAP.put(regex, pattern);
}
return pattern;
}
}
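Note, not part of the commit: PATTERN_CACHED_MAP is a plain HashMap with unsynchronized get/put. If getPattern is ever called from multiple crawler threads (an assumption, not verified here), those calls can race. A minimal thread-safe sketch under that assumption, based on ConcurrentHashMap.computeIfAbsent (the class name ConcurrentPatternFactory is hypothetical):

package com.java2nb.novel.core.crawl;

import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.regex.Pattern;

/**
 * Thread-safe sketch of the pattern cache; assumes concurrent callers.
 */
public class ConcurrentPatternFactory {

    private static final Map<String, Pattern> PATTERN_CACHED_MAP = new ConcurrentHashMap<>();

    /**
     * Get a precompiled Pattern for the given regular expression; each distinct
     * regex is compiled at most once, even under concurrent access.
     */
    public static Pattern getPattern(String regex) {
        // computeIfAbsent compiles the regex only on a cache miss, atomically per key.
        return PATTERN_CACHED_MAP.computeIfAbsent(regex, Pattern::compile);
    }
}

Call sites in CrawlParser would look the same apart from the class name, e.g. Pattern bookNamePatten = ConcurrentPatternFactory.getPattern(ruleBean.getBookNamePatten());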