perf(novel-crawl): 增加小说内容过滤

This commit is contained in:
xiongxiaoyang 2024-06-01 09:56:07 +08:00
parent 976db9420e
commit 89992dc781
4 changed files with 53 additions and 18 deletions

View File

@ -89,14 +89,15 @@ public class CrawlParser {
} }
} }
String desc = bookDetailHtml.substring(bookDetailHtml.indexOf(ruleBean.getDescStart()) + ruleBean.getDescStart().length()); String desc = bookDetailHtml.substring(
bookDetailHtml.indexOf(ruleBean.getDescStart()) + ruleBean.getDescStart().length());
desc = desc.substring(0, desc.indexOf(ruleBean.getDescEnd())); desc = desc.substring(0, desc.indexOf(ruleBean.getDescEnd()));
//过滤掉简介中的特殊标签 //过滤掉简介中的特殊标签
desc = desc.replaceAll("<a[^<]+</a>", "") desc = desc.replaceAll("<a[^<]+</a>", "")
.replaceAll("<font[^<]+</font>", "") .replaceAll("<font[^<]+</font>", "")
.replaceAll("<p>\\s*</p>", "") .replaceAll("<p>\\s*</p>", "")
.replaceAll("<p>", "") .replaceAll("<p>", "")
.replaceAll("</p>", "<br/>"); .replaceAll("</p>", "<br/>");
//设置书籍简介 //设置书籍简介
book.setBookDesc(desc); book.setBookDesc(desc);
if (StringUtils.isNotBlank(ruleBean.getStatusPatten())) { if (StringUtils.isNotBlank(ruleBean.getStatusPatten())) {
@ -112,14 +113,16 @@ public class CrawlParser {
} }
} }
if (StringUtils.isNotBlank(ruleBean.getUpadateTimePatten()) && StringUtils.isNotBlank(ruleBean.getUpadateTimeFormatPatten())) { if (StringUtils.isNotBlank(ruleBean.getUpadateTimePatten()) && StringUtils.isNotBlank(
ruleBean.getUpadateTimeFormatPatten())) {
Pattern updateTimePatten = PatternFactory.getPattern(ruleBean.getUpadateTimePatten()); Pattern updateTimePatten = PatternFactory.getPattern(ruleBean.getUpadateTimePatten());
Matcher updateTimeMatch = updateTimePatten.matcher(bookDetailHtml); Matcher updateTimeMatch = updateTimePatten.matcher(bookDetailHtml);
boolean isFindUpdateTime = updateTimeMatch.find(); boolean isFindUpdateTime = updateTimeMatch.find();
if (isFindUpdateTime) { if (isFindUpdateTime) {
String updateTime = updateTimeMatch.group(1); String updateTime = updateTimeMatch.group(1);
//设置更新时间 //设置更新时间
book.setLastIndexUpdateTime(new SimpleDateFormat(ruleBean.getUpadateTimeFormatPatten()).parse(updateTime)); book.setLastIndexUpdateTime(
new SimpleDateFormat(ruleBean.getUpadateTimeFormatPatten()).parse(updateTime));
} }
} }
@ -141,7 +144,8 @@ public class CrawlParser {
handler.handle(book); handler.handle(book);
} }
public static boolean parseBookIndexAndContent(String sourceBookId, Book book, RuleBean ruleBean, Map<Integer, BookIndex> existBookIndexMap, CrawlBookChapterHandler handler) { public static boolean parseBookIndexAndContent(String sourceBookId, Book book, RuleBean ruleBean,
Map<Integer, BookIndex> existBookIndexMap, CrawlBookChapterHandler handler) {
Date currentDate = new Date(); Date currentDate = new Date();
@ -153,7 +157,8 @@ public class CrawlParser {
if (indexListHtml != null) { if (indexListHtml != null) {
if (StringUtils.isNotBlank(ruleBean.getBookIndexStart())) { if (StringUtils.isNotBlank(ruleBean.getBookIndexStart())) {
indexListHtml = indexListHtml.substring(indexListHtml.indexOf(ruleBean.getBookIndexStart()) + ruleBean.getBookIndexStart().length()); indexListHtml = indexListHtml.substring(
indexListHtml.indexOf(ruleBean.getBookIndexStart()) + ruleBean.getBookIndexStart().length());
} }
Pattern indexIdPatten = PatternFactory.getPattern(ruleBean.getIndexIdPatten()); Pattern indexIdPatten = PatternFactory.getPattern(ruleBean.getIndexIdPatten());
@ -174,14 +179,16 @@ public class CrawlParser {
BookIndex hasIndex = existBookIndexMap.get(indexNum); BookIndex hasIndex = existBookIndexMap.get(indexNum);
String indexName = indexNameMatch.group(1); String indexName = indexNameMatch.group(1);
if (hasIndex == null || !StringUtils.deleteWhitespace(hasIndex.getIndexName()).equals(StringUtils.deleteWhitespace(indexName))) { if (hasIndex == null || !StringUtils.deleteWhitespace(hasIndex.getIndexName())
.equals(StringUtils.deleteWhitespace(indexName))) {
String sourceIndexId = indexIdMatch.group(1); String sourceIndexId = indexIdMatch.group(1);
String bookContentUrl = ruleBean.getBookContentUrl(); String bookContentUrl = ruleBean.getBookContentUrl();
int calStart = bookContentUrl.indexOf("{cal_"); int calStart = bookContentUrl.indexOf("{cal_");
if (calStart != -1) { if (calStart != -1) {
//内容页URL需要进行计算才能得到 //内容页URL需要进行计算才能得到
String calStr = bookContentUrl.substring(calStart, calStart + bookContentUrl.substring(calStart).indexOf("}")); String calStr = bookContentUrl.substring(calStart,
calStart + bookContentUrl.substring(calStart).indexOf("}"));
String[] calArr = calStr.split("_"); String[] calArr = calStr.split("_");
int calType = Integer.parseInt(calArr[1]); int calType = Integer.parseInt(calArr[1]);
if (calType == 1) { if (calType == 1) {
@ -206,13 +213,25 @@ public class CrawlParser {
} }
String contentUrl = bookContentUrl.replace("{bookId}", sourceBookId).replace("{indexId}", sourceIndexId); String contentUrl = bookContentUrl.replace("{bookId}", sourceBookId)
.replace("{indexId}", sourceIndexId);
//查询章节内容 //查询章节内容
String contentHtml = getByHttpClientWithChrome(contentUrl); String contentHtml = getByHttpClientWithChrome(contentUrl);
if (contentHtml != null && !contentHtml.contains("正在手打中")) { if (contentHtml != null && !contentHtml.contains("正在手打中")) {
String content = contentHtml.substring(contentHtml.indexOf(ruleBean.getContentStart()) + ruleBean.getContentStart().length()); String content = contentHtml.substring(
contentHtml.indexOf(ruleBean.getContentStart()) + ruleBean.getContentStart().length());
content = content.substring(0, content.indexOf(ruleBean.getContentEnd())); content = content.substring(0, content.indexOf(ruleBean.getContentEnd()));
// 小说内容过滤
String filterContent = ruleBean.getFilterContent();
if (StringUtils.isNotBlank(filterContent)) {
String[] filterRules = filterContent.replace("\r\n", "\n").split("\n");
for (String filterRule : filterRules) {
if (StringUtils.isNotBlank(filterRule)) {
content = content.replaceAll(filterRule, "");
}
}
}
//插入章节目录和章节内容 //插入章节目录和章节内容
BookIndex bookIndex = new BookIndex(); BookIndex bookIndex = new BookIndex();
bookIndex.setIndexName(indexName); bookIndex.setIndexName(indexName);
@ -257,7 +276,6 @@ public class CrawlParser {
isFindIndex = indexIdMatch.find() & indexNameMatch.find(); isFindIndex = indexIdMatch.find() & indexNameMatch.find();
} }
if (indexList.size() > 0) { if (indexList.size() > 0) {
//如果有爬到最新章节则设置小说主表的最新章节信息 //如果有爬到最新章节则设置小说主表的最新章节信息
//获取爬取到的最新章节 //获取爬取到的最新章节

View File

@ -6,6 +6,7 @@ import java.util.Map;
/** /**
* 爬虫解析规则bean * 爬虫解析规则bean
*
* @author Administrator * @author Administrator
*/ */
@Data @Data
@ -13,17 +14,17 @@ public class RuleBean {
/** /**
* 小说更新列表url * 小说更新列表url
* */ */
private String updateBookListUrl; private String updateBookListUrl;
/** /**
* 分类列表页URL规则 * 分类列表页URL规则
* */ */
private String bookListUrl; private String bookListUrl;
private Map<String,String> catIdRule; private Map<String, String> catIdRule;
private Map<String,Byte> bookStatusRule; private Map<String, Byte> bookStatusRule;
private String bookIdPatten; private String bookIdPatten;
private String pagePatten; private String pagePatten;
@ -51,5 +52,7 @@ public class RuleBean {
private String bookIndexStart; private String bookIndexStart;
private String filterContent;
} }

View File

@ -144,6 +144,9 @@
示例<b>&lt;script&gt;</b> 示例<b>&lt;script&gt;</b>
<li><input type="text" id="contentEnd" class="s_input icon_key" <li><input type="text" id="contentEnd" class="s_input icon_key"
placeholder="小说内容结束截取字符串:"></li> placeholder="小说内容结束截取字符串:"></li>
示例<b>&lt;div\s+id=&quot;content_tip&quot;&gt;\s*&lt;b&gt;([^/]+)&lt;/b&gt;\s*&lt;/div&gt;</b>
<li><textarea id="filterContent"
placeholder="过滤内容(多个内容换行)" rows="5" cols="52"></textarea></li>
<li><input type="button" onclick="addCrawlSource()" name="btnRegister" value="提交" <li><input type="button" onclick="addCrawlSource()" name="btnRegister" value="提交"
id="btnRegister" class="btn_red"></li> id="btnRegister" class="btn_red"></li>
@ -405,6 +408,9 @@
crawlRule.contentEnd = contentEnd; crawlRule.contentEnd = contentEnd;
var filterContent = $("#filterContent").val();
crawlRule.filterContent = filterContent;
$.ajax({ $.ajax({
type: "POST", type: "POST",

View File

@ -145,6 +145,10 @@
示例<b>&lt;script&gt;</b> 示例<b>&lt;script&gt;</b>
<li><input type="text" id="contentEnd" class="s_input icon_key" <li><input type="text" id="contentEnd" class="s_input icon_key"
placeholder="小说内容结束截取字符串:"></li> placeholder="小说内容结束截取字符串:"></li>
示例<b>&lt;div\s+id=&quot;content_tip&quot;&gt;\s*&lt;b&gt;([^/]+)&lt;/b&gt;\s*&lt;/div&gt;</b>
<li><textarea id="filterContent"
placeholder="过滤内容(多个内容换行)" rows="5" cols="52"></textarea></li>
<li><input type="button" onclick="updateCrawlSource()" name="btnRegister" value="提交" <li><input type="button" onclick="updateCrawlSource()" name="btnRegister" value="提交"
id="btnRegister" class="btn_red"></li> id="btnRegister" class="btn_red"></li>
@ -269,6 +273,7 @@
$("#bookContentUrl").val(crawlRule.bookContentUrl); $("#bookContentUrl").val(crawlRule.bookContentUrl);
$("#contentStart").val(crawlRule.contentStart); $("#contentStart").val(crawlRule.contentStart);
$("#contentEnd").val(crawlRule.contentEnd); $("#contentEnd").val(crawlRule.contentEnd);
$("#filterContent").val(crawlRule.filterContent);
} }
} }
@ -488,6 +493,9 @@
crawlRule.contentEnd = contentEnd; crawlRule.contentEnd = contentEnd;
var filterContent = $("#filterContent").val();
crawlRule.filterContent = filterContent;
$.ajax({ $.ajax({
type: "POST", type: "POST",