feat(novel-crawl): 增加小说内容过滤

This commit is contained in:
xiongxiaoyang 2024-06-01 09:56:07 +08:00
parent 976db9420e
commit 89992dc781
4 changed files with 53 additions and 18 deletions

View File

@ -89,14 +89,15 @@ public class CrawlParser {
}
}
String desc = bookDetailHtml.substring(bookDetailHtml.indexOf(ruleBean.getDescStart()) + ruleBean.getDescStart().length());
String desc = bookDetailHtml.substring(
bookDetailHtml.indexOf(ruleBean.getDescStart()) + ruleBean.getDescStart().length());
desc = desc.substring(0, desc.indexOf(ruleBean.getDescEnd()));
//过滤掉简介中的特殊标签
desc = desc.replaceAll("<a[^<]+</a>", "")
.replaceAll("<font[^<]+</font>", "")
.replaceAll("<p>\\s*</p>", "")
.replaceAll("<p>", "")
.replaceAll("</p>", "<br/>");
.replaceAll("<font[^<]+</font>", "")
.replaceAll("<p>\\s*</p>", "")
.replaceAll("<p>", "")
.replaceAll("</p>", "<br/>");
//设置书籍简介
book.setBookDesc(desc);
if (StringUtils.isNotBlank(ruleBean.getStatusPatten())) {
@ -112,14 +113,16 @@ public class CrawlParser {
}
}
if (StringUtils.isNotBlank(ruleBean.getUpadateTimePatten()) && StringUtils.isNotBlank(ruleBean.getUpadateTimeFormatPatten())) {
if (StringUtils.isNotBlank(ruleBean.getUpadateTimePatten()) && StringUtils.isNotBlank(
ruleBean.getUpadateTimeFormatPatten())) {
Pattern updateTimePatten = PatternFactory.getPattern(ruleBean.getUpadateTimePatten());
Matcher updateTimeMatch = updateTimePatten.matcher(bookDetailHtml);
boolean isFindUpdateTime = updateTimeMatch.find();
if (isFindUpdateTime) {
String updateTime = updateTimeMatch.group(1);
//设置更新时间
book.setLastIndexUpdateTime(new SimpleDateFormat(ruleBean.getUpadateTimeFormatPatten()).parse(updateTime));
book.setLastIndexUpdateTime(
new SimpleDateFormat(ruleBean.getUpadateTimeFormatPatten()).parse(updateTime));
}
}
@ -141,7 +144,8 @@ public class CrawlParser {
handler.handle(book);
}
public static boolean parseBookIndexAndContent(String sourceBookId, Book book, RuleBean ruleBean, Map<Integer, BookIndex> existBookIndexMap, CrawlBookChapterHandler handler) {
public static boolean parseBookIndexAndContent(String sourceBookId, Book book, RuleBean ruleBean,
Map<Integer, BookIndex> existBookIndexMap, CrawlBookChapterHandler handler) {
Date currentDate = new Date();
@ -153,7 +157,8 @@ public class CrawlParser {
if (indexListHtml != null) {
if (StringUtils.isNotBlank(ruleBean.getBookIndexStart())) {
indexListHtml = indexListHtml.substring(indexListHtml.indexOf(ruleBean.getBookIndexStart()) + ruleBean.getBookIndexStart().length());
indexListHtml = indexListHtml.substring(
indexListHtml.indexOf(ruleBean.getBookIndexStart()) + ruleBean.getBookIndexStart().length());
}
Pattern indexIdPatten = PatternFactory.getPattern(ruleBean.getIndexIdPatten());
@ -174,14 +179,16 @@ public class CrawlParser {
BookIndex hasIndex = existBookIndexMap.get(indexNum);
String indexName = indexNameMatch.group(1);
if (hasIndex == null || !StringUtils.deleteWhitespace(hasIndex.getIndexName()).equals(StringUtils.deleteWhitespace(indexName))) {
if (hasIndex == null || !StringUtils.deleteWhitespace(hasIndex.getIndexName())
.equals(StringUtils.deleteWhitespace(indexName))) {
String sourceIndexId = indexIdMatch.group(1);
String bookContentUrl = ruleBean.getBookContentUrl();
int calStart = bookContentUrl.indexOf("{cal_");
if (calStart != -1) {
//内容页URL需要进行计算才能得到
String calStr = bookContentUrl.substring(calStart, calStart + bookContentUrl.substring(calStart).indexOf("}"));
String calStr = bookContentUrl.substring(calStart,
calStart + bookContentUrl.substring(calStart).indexOf("}"));
String[] calArr = calStr.split("_");
int calType = Integer.parseInt(calArr[1]);
if (calType == 1) {
@ -206,13 +213,25 @@ public class CrawlParser {
}
String contentUrl = bookContentUrl.replace("{bookId}", sourceBookId).replace("{indexId}", sourceIndexId);
String contentUrl = bookContentUrl.replace("{bookId}", sourceBookId)
.replace("{indexId}", sourceIndexId);
//查询章节内容
String contentHtml = getByHttpClientWithChrome(contentUrl);
if (contentHtml != null && !contentHtml.contains("正在手打中")) {
String content = contentHtml.substring(contentHtml.indexOf(ruleBean.getContentStart()) + ruleBean.getContentStart().length());
String content = contentHtml.substring(
contentHtml.indexOf(ruleBean.getContentStart()) + ruleBean.getContentStart().length());
content = content.substring(0, content.indexOf(ruleBean.getContentEnd()));
// 小说内容过滤
String filterContent = ruleBean.getFilterContent();
if (StringUtils.isNotBlank(filterContent)) {
String[] filterRules = filterContent.replace("\r\n", "\n").split("\n");
for (String filterRule : filterRules) {
if (StringUtils.isNotBlank(filterRule)) {
content = content.replaceAll(filterRule, "");
}
}
}
//插入章节目录和章节内容
BookIndex bookIndex = new BookIndex();
bookIndex.setIndexName(indexName);
@ -257,7 +276,6 @@ public class CrawlParser {
isFindIndex = indexIdMatch.find() & indexNameMatch.find();
}
if (indexList.size() > 0) {
//如果有爬到最新章节则设置小说主表的最新章节信息
//获取爬取到的最新章节

View File

@ -6,6 +6,7 @@ import java.util.Map;
/**
* 爬虫解析规则bean
*
* @author Administrator
*/
@Data
@ -13,17 +14,17 @@ public class RuleBean {
/**
* 小说更新列表url
* */
*/
private String updateBookListUrl;
/**
* 分类列表页URL规则
* */
*/
private String bookListUrl;
private Map<String,String> catIdRule;
private Map<String, String> catIdRule;
private Map<String,Byte> bookStatusRule;
private Map<String, Byte> bookStatusRule;
private String bookIdPatten;
private String pagePatten;
@ -51,5 +52,7 @@ public class RuleBean {
private String bookIndexStart;
private String filterContent;
}

View File

@ -144,6 +144,9 @@
示例<b>&lt;script&gt;</b>
<li><input type="text" id="contentEnd" class="s_input icon_key"
placeholder="小说内容结束截取字符串:"></li>
示例<b>&lt;div\s+id=&quot;content_tip&quot;&gt;\s*&lt;b&gt;([^/]+)&lt;/b&gt;\s*&lt;/div&gt;</b>
<li><textarea id="filterContent"
placeholder="过滤内容(多个内容换行)" rows="5" cols="52"></textarea></li>
<li><input type="button" onclick="addCrawlSource()" name="btnRegister" value="提交"
id="btnRegister" class="btn_red"></li>
@ -405,6 +408,9 @@
crawlRule.contentEnd = contentEnd;
var filterContent = $("#filterContent").val();
crawlRule.filterContent = filterContent;
$.ajax({
type: "POST",

View File

@ -145,6 +145,10 @@
示例<b>&lt;script&gt;</b>
<li><input type="text" id="contentEnd" class="s_input icon_key"
placeholder="小说内容结束截取字符串:"></li>
示例<b>&lt;div\s+id=&quot;content_tip&quot;&gt;\s*&lt;b&gt;([^/]+)&lt;/b&gt;\s*&lt;/div&gt;</b>
<li><textarea id="filterContent"
placeholder="过滤内容(多个内容换行)" rows="5" cols="52"></textarea></li>
<li><input type="button" onclick="updateCrawlSource()" name="btnRegister" value="提交"
id="btnRegister" class="btn_red"></li>
@ -269,6 +273,7 @@
$("#bookContentUrl").val(crawlRule.bookContentUrl);
$("#contentStart").val(crawlRule.contentStart);
$("#contentEnd").val(crawlRule.contentEnd);
$("#filterContent").val(crawlRule.filterContent);
}
}
@ -488,6 +493,9 @@
crawlRule.contentEnd = contentEnd;
var filterContent = $("#filterContent").val();
crawlRule.filterContent = filterContent;
$.ajax({
type: "POST",