mirror of
https://github.com/201206030/novel-plus.git
synced 2025-04-26 17:20:52 +00:00
perf(novel-crawl): 增加小说内容过滤
This commit is contained in:
parent
976db9420e
commit
89992dc781
@ -89,7 +89,8 @@ public class CrawlParser {
|
||||
}
|
||||
}
|
||||
|
||||
String desc = bookDetailHtml.substring(bookDetailHtml.indexOf(ruleBean.getDescStart()) + ruleBean.getDescStart().length());
|
||||
String desc = bookDetailHtml.substring(
|
||||
bookDetailHtml.indexOf(ruleBean.getDescStart()) + ruleBean.getDescStart().length());
|
||||
desc = desc.substring(0, desc.indexOf(ruleBean.getDescEnd()));
|
||||
//过滤掉简介中的特殊标签
|
||||
desc = desc.replaceAll("<a[^<]+</a>", "")
|
||||
@ -112,14 +113,16 @@ public class CrawlParser {
|
||||
}
|
||||
}
|
||||
|
||||
if (StringUtils.isNotBlank(ruleBean.getUpadateTimePatten()) && StringUtils.isNotBlank(ruleBean.getUpadateTimeFormatPatten())) {
|
||||
if (StringUtils.isNotBlank(ruleBean.getUpadateTimePatten()) && StringUtils.isNotBlank(
|
||||
ruleBean.getUpadateTimeFormatPatten())) {
|
||||
Pattern updateTimePatten = PatternFactory.getPattern(ruleBean.getUpadateTimePatten());
|
||||
Matcher updateTimeMatch = updateTimePatten.matcher(bookDetailHtml);
|
||||
boolean isFindUpdateTime = updateTimeMatch.find();
|
||||
if (isFindUpdateTime) {
|
||||
String updateTime = updateTimeMatch.group(1);
|
||||
//设置更新时间
|
||||
book.setLastIndexUpdateTime(new SimpleDateFormat(ruleBean.getUpadateTimeFormatPatten()).parse(updateTime));
|
||||
book.setLastIndexUpdateTime(
|
||||
new SimpleDateFormat(ruleBean.getUpadateTimeFormatPatten()).parse(updateTime));
|
||||
|
||||
}
|
||||
}
|
||||
@ -141,7 +144,8 @@ public class CrawlParser {
|
||||
handler.handle(book);
|
||||
}
|
||||
|
||||
public static boolean parseBookIndexAndContent(String sourceBookId, Book book, RuleBean ruleBean, Map<Integer, BookIndex> existBookIndexMap, CrawlBookChapterHandler handler) {
|
||||
public static boolean parseBookIndexAndContent(String sourceBookId, Book book, RuleBean ruleBean,
|
||||
Map<Integer, BookIndex> existBookIndexMap, CrawlBookChapterHandler handler) {
|
||||
|
||||
Date currentDate = new Date();
|
||||
|
||||
@ -153,7 +157,8 @@ public class CrawlParser {
|
||||
|
||||
if (indexListHtml != null) {
|
||||
if (StringUtils.isNotBlank(ruleBean.getBookIndexStart())) {
|
||||
indexListHtml = indexListHtml.substring(indexListHtml.indexOf(ruleBean.getBookIndexStart()) + ruleBean.getBookIndexStart().length());
|
||||
indexListHtml = indexListHtml.substring(
|
||||
indexListHtml.indexOf(ruleBean.getBookIndexStart()) + ruleBean.getBookIndexStart().length());
|
||||
}
|
||||
|
||||
Pattern indexIdPatten = PatternFactory.getPattern(ruleBean.getIndexIdPatten());
|
||||
@ -174,14 +179,16 @@ public class CrawlParser {
|
||||
BookIndex hasIndex = existBookIndexMap.get(indexNum);
|
||||
String indexName = indexNameMatch.group(1);
|
||||
|
||||
if (hasIndex == null || !StringUtils.deleteWhitespace(hasIndex.getIndexName()).equals(StringUtils.deleteWhitespace(indexName))) {
|
||||
if (hasIndex == null || !StringUtils.deleteWhitespace(hasIndex.getIndexName())
|
||||
.equals(StringUtils.deleteWhitespace(indexName))) {
|
||||
|
||||
String sourceIndexId = indexIdMatch.group(1);
|
||||
String bookContentUrl = ruleBean.getBookContentUrl();
|
||||
int calStart = bookContentUrl.indexOf("{cal_");
|
||||
if (calStart != -1) {
|
||||
//内容页URL需要进行计算才能得到
|
||||
String calStr = bookContentUrl.substring(calStart, calStart + bookContentUrl.substring(calStart).indexOf("}"));
|
||||
String calStr = bookContentUrl.substring(calStart,
|
||||
calStart + bookContentUrl.substring(calStart).indexOf("}"));
|
||||
String[] calArr = calStr.split("_");
|
||||
int calType = Integer.parseInt(calArr[1]);
|
||||
if (calType == 1) {
|
||||
@ -206,13 +213,25 @@ public class CrawlParser {
|
||||
|
||||
}
|
||||
|
||||
String contentUrl = bookContentUrl.replace("{bookId}", sourceBookId).replace("{indexId}", sourceIndexId);
|
||||
String contentUrl = bookContentUrl.replace("{bookId}", sourceBookId)
|
||||
.replace("{indexId}", sourceIndexId);
|
||||
|
||||
//查询章节内容
|
||||
String contentHtml = getByHttpClientWithChrome(contentUrl);
|
||||
if (contentHtml != null && !contentHtml.contains("正在手打中")) {
|
||||
String content = contentHtml.substring(contentHtml.indexOf(ruleBean.getContentStart()) + ruleBean.getContentStart().length());
|
||||
String content = contentHtml.substring(
|
||||
contentHtml.indexOf(ruleBean.getContentStart()) + ruleBean.getContentStart().length());
|
||||
content = content.substring(0, content.indexOf(ruleBean.getContentEnd()));
|
||||
// 小说内容过滤
|
||||
String filterContent = ruleBean.getFilterContent();
|
||||
if (StringUtils.isNotBlank(filterContent)) {
|
||||
String[] filterRules = filterContent.replace("\r\n", "\n").split("\n");
|
||||
for (String filterRule : filterRules) {
|
||||
if (StringUtils.isNotBlank(filterRule)) {
|
||||
content = content.replaceAll(filterRule, "");
|
||||
}
|
||||
}
|
||||
}
|
||||
//插入章节目录和章节内容
|
||||
BookIndex bookIndex = new BookIndex();
|
||||
bookIndex.setIndexName(indexName);
|
||||
@ -257,7 +276,6 @@ public class CrawlParser {
|
||||
isFindIndex = indexIdMatch.find() & indexNameMatch.find();
|
||||
}
|
||||
|
||||
|
||||
if (indexList.size() > 0) {
|
||||
//如果有爬到最新章节,则设置小说主表的最新章节信息
|
||||
//获取爬取到的最新章节
|
||||
|
@ -6,6 +6,7 @@ import java.util.Map;
|
||||
|
||||
/**
|
||||
* 爬虫解析规则bean
|
||||
*
|
||||
* @author Administrator
|
||||
*/
|
||||
@Data
|
||||
@ -13,17 +14,17 @@ public class RuleBean {
|
||||
|
||||
/**
|
||||
* 小说更新列表url
|
||||
* */
|
||||
*/
|
||||
private String updateBookListUrl;
|
||||
|
||||
/**
|
||||
* 分类列表页URL规则
|
||||
* */
|
||||
*/
|
||||
private String bookListUrl;
|
||||
|
||||
private Map<String,String> catIdRule;
|
||||
private Map<String, String> catIdRule;
|
||||
|
||||
private Map<String,Byte> bookStatusRule;
|
||||
private Map<String, Byte> bookStatusRule;
|
||||
|
||||
private String bookIdPatten;
|
||||
private String pagePatten;
|
||||
@ -51,5 +52,7 @@ public class RuleBean {
|
||||
|
||||
private String bookIndexStart;
|
||||
|
||||
private String filterContent;
|
||||
|
||||
|
||||
}
|
||||
|
@ -144,6 +144,9 @@
|
||||
示例:<b><script></b>
|
||||
<li><input type="text" id="contentEnd" class="s_input icon_key"
|
||||
placeholder="小说内容结束截取字符串:"></li>
|
||||
示例:<b><div\s+id="content_tip">\s*<b>([^/]+)</b>\s*</div></b>
|
||||
<li><textarea id="filterContent"
|
||||
placeholder="过滤内容(多个内容换行)" rows="5" cols="52"></textarea></li>
|
||||
|
||||
<li><input type="button" onclick="addCrawlSource()" name="btnRegister" value="提交"
|
||||
id="btnRegister" class="btn_red"></li>
|
||||
@ -405,6 +408,9 @@
|
||||
|
||||
crawlRule.contentEnd = contentEnd;
|
||||
|
||||
var filterContent = $("#filterContent").val();
|
||||
crawlRule.filterContent = filterContent;
|
||||
|
||||
|
||||
$.ajax({
|
||||
type: "POST",
|
||||
|
@ -145,6 +145,10 @@
|
||||
示例:<b><script></b>
|
||||
<li><input type="text" id="contentEnd" class="s_input icon_key"
|
||||
placeholder="小说内容结束截取字符串:"></li>
|
||||
示例:<b><div\s+id="content_tip">\s*<b>([^/]+)</b>\s*</div></b>
|
||||
<li><textarea id="filterContent"
|
||||
placeholder="过滤内容(多个内容换行)" rows="5" cols="52"></textarea></li>
|
||||
|
||||
|
||||
<li><input type="button" onclick="updateCrawlSource()" name="btnRegister" value="提交"
|
||||
id="btnRegister" class="btn_red"></li>
|
||||
@ -269,6 +273,7 @@
|
||||
$("#bookContentUrl").val(crawlRule.bookContentUrl);
|
||||
$("#contentStart").val(crawlRule.contentStart);
|
||||
$("#contentEnd").val(crawlRule.contentEnd);
|
||||
$("#filterContent").val(crawlRule.filterContent);
|
||||
|
||||
}
|
||||
}
|
||||
@ -488,6 +493,9 @@
|
||||
|
||||
crawlRule.contentEnd = contentEnd;
|
||||
|
||||
var filterContent = $("#filterContent").val();
|
||||
crawlRule.filterContent = filterContent;
|
||||
|
||||
|
||||
$.ajax({
|
||||
type: "POST",
|
||||
|
Loading…
x
Reference in New Issue
Block a user