mirror of
https://github.com/201206030/novel-plus.git
synced 2025-06-24 04:46:37 +00:00
增加新源,过滤简介中的特殊字符,提高爬虫兼容性
This commit is contained in:
@ -5,7 +5,7 @@
|
||||
<parent>
|
||||
<artifactId>novel</artifactId>
|
||||
<groupId>com.java2nb</groupId>
|
||||
<version>2.5.0</version>
|
||||
<version>2.5.1</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
|
@ -96,8 +96,12 @@ public class CrawlParser {
|
||||
|
||||
String desc = bookDetailHtml.substring(bookDetailHtml.indexOf(ruleBean.getDescStart()) + ruleBean.getDescStart().length());
|
||||
desc = desc.substring(0, desc.indexOf(ruleBean.getDescEnd()));
|
||||
//过滤掉简介中的a标签
|
||||
desc = desc.replaceAll("<a[^<]+</a>","");
|
||||
//过滤掉简介中的特殊标签
|
||||
desc = desc.replaceAll("<a[^<]+</a>","")
|
||||
.replaceAll("<font[^<]+</font>","")
|
||||
.replaceAll("<p>\\s*</p>","")
|
||||
.replaceAll("<p>","")
|
||||
.replaceAll("</p>","<br/>");
|
||||
//设置书籍简介
|
||||
book.setBookDesc(desc);
|
||||
if (StringUtils.isNotBlank(ruleBean.getStatusPatten())) {
|
||||
|
Reference in New Issue
Block a user