增加新源,过滤简介中的特殊字符,提高爬虫兼容性

This commit is contained in:
xiongxiaoyang
2020-06-08 16:54:58 +08:00
parent f8a669eb01
commit 0e2e6229cd
6 changed files with 12 additions and 6 deletions

View File

@@ -5,7 +5,7 @@
<parent>
<artifactId>novel</artifactId>
<groupId>com.java2nb</groupId>
-<version>2.5.0</version>
+<version>2.5.1</version>
</parent>
<modelVersion>4.0.0</modelVersion>

View File

@@ -96,8 +96,12 @@ public class CrawlParser {
String desc = bookDetailHtml.substring(bookDetailHtml.indexOf(ruleBean.getDescStart()) + ruleBean.getDescStart().length());
desc = desc.substring(0, desc.indexOf(ruleBean.getDescEnd()));
-//过滤掉简介中的a标签
-desc = desc.replaceAll("<a[^<]+</a>","");
+//过滤掉简介中的特殊标签
+desc = desc.replaceAll("<a[^<]+</a>","")
+.replaceAll("<font[^<]+</font>","")
+.replaceAll("<p>\\s*</p>","")
+.replaceAll("<p>","")
+.replaceAll("</p>","<br/>");
//设置书籍简介
book.setBookDesc(desc);
if (StringUtils.isNotBlank(ruleBean.getStatusPatten())) {