mirror of
https://github.com/201206030/novel-plus.git
synced 2025-04-26 17:20:52 +00:00
增加新源,过滤简介中的特殊字符,提高爬虫兼容性
This commit is contained in:
parent
f8a669eb01
commit
0e2e6229cd
@ -5,7 +5,7 @@
|
||||
<parent>
|
||||
<artifactId>novel</artifactId>
|
||||
<groupId>com.java2nb</groupId>
|
||||
<version>2.5.0</version>
|
||||
<version>2.5.1</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
|
@ -5,7 +5,7 @@
|
||||
<parent>
|
||||
<artifactId>novel</artifactId>
|
||||
<groupId>com.java2nb</groupId>
|
||||
<version>2.5.0</version>
|
||||
<version>2.5.1</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
|
@ -96,8 +96,12 @@ public class CrawlParser {
|
||||
|
||||
String desc = bookDetailHtml.substring(bookDetailHtml.indexOf(ruleBean.getDescStart()) + ruleBean.getDescStart().length());
|
||||
desc = desc.substring(0, desc.indexOf(ruleBean.getDescEnd()));
|
||||
//过滤掉简介中的a标签
|
||||
desc = desc.replaceAll("<a[^<]+</a>","");
|
||||
//过滤掉简介中的特殊标签
|
||||
desc = desc.replaceAll("<a[^<]+</a>","")
|
||||
.replaceAll("<font[^<]+</font>","")
|
||||
.replaceAll("<p>\\s*</p>","")
|
||||
.replaceAll("<p>","")
|
||||
.replaceAll("</p>","<br/>");
|
||||
//设置书籍简介
|
||||
book.setBookDesc(desc);
|
||||
if (StringUtils.isNotBlank(ruleBean.getStatusPatten())) {
|
||||
|
@ -5,7 +5,7 @@
|
||||
<parent>
|
||||
<artifactId>novel</artifactId>
|
||||
<groupId>com.java2nb</groupId>
|
||||
<version>2.5.0</version>
|
||||
<version>2.5.1</version>
|
||||
</parent>
|
||||
<modelVersion>4.0.0</modelVersion>
|
||||
|
||||
|
2
pom.xml
2
pom.xml
@ -5,7 +5,7 @@
|
||||
|
||||
<groupId>com.java2nb</groupId>
|
||||
<artifactId>novel</artifactId>
|
||||
<version>2.5.0</version>
|
||||
<version>2.5.1</version>
|
||||
<modules>
|
||||
<module>novel-common</module>
|
||||
<module>novel-front</module>
|
||||
|
2
sql/20200608.sql
Normal file
2
sql/20200608.sql
Normal file
@ -0,0 +1,2 @@
|
||||
INSERT INTO `crawl_source` (`id`, `source_name`, `crawl_rule`, `source_status`, `create_time`, `update_time`) VALUES
|
||||
(6, '新笔趣阁', '{\n \"bookListUrl\": \"http://www.xbiquge.la/fenlei/{catId}_{page}.html\",\n \"catIdRule\": {\n \"catId1\": \"1\",\n \"catId2\": \"2\",\n \"catId3\": \"3\",\n \"catId4\": \"4\",\n \"catId5\": \"6\",\n \"catId6\": \"5\"\n },\n \"bookIdPatten\": \"<a\\\\s+href=\\\"http://www.xbiquge.la/(\\\\d+/\\\\d+)/\\\"\\\\s+target=\\\"_blank\\\">\",\n \"pagePatten\": \"<em\\\\s+id=\\\"pagestats\\\">(\\\\d+)/\\\\d+</em>\",\n \"totalPagePatten\": \"<em\\\\s+id=\\\"pagestats\\\">\\\\d+/(\\\\d+)</em>\",\n \"bookDetailUrl\": \"http://www.xbiquge.la/{bookId}/\",\n \"bookNamePatten\": \"<h1>([^/]+)</h1>\",\n \"authorNamePatten\": \"者:([^/]+)</p>\",\n \"picUrlPatten\": \"src=\\\"(http://www.xbiquge.la/files/article/image/\\\\d+/\\\\d+/\\\\d+s\\\\.jpg)\\\"\",\n \"bookStatusRule\": {},\n \"descStart\": \"<div id=\\\"intro\\\">\",\n \"descEnd\": \"</div>\",\n \"upadateTimePatten\": \"<p>最后更新:(\\\\d+-\\\\d+-\\\\d+\\\\s\\\\d+:\\\\d+:\\\\d+)</p>\",\n \"upadateTimeFormatPatten\": \"yyyy-MM-dd HH:mm:ss\",\n \"bookIndexUrl\": \"http://www.xbiquge.la/{bookId}/\",\n \"indexIdPatten\": \"<a\\\\s+href=\'/\\\\d+/\\\\d+/(\\\\d+)\\\\.html\'\\\\s+>[^/]+</a>\",\n \"indexNamePatten\": \"<a\\\\s+href=\'/\\\\d+/\\\\d+/\\\\d+\\\\.html\'\\\\s+>([^/]+)</a>\",\n \"bookContentUrl\": \"http://www.xbiquge.la/{bookId}/{indexId}.html\",\n \"contentStart\": \"<div id=\\\"content\\\">\",\n \"contentEnd\": \"<p>\"\n}', 0, '2020-05-23 22:46:58', '2020-05-23 22:46:58');
|
Loading…
x
Reference in New Issue
Block a user