diff --git a/novel-common/pom.xml b/novel-common/pom.xml index e038c63..72f63ad 100644 --- a/novel-common/pom.xml +++ b/novel-common/pom.xml @@ -5,7 +5,7 @@ novel com.java2nb - 2.0.1 + 2.0.2 4.0.0 diff --git a/novel-crawl/pom.xml b/novel-crawl/pom.xml index d868199..eb7bf43 100644 --- a/novel-crawl/pom.xml +++ b/novel-crawl/pom.xml @@ -5,7 +5,7 @@ novel com.java2nb - 2.0.1 + 2.0.2 4.0.0 diff --git a/novel-crawl/src/main/java/com/java2nb/novel/core/crawl/CrawlParser.java b/novel-crawl/src/main/java/com/java2nb/novel/core/crawl/CrawlParser.java index c02195b..8ba13ee 100644 --- a/novel-crawl/src/main/java/com/java2nb/novel/core/crawl/CrawlParser.java +++ b/novel-crawl/src/main/java/com/java2nb/novel/core/crawl/CrawlParser.java @@ -256,9 +256,10 @@ public class CrawlParser { ResponseEntity forEntity = restTemplate.getForEntity(url, String.class); if (forEntity.getStatusCode() == HttpStatus.OK) { String body = forEntity.getBody(); + log.debug("body长度:"+body.length()); if(body.length() < Constants.INVALID_HTML_LENGTH){ log.debug("获取html页面内容失败"); - Thread.sleep(10 + new Random().nextInt(60)); + Thread.sleep( new Random().nextInt(10*1000)); return getByHttpClient(url); } return body; diff --git a/novel-crawl/src/main/java/com/java2nb/novel/utils/Constants.java b/novel-crawl/src/main/java/com/java2nb/novel/utils/Constants.java index 5e0230a..753ca11 100644 --- a/novel-crawl/src/main/java/com/java2nb/novel/utils/Constants.java +++ b/novel-crawl/src/main/java/com/java2nb/novel/utils/Constants.java @@ -18,5 +18,5 @@ public class Constants { /** * 爬取小说http请求中无效的内容长度 */ - public static final int INVALID_HTML_LENGTH = 1000; + public static final int INVALID_HTML_LENGTH = 1500; } diff --git a/novel-front/pom.xml b/novel-front/pom.xml index ef43731..8752dcb 100644 --- a/novel-front/pom.xml +++ b/novel-front/pom.xml @@ -5,7 +5,7 @@ novel com.java2nb - 2.0.1 + 2.0.2 4.0.0 diff --git a/pom.xml b/pom.xml index 44b9a37..f20d784 100644 --- a/pom.xml +++ b/pom.xml @@ -5,7 +5,7 @@ com.java2nb novel - 2.0.1 + 2.0.2 novel-common novel-front diff --git a/sql/20200518.sql b/sql/20200518.sql index 25cf191..c5fe46d 100644 --- a/sql/20200518.sql +++ b/sql/20200518.sql @@ -1 +1 @@ -INSERT INTO `novel_plus`.`crawl_source` (`id`, `source_name`, `crawl_rule`, `source_status`, `create_time`, `update_time`) VALUES ('4', '书趣阁', '{\r\n \"bookListUrl\": \"http://m.shuquge.com/sort/{catId}/0_{page}.html\",\r\n \"catIdRule\": {\r\n \"catId1\": \"1\",\r\n \"catId2\": \"2\",\r\n \"catId3\": \"3\",\r\n \"catId4\": \"4\",\r\n \"catId5\": \"7\",\r\n \"catId6\": \"6\",\r\n \"catId7\": \"8\"\r\n },\r\n \"bookIdPatten\": \"href=\\\"/s/(\\\\d+)\\\\.html\\\"\",\r\n \"pagePatten\": \"第(\\\\d+)/\\\\d+页\",\r\n \"totalPagePatten\": \"第\\\\d+/(\\\\d+)页\",\r\n \"bookDetailUrl\": \"http://m.shuquge.com/s/{bookId}.html\",\r\n \"bookNamePatten\": \"

([^/]+)

\",\r\n \"authorNamePatten\": \"

作者:([^/]+)

\",\r\n \"picUrlPatten\": \"src=\\\"(http://www.shuquge.com/files/article/image/\\\\d+/\\\\d+/\\\\d+s\\\\.jpg)\\\"\",\r\n \"statusPatten\": \"

状态:([^/]+)

\",\r\n \"bookStatusRule\": {\r\n \"连载中\": 0,\r\n \"完本\": 1\r\n },\r\n \"descStart\": \"
\",\r\n \"descEnd\": \"最新章节推荐地址\",\r\n \"bookIndexUrl\": \"http://www.shuquge.com/txt/{bookId}/index.html\",\r\n \"bookIndexStart\": \"》正文卷\",\r\n \"indexIdPatten\": \"
[^/]+
\",\r\n \"indexNamePatten\": \"
([^/]+)
\",\r\n \"bookContentUrl\": \"http://www.shuquge.com/txt/{bookId}/{indexId}.html\",\r\n \"contentStart\": \"
\",\r\n \"contentEnd\": \"http://www.shuquge.com\"\r\n}', '1', '2020-05-18 12:02:34', '2020-05-18 12:02:34'); +INSERT INTO `novel_plus`.`crawl_source` (`id`, `source_name`, `crawl_rule`, `source_status`, `create_time`, `update_time`) VALUES ('4', '书趣阁', '{\r\n \"bookListUrl\": \"http://m.shuquge.com/sort/{catId}/0_{page}.html\",\r\n \"catIdRule\": {\r\n \"catId1\": \"1\",\r\n \"catId2\": \"2\",\r\n \"catId3\": \"3\",\r\n \"catId4\": \"4\",\r\n \"catId5\": \"7\",\r\n \"catId6\": \"6\",\r\n \"catId7\": \"8\"\r\n },\r\n \"bookIdPatten\": \"href=\\\"/s/(\\\\d+)\\\\.html\\\"\",\r\n \"pagePatten\": \"第(\\\\d+)/\\\\d+页\",\r\n \"totalPagePatten\": \"第\\\\d+/(\\\\d+)页\",\r\n \"bookDetailUrl\": \"http://m.shuquge.com/s/{bookId}.html\",\r\n \"bookNamePatten\": \"

([^/]+)

\",\r\n \"authorNamePatten\": \"

作者:([^/]+)

\",\r\n \"picUrlPatten\": \"src=\\\"(http://www.shuquge.com/files/article/image/\\\\d+/\\\\d+/\\\\d+s\\\\.jpg)\\\"\",\r\n \"statusPatten\": \"

状态:([^/]+)

\",\r\n \"bookStatusRule\": {\r\n \"连载中\": 0,\r\n \"完本\": 1\r\n },\r\n \"descStart\": \"
\",\r\n \"descEnd\": \"最新章节推荐地址\",\r\n \"bookIndexUrl\": \"http://www.shuquge.com/txt/{bookId}/index.html\",\r\n \"bookIndexStart\": \"》正文\",\r\n \"indexIdPatten\": \"
[^/]+
\",\r\n \"indexNamePatten\": \"
([^/]+)
\",\r\n \"bookContentUrl\": \"http://www.shuquge.com/txt/{bookId}/{indexId}.html\",\r\n \"contentStart\": \"
\",\r\n \"contentEnd\": \"http://www.shuquge.com\"\r\n}', '1', '2020-05-18 12:02:34', '2020-05-18 12:02:34'); diff --git a/sql/novel_plus.sql b/sql/novel_plus.sql index 5f19a9b..1112d34 100644 --- a/sql/novel_plus.sql +++ b/sql/novel_plus.sql @@ -356,7 +356,7 @@ CREATE TABLE `crawl_source` ( -- ---------------------------- INSERT INTO `crawl_source` VALUES ('2', '百书斋', '{\r\n \"bookListUrl\": \"https://m.baishuzhai.com/blhb/{catId}/{page}.html\",\r\n \"catIdRule\": {\r\n \"catId1\": \"1\",\r\n \"catId2\": \"2\",\r\n \"catId3\": \"3\",\r\n \"catId4\": \"4\",\r\n \"catId5\": \"5\",\r\n \"catId6\": \"6\",\r\n \"catId7\": \"7\"\r\n },\r\n \"bookIdPatten\": \"href=\\\"/ibook/(\\\\d+/\\\\d+)/\\\"\",\r\n \"pagePatten\": \"value=\\\"(\\\\d+)/\\\\d+\\\"\",\r\n \"totalPagePatten\": \"value=\\\"\\\\d+/(\\\\d+)\\\"\",\r\n \"bookDetailUrl\": \"https://m.baishuzhai.com/ibook/{bookId}/\",\r\n \"bookNamePatten\": \"([^/]+)\",\r\n \"authorNamePatten\": \">作者:([^/]+)<\",\r\n \"picUrlPatten\": \"]+)\\\"\\\\s+onerror=\\\"this.src=\",\r\n \"statusPatten\": \"状态:([^/]+)\",\r\n \"bookStatusRule\": {\r\n \"连载\": 0,\r\n \"完成\": 1\r\n },\r\n \"scorePatten\": \"([^<]+)\",\r\n \"descStart\": \"

\",\r\n \"descEnd\": \"

\",\r\n \"upadateTimePatten\": \"更新:(\\\\d+-\\\\d+-\\\\d+)\",\r\n \"upadateTimeFormatPatten\": \"yy-MM-dd\",\r\n \"bookIndexUrl\": \"https://m.baishuzhai.com/ibook/{bookId}/all.html\",\r\n \"indexIdPatten\": \"[^/]+\",\r\n \"indexNamePatten\": \"([^/]+)\",\r\n \"bookContentUrl\": \"https://baishuzhai.com/ibook/{bookId}/{indexId}.html\",\r\n \"contentStart\": \"id=\\\"content\\\">\",\r\n \"contentEnd\": \"