v2.0.2发布

This commit is contained in:
xiongxiaoyang 2020-05-18 17:31:00 +08:00
parent 5e16119880
commit c80b02caf0
4 changed files with 3 additions and 1115 deletions

View File

@ -260,9 +260,7 @@ public class CrawlParser {
ResponseEntity<String> forEntity = restTemplate.getForEntity(url, String.class);
if (forEntity.getStatusCode() == HttpStatus.OK) {
String body = forEntity.getBody();
log.debug("body长度"+body.length());
if(body.length() < Constants.INVALID_HTML_LENGTH){
log.debug("获取html页面内容失败");
return processErrorHttpResult(url);
}
//成功获得html内容

File diff suppressed because it is too large Load Diff

View File

@ -1,2 +1,2 @@
INSERT INTO `novel_plus`.`crawl_source` (`id`, `source_name`, `crawl_rule`, `source_status`, `create_time`, `update_time`) VALUES ('4', '书趣阁', '{\r\n \"bookListUrl\": \"http://m.shuquge.com/sort/{catId}/0_{page}.html\",\r\n \"catIdRule\": {\r\n \"catId1\": \"1\",\r\n \"catId2\": \"2\",\r\n \"catId3\": \"3\",\r\n \"catId4\": \"4\",\r\n \"catId5\": \"7\",\r\n \"catId6\": \"6\",\r\n \"catId7\": \"8\"\r\n },\r\n \"bookIdPatten\": \"href=\\\"/s/(\\\\d+)\\\\.html\\\"\",\r\n \"pagePatten\": \"第(\\\\d+)/\\\\d+页\",\r\n \"totalPagePatten\": \"\\\\d+/(\\\\d+)页\",\r\n \"bookDetailUrl\": \"http://m.shuquge.com/s/{bookId}.html\",\r\n \"bookNamePatten\": \"<a\\\\s+href=\\\"/s/\\\\d+\\\\.html\\\"><h2>([^/]+)</h2></a>\",\r\n \"authorNamePatten\": \"<p>作者:([^/]+)</p>\",\r\n \"picUrlPatten\": \"src=\\\"(http://www.shuquge.com/files/article/image/\\\\d+/\\\\d+/\\\\d+s\\\\.jpg)\\\"\",\r\n \"statusPatten\": \"<p>状态:([^/]+)</p>\",\r\n \"bookStatusRule\": {\r\n \"连载中\": 0,\r\n \"完本\": 1\r\n },\r\n \"descStart\": \"<div class=\\\"intro_info\\\">\",\r\n \"descEnd\": \"最新章节推荐地址\",\r\n \"bookIndexUrl\": \"http://www.shuquge.com/txt/{bookId}/index.html\",\r\n \"bookIndexStart\": \"》正文\",\r\n \"indexIdPatten\": \"<dd><a\\\\s+href=\\\"(\\\\d+)\\\\.html\\\">[^/]+</a></dd>\",\r\n \"indexNamePatten\": \"<dd><a\\\\s+href=\\\"\\\\d+\\\\.html\\\">([^/]+)</a></dd>\",\r\n \"bookContentUrl\": \"http://www.shuquge.com/txt/{bookId}/{indexId}.html\",\r\n \"contentStart\": \"<div id=\\\"content\\\" class=\\\"showtxt\\\">\",\r\n \"contentEnd\": \"http://www.shuquge.com\"\r\n}', '1', '2020-05-18 12:02:34', '2020-05-18 12:02:34');
INSERT INTO `novel_plus`.`crawl_source` (`id`, `source_name`, `crawl_rule`, `source_status`, `create_time`, `update_time`) VALUES ('5', '笔趣阁', '{\"bookListUrl\":\"http://m.mcmssc.com/xclass/{catId}/{page}.html\",\"catIdRule\":{\"catId1\":\"1\",\"catId2\":\"2\",\"catId3\":\"3\",\"catId4\":\"4\",\"catId5\":\"5\",\"catId6\":\"6\",\"catId7\":\"7\"},\"bookIdPatten\":\"href=\\\"/(\\\\d+_\\\\d+)/\\\"\",\"pagePatten\":\"class=\\\"page_txt\\\"\\\\s+value=\\\"(\\\\d+)/\\\\d+\\\"\\\\s+size=\",\"totalPagePatten\":\"class=\\\"page_txt\\\"\\\\s+value=\\\"\\\\d+/(\\\\d+)\\\"\\\\s+size=\",\"bookDetailUrl\":\"http://m.mcmssc.com/{bookId}/\",\"bookNamePatten\":\"<span\\\\s+class=\\\"title\\\">([^/]+)</span>\",\"authorNamePatten\":\"<a\\\\s+href=\\\"/author/\\\\d+/\\\">([^/]+)</a>\",\"picUrlPatten\":\"<img\\\\s+src=\\\"([^>]+)\\\"\\\\s+onerror=\",\"picUrlPrefix\":\"http://m.mcmssc.com/\",\"statusPatten\":\">状态:([^/]+)<\",\"bookStatusRule\":{\"连载\":0,\"全本\":1},\"visitCountPatten\":\">点击:(\\\\d+)<\",\"descStart\":\"<p class=\\\"review\\\">\",\"descEnd\":\"</p>\",\"bookIndexUrl\":\"http://m.mcmssc.com/{bookId}/all.html\",\"indexIdPatten\":\"<a\\\\s+href=\\\"/\\\\d+_\\\\d+/(\\\\d+)\\\\.html\\\">[^/]+</a>\",\"indexNamePatten\":\"<a\\\\s+href=\\\"/\\\\d+_\\\\d+/\\\\d+\\\\.html\\\">([^/]+)</a>\",\"bookContentUrl\":\"http://www.mcmssc.com/{bookId}/{indexId}.html\",\"contentStart\":\"</p>\",\"contentEnd\":\"<div align=\\\"center\\\">\"}', '1', '2020-05-18 15:57:41', '2020-05-18 15:57:41');
INSERT INTO `crawl_source` (`id`, `source_name`, `crawl_rule`, `source_status`, `create_time`, `update_time`) VALUES ('4', '书趣阁', '{\r\n \"bookListUrl\": \"http://m.shuquge.com/sort/{catId}/0_{page}.html\",\r\n \"catIdRule\": {\r\n \"catId1\": \"1\",\r\n \"catId2\": \"2\",\r\n \"catId3\": \"3\",\r\n \"catId4\": \"4\",\r\n \"catId5\": \"7\",\r\n \"catId6\": \"6\",\r\n \"catId7\": \"8\"\r\n },\r\n \"bookIdPatten\": \"href=\\\"/s/(\\\\d+)\\\\.html\\\"\",\r\n \"pagePatten\": \"第(\\\\d+)/\\\\d+页\",\r\n \"totalPagePatten\": \"\\\\d+/(\\\\d+)页\",\r\n \"bookDetailUrl\": \"http://m.shuquge.com/s/{bookId}.html\",\r\n \"bookNamePatten\": \"<a\\\\s+href=\\\"/s/\\\\d+\\\\.html\\\"><h2>([^/]+)</h2></a>\",\r\n \"authorNamePatten\": \"<p>作者:([^/]+)</p>\",\r\n \"picUrlPatten\": \"src=\\\"(http://www.shuquge.com/files/article/image/\\\\d+/\\\\d+/\\\\d+s\\\\.jpg)\\\"\",\r\n \"statusPatten\": \"<p>状态:([^/]+)</p>\",\r\n \"bookStatusRule\": {\r\n \"连载中\": 0,\r\n \"完本\": 1\r\n },\r\n \"descStart\": \"<div class=\\\"intro_info\\\">\",\r\n \"descEnd\": \"最新章节推荐地址\",\r\n \"bookIndexUrl\": \"http://www.shuquge.com/txt/{bookId}/index.html\",\r\n \"bookIndexStart\": \"》正文\",\r\n \"indexIdPatten\": \"<dd><a\\\\s+href=\\\"(\\\\d+)\\\\.html\\\">[^/]+</a></dd>\",\r\n \"indexNamePatten\": \"<dd><a\\\\s+href=\\\"\\\\d+\\\\.html\\\">([^/]+)</a></dd>\",\r\n \"bookContentUrl\": \"http://www.shuquge.com/txt/{bookId}/{indexId}.html\",\r\n \"contentStart\": \"<div id=\\\"content\\\" class=\\\"showtxt\\\">\",\r\n \"contentEnd\": \"http://www.shuquge.com\"\r\n}', '1', '2020-05-18 12:02:34', '2020-05-18 12:02:34');
INSERT INTO `crawl_source` (`id`, `source_name`, `crawl_rule`, `source_status`, `create_time`, `update_time`) VALUES ('5', '笔趣阁', '{\"bookListUrl\":\"http://m.mcmssc.com/xclass/{catId}/{page}.html\",\"catIdRule\":{\"catId1\":\"1\",\"catId2\":\"2\",\"catId3\":\"3\",\"catId4\":\"4\",\"catId5\":\"5\",\"catId6\":\"6\",\"catId7\":\"7\"},\"bookIdPatten\":\"href=\\\"/(\\\\d+_\\\\d+)/\\\"\",\"pagePatten\":\"class=\\\"page_txt\\\"\\\\s+value=\\\"(\\\\d+)/\\\\d+\\\"\\\\s+size=\",\"totalPagePatten\":\"class=\\\"page_txt\\\"\\\\s+value=\\\"\\\\d+/(\\\\d+)\\\"\\\\s+size=\",\"bookDetailUrl\":\"http://m.mcmssc.com/{bookId}/\",\"bookNamePatten\":\"<span\\\\s+class=\\\"title\\\">([^/]+)</span>\",\"authorNamePatten\":\"<a\\\\s+href=\\\"/author/\\\\d+/\\\">([^/]+)</a>\",\"picUrlPatten\":\"<img\\\\s+src=\\\"([^>]+)\\\"\\\\s+onerror=\",\"picUrlPrefix\":\"http://m.mcmssc.com/\",\"statusPatten\":\">状态:([^/]+)<\",\"bookStatusRule\":{\"连载\":0,\"全本\":1},\"visitCountPatten\":\">点击:(\\\\d+)<\",\"descStart\":\"<p class=\\\"review\\\">\",\"descEnd\":\"</p>\",\"bookIndexUrl\":\"http://m.mcmssc.com/{bookId}/all.html\",\"indexIdPatten\":\"<a\\\\s+href=\\\"/\\\\d+_\\\\d+/(\\\\d+)\\\\.html\\\">[^/]+</a>\",\"indexNamePatten\":\"<a\\\\s+href=\\\"/\\\\d+_\\\\d+/\\\\d+\\\\.html\\\">([^/]+)</a>\",\"bookContentUrl\":\"http://www.mcmssc.com/{bookId}/{indexId}.html\",\"contentStart\":\"</p>\",\"contentEnd\":\"<div align=\\\"center\\\">\"}', '1', '2020-05-18 15:57:41', '2020-05-18 15:57:41');

View File

@ -357,6 +357,7 @@ CREATE TABLE `crawl_source` (
INSERT INTO `crawl_source` VALUES ('2', '百书斋', '{\r\n \"bookListUrl\": \"https://m.baishuzhai.com/blhb/{catId}/{page}.html\",\r\n \"catIdRule\": {\r\n \"catId1\": \"1\",\r\n \"catId2\": \"2\",\r\n \"catId3\": \"3\",\r\n \"catId4\": \"4\",\r\n \"catId5\": \"5\",\r\n \"catId6\": \"6\",\r\n \"catId7\": \"7\"\r\n },\r\n \"bookIdPatten\": \"href=\\\"/ibook/(\\\\d+/\\\\d+)/\\\"\",\r\n \"pagePatten\": \"value=\\\"(\\\\d+)/\\\\d+\\\"\",\r\n \"totalPagePatten\": \"value=\\\"\\\\d+/(\\\\d+)\\\"\",\r\n \"bookDetailUrl\": \"https://m.baishuzhai.com/ibook/{bookId}/\",\r\n \"bookNamePatten\": \"<span class=\\\"title\\\">([^/]+)</span>\",\r\n \"authorNamePatten\": \">作者:([^/]+)<\",\r\n \"picUrlPatten\": \"<img src=\\\"([^>]+)\\\"\\\\s+onerror=\\\"this.src=\",\r\n \"statusPatten\": \"状态:([^/]+)</li>\",\r\n \"bookStatusRule\": {\r\n \"连载\": 0,\r\n \"完成\": 1\r\n },\r\n \"scorePatten\": \"<em>([^<]+)</em>\",\r\n \"descStart\": \"<p class=\\\"review\\\">\",\r\n \"descEnd\": \"</p>\",\r\n \"upadateTimePatten\": \"更新:(\\\\d+-\\\\d+-\\\\d+)</li>\",\r\n \"upadateTimeFormatPatten\": \"yy-MM-dd\",\r\n \"bookIndexUrl\": \"https://m.baishuzhai.com/ibook/{bookId}/all.html\",\r\n \"indexIdPatten\": \"<a\\\\s+style=\\\"\\\"\\\\s+href=\\\"/ibook/\\\\d+/\\\\d+/(\\\\d+)\\\\.html\\\">[^/]+</a>\",\r\n \"indexNamePatten\": \"<a\\\\s+style=\\\"\\\"\\\\s+href=\\\"/ibook/\\\\d+/\\\\d+/\\\\d+\\\\.html\\\">([^/]+)</a>\",\r\n \"bookContentUrl\": \"https://baishuzhai.com/ibook/{bookId}/{indexId}.html\",\r\n \"contentStart\": \"id=\\\"content\\\">\",\r\n \"contentEnd\": \"<script>\"\r\n}', '0', '2020-05-01 14:22:50', '2020-05-01 14:22:50');
INSERT INTO `crawl_source` VALUES ('3', '书包网', '{\r\n \"bookListUrl\": \"https://www.bookbao8.com/booklist-p_{page}-c_{catId}-t_0-o_0.html\",\r\n \"catIdRule\": {\r\n \"catId1\": \"5\",\r\n \"catId2\": \"4\",\r\n \"catId3\": \"8\",\r\n \"catId4\": \"9\",\r\n \"catId5\": \"3\",\r\n \"catId6\": \"7\"\r\n },\r\n \"bookIdPatten\": \"href=\\\"/book/(\\\\d+/\\\\d+/id_[^.]+).html\\\"\",\r\n \"pagePatten\": \"<span\\\\s+class=\\\"current\\\">([^<]+)</span>\",\r\n \"totalPagePatten\": \"/共(\\\\d+)页\",\r\n \"bookDetailUrl\": \"https://www.bookbao8.com/book/{bookId}.html\",\r\n \"bookNamePatten\": \"<div\\\\s+id=\\\"info\\\">\\\\s*<h1>([^<]+)</h1>\",\r\n \"authorNamePatten\": \"<p>作者:<a\\\\s+href=\\\"/Search/[^\\\"]+\\\"\\\\s+target=\\\"_blank\\\">([^<]+)</a></p>\",\r\n \"picUrlPatten\": \"<div\\\\s+id=\\\"fmimg\\\">\\\\s*<img\\\\s+alt=\\\"[^\\\"]+\\\"\\\\s+src=\\\"([^\\\"]+)\\\"\",\r\n \"statusPatten\": \"<p>状态:([^<]+)</p>\",\r\n \"bookStatusRule\": {\r\n \"连载中\": 0,\r\n \"已完结\": 1\r\n },\r\n \"visitCountPatten\": \"<em\\\\s+id=\\\"hits\\\">(\\\\d+)</em>\",\r\n \"descStart\": \"<div class=\\\"infocontent\\\">\",\r\n \"descEnd\": \"</div>\",\r\n \"upadateTimePatten\": \"<p>更新时间:(\\\\d+-\\\\d+-\\\\d+\\\\s\\\\d+:\\\\d+:\\\\d+)</p>\",\r\n \"upadateTimeFormatPatten\": \"yyyy-MM-dd HH:mm:ss\",\r\n \"bookIndexUrl\": \"https://www.bookbao8.com/book/{bookId}.html\",\r\n \"indexIdPatten\": \"<li>\\\\s*<a\\\\s+href=\\\"/views/\\\\d+/\\\\d+/id_[^_]+_(\\\\d+).html\\\"\\\\s+target=\\\"_blank\\\">\",\r\n \"indexNamePatten\": \"<li>\\\\s*<a\\\\s+href=\\\"/views/\\\\d+/\\\\d+/id_[^_]+_\\\\d+.html\\\"\\\\s+target=\\\"_blank\\\">([^<]+)</a>\",\r\n \"bookContentUrl\": \"https://www.bookbao8.com/views/{bookId}_{indexId}.html\",\r\n \"contentStart\": \"<dd id=\\\"contents\\\">\",\r\n \"contentEnd\": \"</dd>\"\r\n}', '0', '2020-05-04 17:42:22', '2020-05-04 17:42:22');
INSERT INTO `crawl_source` VALUES ('4', '书趣阁', '{\r\n \"bookListUrl\": \"http://m.shuquge.com/sort/{catId}/0_{page}.html\",\r\n \"catIdRule\": {\r\n \"catId1\": \"1\",\r\n \"catId2\": \"2\",\r\n \"catId3\": \"3\",\r\n \"catId4\": \"4\",\r\n \"catId5\": \"7\",\r\n \"catId6\": \"6\",\r\n \"catId7\": \"8\"\r\n },\r\n \"bookIdPatten\": \"href=\\\"/s/(\\\\d+)\\\\.html\\\"\",\r\n \"pagePatten\": \"第(\\\\d+)/\\\\d+页\",\r\n \"totalPagePatten\": \"\\\\d+/(\\\\d+)页\",\r\n \"bookDetailUrl\": \"http://m.shuquge.com/s/{bookId}.html\",\r\n \"bookNamePatten\": \"<a\\\\s+href=\\\"/s/\\\\d+\\\\.html\\\"><h2>([^/]+)</h2></a>\",\r\n \"authorNamePatten\": \"<p>作者:([^/]+)</p>\",\r\n \"picUrlPatten\": \"src=\\\"(http://www.shuquge.com/files/article/image/\\\\d+/\\\\d+/\\\\d+s\\\\.jpg)\\\"\",\r\n \"statusPatten\": \"<p>状态:([^/]+)</p>\",\r\n \"bookStatusRule\": {\r\n \"连载中\": 0,\r\n \"完本\": 1\r\n },\r\n \"descStart\": \"<div class=\\\"intro_info\\\">\",\r\n \"descEnd\": \"最新章节推荐地址\",\r\n \"bookIndexUrl\": \"http://www.shuquge.com/txt/{bookId}/index.html\",\r\n \"bookIndexStart\": \"》正文\",\r\n \"indexIdPatten\": \"<dd><a\\\\s+href=\\\"(\\\\d+)\\\\.html\\\">[^/]+</a></dd>\",\r\n \"indexNamePatten\": \"<dd><a\\\\s+href=\\\"\\\\d+\\\\.html\\\">([^/]+)</a></dd>\",\r\n \"bookContentUrl\": \"http://www.shuquge.com/txt/{bookId}/{indexId}.html\",\r\n \"contentStart\": \"<div id=\\\"content\\\" class=\\\"showtxt\\\">\",\r\n \"contentEnd\": \"http://www.shuquge.com\"\r\n}', '1', '2020-05-18 12:02:34', '2020-05-18 12:02:34');
INSERT INTO `crawl_source` (`id`, `source_name`, `crawl_rule`, `source_status`, `create_time`, `update_time`) VALUES ('5', '笔趣阁', '{\"bookListUrl\":\"http://m.mcmssc.com/xclass/{catId}/{page}.html\",\"catIdRule\":{\"catId1\":\"1\",\"catId2\":\"2\",\"catId3\":\"3\",\"catId4\":\"4\",\"catId5\":\"5\",\"catId6\":\"6\",\"catId7\":\"7\"},\"bookIdPatten\":\"href=\\\"/(\\\\d+_\\\\d+)/\\\"\",\"pagePatten\":\"class=\\\"page_txt\\\"\\\\s+value=\\\"(\\\\d+)/\\\\d+\\\"\\\\s+size=\",\"totalPagePatten\":\"class=\\\"page_txt\\\"\\\\s+value=\\\"\\\\d+/(\\\\d+)\\\"\\\\s+size=\",\"bookDetailUrl\":\"http://m.mcmssc.com/{bookId}/\",\"bookNamePatten\":\"<span\\\\s+class=\\\"title\\\">([^/]+)</span>\",\"authorNamePatten\":\"<a\\\\s+href=\\\"/author/\\\\d+/\\\">([^/]+)</a>\",\"picUrlPatten\":\"<img\\\\s+src=\\\"([^>]+)\\\"\\\\s+onerror=\",\"picUrlPrefix\":\"http://m.mcmssc.com/\",\"statusPatten\":\">状态:([^/]+)<\",\"bookStatusRule\":{\"连载\":0,\"全本\":1},\"visitCountPatten\":\">点击:(\\\\d+)<\",\"descStart\":\"<p class=\\\"review\\\">\",\"descEnd\":\"</p>\",\"bookIndexUrl\":\"http://m.mcmssc.com/{bookId}/all.html\",\"indexIdPatten\":\"<a\\\\s+href=\\\"/\\\\d+_\\\\d+/(\\\\d+)\\\\.html\\\">[^/]+</a>\",\"indexNamePatten\":\"<a\\\\s+href=\\\"/\\\\d+_\\\\d+/\\\\d+\\\\.html\\\">([^/]+)</a>\",\"bookContentUrl\":\"http://www.mcmssc.com/{bookId}/{indexId}.html\",\"contentStart\":\"</p>\",\"contentEnd\":\"<div align=\\\"center\\\">\"}', '1', '2020-05-18 15:57:41', '2020-05-18 15:57:41');
-- ----------------------------
-- Table structure for friend_link