diff --git a/.gitignore b/.gitignore index b1cc347..cdb1967 100644 --- a/.gitignore +++ b/.gitignore @@ -14,3 +14,5 @@ /*.iml /novel-admin/*.iml .DS_Store +/novel-admin/cachedata +/novel-admin/logs diff --git a/novel-admin/logs/debug.log b/novel-admin/logs/debug.log index a4414cb..5ebdf85 100644 --- a/novel-admin/logs/debug.log +++ b/novel-admin/logs/debug.log @@ -3,3 +3,6 @@ 2020-05-13 21:52:01,131 INFO (SpringApplication.java:663)- The following profiles are active: dev 2020-05-13 21:52:54,469 DEBUG (ApplicationContextRegister.java:29)- ApplicationContext registed-->org.springframework.web.context.support.GenericWebApplicationContext@5b529706: startup date [Wed May 13 21:52:01 CST 2020]; root of context hierarchy 2020-05-13 21:53:49,622 INFO (StartupInfoLogger.java:59)- Started TestDemo in 114.268 seconds (JVM running for 124.957) +2020-05-18 09:48:03,219 INFO (StartupInfoLogger.java:50)- Starting TestDemo on DESKTOP-CPCLUI6 with PID 13172 (started by 11797 in D:\gitee\novel-plus\novel-admin) +2020-05-18 09:48:03,223 DEBUG (StartupInfoLogger.java:53)- Running with Spring Boot v2.0.1.RELEASE, Spring v5.0.5.RELEASE +2020-05-18 09:48:03,227 INFO (SpringApplication.java:663)- The following profiles are active: dev diff --git a/novel-crawl/src/main/java/com/java2nb/novel/core/crawl/CrawlParser.java b/novel-crawl/src/main/java/com/java2nb/novel/core/crawl/CrawlParser.java index 11db043..c02195b 100644 --- a/novel-crawl/src/main/java/com/java2nb/novel/core/crawl/CrawlParser.java +++ b/novel-crawl/src/main/java/com/java2nb/novel/core/crawl/CrawlParser.java @@ -3,12 +3,17 @@ package com.java2nb.novel.core.crawl; import com.java2nb.novel.core.utils.HttpUtil; import com.java2nb.novel.core.utils.IdWorker; import com.java2nb.novel.core.utils.RandomBookInfoUtil; +import com.java2nb.novel.core.utils.RestTemplateUtil; import com.java2nb.novel.entity.Book; import com.java2nb.novel.entity.BookContent; import com.java2nb.novel.entity.BookIndex; import com.java2nb.novel.utils.Constants; import lombok.SneakyThrows; +import lombok.extern.slf4j.Slf4j; import org.apache.commons.lang3.StringUtils; +import org.springframework.http.HttpStatus; +import org.springframework.http.ResponseEntity; +import org.springframework.web.client.RestTemplate; import java.text.SimpleDateFormat; import java.util.*; @@ -22,17 +27,20 @@ import static java.util.regex.Pattern.compile; * * @author Administrator */ +@Slf4j public class CrawlParser { public static final Integer BOOK_INDEX_LIST_KEY = 1; public static final Integer BOOK_CONTENT_LIST_KEY = 2; + private static RestTemplate restTemplate = RestTemplateUtil.getInstance("utf-8"); + @SneakyThrows public static Book parseBook(RuleBean ruleBean, String bookId) { Book book = new Book(); String bookDetailUrl = ruleBean.getBookDetailUrl().replace("{bookId}", bookId); - String bookDetailHtml = HttpUtil.getByHttpClient(bookDetailUrl); + String bookDetailHtml = getByHttpClient(bookDetailUrl); if (bookDetailHtml != null) { Pattern bookNamePatten = compile(ruleBean.getBookNamePatten()); Matcher bookNameMatch = bookNamePatten.matcher(bookDetailHtml); @@ -54,6 +62,9 @@ public class CrawlParser { boolean isFindPicUrl = picUrlMatch.find(); if (isFindPicUrl) { String picUrl = picUrlMatch.group(1); + if(StringUtils.isNotBlank(picUrl) && StringUtils.isNotBlank(ruleBean.getPicUrlPrefix())) { + picUrl = ruleBean.getPicUrlPrefix() + picUrl; + } //设置封面图片路径 book.setPicUrl(picUrl); } @@ -136,7 +147,10 @@ public class CrawlParser { List contentList = new ArrayList<>(); //读取目录 String indexListUrl = ruleBean.getBookIndexUrl().replace("{bookId}", sourceBookId); - String indexListHtml = HttpUtil.getByHttpClient(indexListUrl); + String indexListHtml = getByHttpClient(indexListUrl); + if(StringUtils.isNotBlank(ruleBean.getBookIndexStart())){ + indexListHtml = indexListHtml.substring(indexListHtml.indexOf(ruleBean.getBookIndexStart()) + ruleBean.getBookIndexStart().length()); + } if (indexListHtml != null) { Pattern indexIdPatten = compile(ruleBean.getIndexIdPatten()); Matcher indexIdMatch = indexIdPatten.matcher(indexListHtml); @@ -162,7 +176,7 @@ public class CrawlParser { String contentUrl = ruleBean.getBookContentUrl().replace("{bookId}", sourceBookId).replace("{indexId}", indexIdMatch.group(1)); //查询章节内容 - String contentHtml = HttpUtil.getByHttpClient(contentUrl); + String contentHtml = getByHttpClient(contentUrl); if (contentHtml != null) { String content = contentHtml.substring(contentHtml.indexOf(ruleBean.getContentStart()) + ruleBean.getContentStart().length()); content = content.substring(0, content.indexOf(ruleBean.getContentEnd())); @@ -237,4 +251,25 @@ public class CrawlParser { } + private static String getByHttpClient(String url) { + try { + ResponseEntity forEntity = restTemplate.getForEntity(url, String.class); + if (forEntity.getStatusCode() == HttpStatus.OK) { + String body = forEntity.getBody(); + if(body.length() < Constants.INVALID_HTML_LENGTH){ + log.debug("获取html页面内容失败"); + Thread.sleep(10 + new Random().nextInt(60)); + return getByHttpClient(url); + } + return body; + } else { + return null; + } + } catch (Exception e) { + e.printStackTrace(); + return null; + } + } + + } diff --git a/novel-crawl/src/main/java/com/java2nb/novel/core/crawl/RuleBean.java b/novel-crawl/src/main/java/com/java2nb/novel/core/crawl/RuleBean.java index 6c7e678..5a8d319 100644 --- a/novel-crawl/src/main/java/com/java2nb/novel/core/crawl/RuleBean.java +++ b/novel-crawl/src/main/java/com/java2nb/novel/core/crawl/RuleBean.java @@ -11,6 +11,14 @@ import java.util.Map; @Data public class RuleBean { + /** + * 小说更新列表url + * */ + private String updateBookListUrl; + + /** + * 分类列表页URL规则 + * */ private String bookListUrl; private Map catIdRule; @@ -39,4 +47,9 @@ public class RuleBean { private String contentEnd; + private String picUrlPrefix; + + private String bookIndexStart; + + } diff --git a/novel-crawl/src/main/java/com/java2nb/novel/utils/Constants.java b/novel-crawl/src/main/java/com/java2nb/novel/utils/Constants.java index e7a290c..5e0230a 100644 --- a/novel-crawl/src/main/java/com/java2nb/novel/utils/Constants.java +++ b/novel-crawl/src/main/java/com/java2nb/novel/utils/Constants.java @@ -14,4 +14,9 @@ public class Constants { * 访问量默认值 */ public static final Long VISIT_COUNT_DEFAULT = 100L; + + /** + * 爬取小说http请求中无效的内容长度 + */ + public static final int INVALID_HTML_LENGTH = 1000; } diff --git a/novel-crawl/src/main/resources/templates/crawl/crawlSource_add.html b/novel-crawl/src/main/resources/templates/crawl/crawlSource_add.html index 5af6e69..74b7f20 100644 --- a/novel-crawl/src/main/resources/templates/crawl/crawlSource_add.html +++ b/novel-crawl/src/main/resources/templates/crawl/crawlSource_add.html @@ -57,6 +57,9 @@
  • 示例:新顶点小说网
  • + 示例:http://m.xdingdiann.com/sort/{catId}/{page}.html ({catId}代表分类ID,{page}代表分页页码)
  • @@ -95,6 +98,9 @@ 示例:<img src="([^>]+)"\s+onerror="this.src=
  • + 可空,适用于图片路径为相对路径的源站,加上小说图片路径,则为完整的可访问的图片路径 +
  • 示例:状态:([^/]+)</li>
  • @@ -125,6 +131,9 @@ 示例:http://m.xdingdiann.com/ddk{bookId}/all.html (bookId代表小说ID)
  • + 可空,适用于最新章节列表和全部章节列表在同一个页面的源站 +
  • 示例:<a\s+style=""\s+href="/ddk\d+/(\d+)\.html">[^/]+</a>
  • @@ -278,6 +287,12 @@ crawlRule.picUrlPatten = picUrlPatten; } + var picUrlPrefix = $("#picUrlPrefix").val(); + + if (picUrlPrefix.length > 0) { + crawlRule.picUrlPrefix = picUrlPrefix; + } + var statusPatten = $("#statusPatten").val(); if (statusPatten.length > 0) { crawlRule.statusPatten = statusPatten; @@ -345,6 +360,13 @@ crawlRule.bookIndexUrl = bookIndexUrl; + + var bookIndexStart = $("#bookIndexStart").val(); + + if (bookIndexStart.length > 0) { + crawlRule.bookIndexStart = bookIndexStart; + } + var indexIdPatten = $("#indexIdPatten").val(); if (indexIdPatten.length == 0) { diff --git a/novel-front/src/main/resources/static/images/default.gif b/novel-front/src/main/resources/static/images/default.gif new file mode 100644 index 0000000..4341383 Binary files /dev/null and b/novel-front/src/main/resources/static/images/default.gif differ diff --git a/sql/20200518.sql b/sql/20200518.sql new file mode 100644 index 0000000..25cf191 --- /dev/null +++ b/sql/20200518.sql @@ -0,0 +1 @@ +INSERT INTO `novel_plus`.`crawl_source` (`id`, `source_name`, `crawl_rule`, `source_status`, `create_time`, `update_time`) VALUES ('4', '书趣阁', '{\r\n \"bookListUrl\": \"http://m.shuquge.com/sort/{catId}/0_{page}.html\",\r\n \"catIdRule\": {\r\n \"catId1\": \"1\",\r\n \"catId2\": \"2\",\r\n \"catId3\": \"3\",\r\n \"catId4\": \"4\",\r\n \"catId5\": \"7\",\r\n \"catId6\": \"6\",\r\n \"catId7\": \"8\"\r\n },\r\n \"bookIdPatten\": \"href=\\\"/s/(\\\\d+)\\\\.html\\\"\",\r\n \"pagePatten\": \"第(\\\\d+)/\\\\d+页\",\r\n \"totalPagePatten\": \"第\\\\d+/(\\\\d+)页\",\r\n \"bookDetailUrl\": \"http://m.shuquge.com/s/{bookId}.html\",\r\n \"bookNamePatten\": \"

    ([^/]+)

    \",\r\n \"authorNamePatten\": \"

    作者:([^/]+)

    \",\r\n \"picUrlPatten\": \"src=\\\"(http://www.shuquge.com/files/article/image/\\\\d+/\\\\d+/\\\\d+s\\\\.jpg)\\\"\",\r\n \"statusPatten\": \"

    状态:([^/]+)

    \",\r\n \"bookStatusRule\": {\r\n \"连载中\": 0,\r\n \"完本\": 1\r\n },\r\n \"descStart\": \"
    \",\r\n \"descEnd\": \"最新章节推荐地址\",\r\n \"bookIndexUrl\": \"http://www.shuquge.com/txt/{bookId}/index.html\",\r\n \"bookIndexStart\": \"》正文卷\",\r\n \"indexIdPatten\": \"
    [^/]+
    \",\r\n \"indexNamePatten\": \"
    ([^/]+)
    \",\r\n \"bookContentUrl\": \"http://www.shuquge.com/txt/{bookId}/{indexId}.html\",\r\n \"contentStart\": \"
    \",\r\n \"contentEnd\": \"http://www.shuquge.com\"\r\n}', '1', '2020-05-18 12:02:34', '2020-05-18 12:02:34'); diff --git a/sql/novel_plus.sql b/sql/novel_plus.sql index 86d510c..5f19a9b 100644 --- a/sql/novel_plus.sql +++ b/sql/novel_plus.sql @@ -2,19 +2,65 @@ Navicat MySQL Data Transfer Source Server : localhost -Source Server Version : 50624 +Source Server Version : 50725 Source Host : localhost:3306 -Source Database : novel_biz +Source Database : novel_plus Target Server Type : MYSQL -Target Server Version : 50624 +Target Server Version : 50725 File Encoding : 65001 -Date: 2020-05-02 10:53:04 +Date: 2020-05-18 13:59:04 */ SET FOREIGN_KEY_CHECKS=0; +-- ---------------------------- +-- Table structure for author +-- ---------------------------- +DROP TABLE IF EXISTS `author`; +CREATE TABLE `author` ( + `id` bigint(20) NOT NULL AUTO_INCREMENT COMMENT '主键', + `user_id` bigint(20) DEFAULT NULL COMMENT '用户ID', + `invite_code` varchar(20) DEFAULT NULL COMMENT '邀请码', + `pen_name` varchar(20) DEFAULT NULL COMMENT '笔名', + `tel_phone` varchar(20) DEFAULT NULL COMMENT '手机号码', + `chat_account` varchar(50) DEFAULT NULL COMMENT 'QQ或微信账号', + `email` varchar(50) DEFAULT NULL COMMENT '电子邮箱', + `work_direction` tinyint(4) DEFAULT NULL COMMENT '作品方向,0:男频,1:女频', + `status` tinyint(4) DEFAULT '0' COMMENT '0:正常,1:封禁', + `create_time` datetime DEFAULT NULL COMMENT '创建时间', + PRIMARY KEY (`id`) +) ENGINE=InnoDB AUTO_INCREMENT=3 DEFAULT CHARSET=utf8mb4 COMMENT='作者表'; + +-- ---------------------------- +-- Records of author +-- ---------------------------- +INSERT INTO `author` VALUES ('1', null, 'reerer', 'abc', '13560487656', '23484388', '23484388@qq.com', '0', '0', null); +INSERT INTO `author` VALUES ('2', '1255060328322027520', 'rwrr445554', '梦入神机', '13560421324', '1179705413', 'reerer@qq.com', '0', '0', '2020-05-13 14:01:31'); + +-- ---------------------------- +-- Table structure for author_code +-- ---------------------------- +DROP TABLE IF EXISTS `author_code`; +CREATE TABLE `author_code` ( + `id` bigint(20) NOT NULL AUTO_INCREMENT COMMENT '主键', + `invite_code` varchar(100) DEFAULT NULL COMMENT '邀请码', + `validity_time` datetime DEFAULT NULL COMMENT '有效时间', + `is_use` tinyint(1) DEFAULT '0' COMMENT '是否使用过,0:未使用,1:使用过', + `create_time` datetime DEFAULT NULL COMMENT '创建时间', + `create_user_id` bigint(20) DEFAULT NULL COMMENT '创建人ID', + PRIMARY KEY (`id`), + UNIQUE KEY `key_code` (`invite_code`) USING BTREE +) ENGINE=InnoDB AUTO_INCREMENT=6 DEFAULT CHARSET=utf8mb4 COMMENT='作家邀请码表'; + +-- ---------------------------- +-- Records of author_code +-- ---------------------------- +INSERT INTO `author_code` VALUES ('3', 'reerer', '2020-05-27 22:43:45', '1', '2020-05-13 11:40:56', '1'); +INSERT INTO `author_code` VALUES ('4', '123456', '2020-05-28 00:00:00', '0', '2020-05-13 14:09:55', '1'); +INSERT INTO `author_code` VALUES ('5', 'ww34343', '2020-05-21 00:00:00', '0', '2020-05-13 14:18:58', '1'); + -- ---------------------------- -- Table structure for book -- ---------------------------- @@ -49,7 +95,7 @@ CREATE TABLE `book` ( UNIQUE KEY `key_uq_bookName_authorName` (`book_name`,`author_name`) USING BTREE, KEY `key_lastIndexUpdateTime` (`last_index_update_time`) USING BTREE, KEY `key_createTime` (`create_time`) USING BTREE -) ENGINE=InnoDB AUTO_INCREMENT=1256127379949019137 DEFAULT CHARSET=utf8mb4 COMMENT='小说表'; +) ENGINE=InnoDB AUTO_INCREMENT=1262260513468559361 DEFAULT CHARSET=utf8mb4 COMMENT='小说表'; -- ---------------------------- -- Records of book @@ -156,7 +202,7 @@ CREATE TABLE `book_content` ( `content` mediumtext COMMENT '小说章节内容', PRIMARY KEY (`id`), UNIQUE KEY `key_uq_indexId` (`index_id`) USING BTREE -) ENGINE=InnoDB AUTO_INCREMENT=3342428 DEFAULT CHARSET=utf8mb4 COMMENT='小说内容表'; +) ENGINE=InnoDB AUTO_INCREMENT=3347665 DEFAULT CHARSET=utf8mb4 COMMENT='小说内容表'; -- ---------------------------- -- Records of book_content @@ -179,7 +225,7 @@ CREATE TABLE `book_index` ( UNIQUE KEY `key_uq_bookId_indexNum` (`book_id`,`index_num`) USING BTREE, KEY `key_bookId` (`book_id`) USING BTREE, KEY `key_indexNum` (`index_num`) USING BTREE -) ENGINE=InnoDB AUTO_INCREMENT=1256373101432717313 DEFAULT CHARSET=utf8mb4 COMMENT='小说目录表'; +) ENGINE=InnoDB AUTO_INCREMENT=1262260612777095169 DEFAULT CHARSET=utf8mb4 COMMENT='小说目录表'; -- ---------------------------- -- Records of book_index @@ -303,13 +349,14 @@ CREATE TABLE `crawl_source` ( `create_time` datetime DEFAULT NULL COMMENT '创建时间', `update_time` datetime DEFAULT NULL COMMENT '更新时间', PRIMARY KEY (`id`) -) ENGINE=InnoDB AUTO_INCREMENT=4 DEFAULT CHARSET=utf8mb4 COMMENT='爬虫源表'; +) ENGINE=InnoDB AUTO_INCREMENT=5 DEFAULT CHARSET=utf8mb4 COMMENT='爬虫源表'; -- ---------------------------- -- Records of crawl_source -- ---------------------------- INSERT INTO `crawl_source` VALUES ('2', '百书斋', '{\r\n \"bookListUrl\": \"https://m.baishuzhai.com/blhb/{catId}/{page}.html\",\r\n \"catIdRule\": {\r\n \"catId1\": \"1\",\r\n \"catId2\": \"2\",\r\n \"catId3\": \"3\",\r\n \"catId4\": \"4\",\r\n \"catId5\": \"5\",\r\n \"catId6\": \"6\",\r\n \"catId7\": \"7\"\r\n },\r\n \"bookIdPatten\": \"href=\\\"/ibook/(\\\\d+/\\\\d+)/\\\"\",\r\n \"pagePatten\": \"value=\\\"(\\\\d+)/\\\\d+\\\"\",\r\n \"totalPagePatten\": \"value=\\\"\\\\d+/(\\\\d+)\\\"\",\r\n \"bookDetailUrl\": \"https://m.baishuzhai.com/ibook/{bookId}/\",\r\n \"bookNamePatten\": \"([^/]+)\",\r\n \"authorNamePatten\": \">作者:([^/]+)<\",\r\n \"picUrlPatten\": \"]+)\\\"\\\\s+onerror=\\\"this.src=\",\r\n \"statusPatten\": \"状态:([^/]+)\",\r\n \"bookStatusRule\": {\r\n \"连载\": 0,\r\n \"完成\": 1\r\n },\r\n \"scorePatten\": \"([^<]+)\",\r\n \"descStart\": \"

    \",\r\n \"descEnd\": \"

    \",\r\n \"upadateTimePatten\": \"更新:(\\\\d+-\\\\d+-\\\\d+)\",\r\n \"upadateTimeFormatPatten\": \"yy-MM-dd\",\r\n \"bookIndexUrl\": \"https://m.baishuzhai.com/ibook/{bookId}/all.html\",\r\n \"indexIdPatten\": \"[^/]+\",\r\n \"indexNamePatten\": \"([^/]+)\",\r\n \"bookContentUrl\": \"https://baishuzhai.com/ibook/{bookId}/{indexId}.html\",\r\n \"contentStart\": \"id=\\\"content\\\">\",\r\n \"contentEnd\": \"