mirror of
https://github.com/201206030/novel-plus.git
synced 2025-04-26 17:20:52 +00:00
添加书包网书源
This commit is contained in:
parent
d243739a4e
commit
93c2c8262c
@ -153,66 +153,69 @@ public class CrawlServiceImpl implements CrawlService {
|
||||
while (page <= totalPage) {
|
||||
|
||||
try {
|
||||
//拼接分类URL
|
||||
String catBookListUrl = ruleBean.getBookListUrl()
|
||||
.replace("{catId}", ruleBean.getCatIdRule().get("catId" + catId))
|
||||
.replace("{page}", page + "");
|
||||
|
||||
String bookListHtml = getByHttpClient(catBookListUrl);
|
||||
if (bookListHtml != null) {
|
||||
Pattern bookIdPatten = Pattern.compile(ruleBean.getBookIdPatten());
|
||||
Matcher bookIdMatcher = bookIdPatten.matcher(bookListHtml);
|
||||
boolean isFindBookId = bookIdMatcher.find();
|
||||
while (isFindBookId) {
|
||||
try {
|
||||
String bookId = bookIdMatcher.group(1);
|
||||
Book book = CrawlParser.parseBook(ruleBean, bookId);
|
||||
//这里只做新书入库,查询是否存在这本书
|
||||
Book existBook = bookService.queryBookByBookNameAndAuthorName(book.getBookName(), book.getAuthorName());
|
||||
//如果该小说不存在,则可以解析入库,但是标记该小说正在入库,30分钟之后才允许再次入库
|
||||
if (existBook == null) {
|
||||
//没有该书,可以入库
|
||||
book.setCatId(catId);
|
||||
//根据分类ID查询分类
|
||||
book.setCatName(bookService.queryCatNameByCatId(catId));
|
||||
if (catId == 7) {
|
||||
//女频
|
||||
book.setWorkDirection((byte) 1);
|
||||
if(StringUtils.isNotBlank(ruleBean.getCatIdRule().get("catId" + catId))) {
|
||||
//拼接分类URL
|
||||
String catBookListUrl = ruleBean.getBookListUrl()
|
||||
.replace("{catId}", ruleBean.getCatIdRule().get("catId" + catId))
|
||||
.replace("{page}", page + "");
|
||||
|
||||
String bookListHtml = getByHttpClient(catBookListUrl);
|
||||
if (bookListHtml != null) {
|
||||
Pattern bookIdPatten = Pattern.compile(ruleBean.getBookIdPatten());
|
||||
Matcher bookIdMatcher = bookIdPatten.matcher(bookListHtml);
|
||||
boolean isFindBookId = bookIdMatcher.find();
|
||||
while (isFindBookId) {
|
||||
try {
|
||||
String bookId = bookIdMatcher.group(1);
|
||||
Book book = CrawlParser.parseBook(ruleBean, bookId);
|
||||
//这里只做新书入库,查询是否存在这本书
|
||||
Book existBook = bookService.queryBookByBookNameAndAuthorName(book.getBookName(), book.getAuthorName());
|
||||
//如果该小说不存在,则可以解析入库,但是标记该小说正在入库,30分钟之后才允许再次入库
|
||||
if (existBook == null) {
|
||||
//没有该书,可以入库
|
||||
book.setCatId(catId);
|
||||
//根据分类ID查询分类
|
||||
book.setCatName(bookService.queryCatNameByCatId(catId));
|
||||
if (catId == 7) {
|
||||
//女频
|
||||
book.setWorkDirection((byte) 1);
|
||||
} else {
|
||||
//男频
|
||||
book.setWorkDirection((byte) 0);
|
||||
}
|
||||
book.setCrawlBookId(bookId);
|
||||
book.setCrawlSourceId(sourceId);
|
||||
book.setCrawlLastTime(new Date());
|
||||
book.setId(new IdWorker().nextId());
|
||||
//解析章节目录
|
||||
Map<Integer, List> indexAndContentList = CrawlParser.parseBookIndexAndContent(bookId, book, ruleBean, new HashMap<>(0));
|
||||
|
||||
bookService.saveBookAndIndexAndContent(book, (List<BookIndex>) indexAndContentList.get(CrawlParser.BOOK_INDEX_LIST_KEY), (List<BookContent>) indexAndContentList.get(CrawlParser.BOOK_CONTENT_LIST_KEY));
|
||||
|
||||
} else {
|
||||
//男频
|
||||
book.setWorkDirection((byte) 0);
|
||||
//只更新书籍的爬虫相关字段
|
||||
bookService.updateCrawlProperties(existBook.getId(), sourceId, bookId);
|
||||
}
|
||||
book.setCrawlBookId(bookId);
|
||||
book.setCrawlSourceId(sourceId);
|
||||
book.setCrawlLastTime(new Date());
|
||||
book.setId(new IdWorker().nextId());
|
||||
//解析章节目录
|
||||
Map<Integer, List> indexAndContentList = CrawlParser.parseBookIndexAndContent(bookId,book, ruleBean, new HashMap<>(0));
|
||||
|
||||
bookService.saveBookAndIndexAndContent(book, (List<BookIndex>) indexAndContentList.get(CrawlParser.BOOK_INDEX_LIST_KEY), (List<BookContent>) indexAndContentList.get(CrawlParser.BOOK_CONTENT_LIST_KEY));
|
||||
|
||||
} else {
|
||||
//只更新书籍的爬虫相关字段
|
||||
bookService.updateCrawlProperties(existBook.getId(),sourceId, bookId);
|
||||
} catch (Exception e) {
|
||||
log.error(e.getMessage(), e);
|
||||
}
|
||||
}catch (Exception e){
|
||||
log.error(e.getMessage(),e);
|
||||
|
||||
|
||||
isFindBookId = bookIdMatcher.find();
|
||||
}
|
||||
|
||||
Pattern totalPagePatten = Pattern.compile(ruleBean.getTotalPagePatten());
|
||||
Matcher totalPageMatcher = totalPagePatten.matcher(bookListHtml);
|
||||
boolean isFindTotalPage = totalPageMatcher.find();
|
||||
if (isFindTotalPage) {
|
||||
|
||||
totalPage = Integer.parseInt(totalPageMatcher.group(1));
|
||||
|
||||
}
|
||||
|
||||
|
||||
isFindBookId = bookIdMatcher.find();
|
||||
}
|
||||
|
||||
Pattern totalPagePatten = Pattern.compile(ruleBean.getTotalPagePatten());
|
||||
Matcher totalPageMatcher = totalPagePatten.matcher(bookListHtml);
|
||||
boolean isFindTotalPage = totalPageMatcher.find();
|
||||
if (isFindTotalPage) {
|
||||
|
||||
totalPage = Integer.parseInt(totalPageMatcher.group(1));
|
||||
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
}catch (Exception e){
|
||||
log.error(e.getMessage(),e);
|
||||
|
@ -239,7 +239,7 @@
|
||||
crawlRule.pagePatten = pagePatten;
|
||||
}
|
||||
|
||||
var totalPagePatten = $("#pagePatten").val();
|
||||
var totalPagePatten = $("#totalPagePatten").val();
|
||||
|
||||
if (totalPagePatten.length > 0) {
|
||||
crawlRule.totalPagePatten = totalPagePatten;
|
||||
|
@ -298,17 +298,18 @@ DROP TABLE IF EXISTS `crawl_source`;
|
||||
CREATE TABLE `crawl_source` (
|
||||
`id` int(11) NOT NULL AUTO_INCREMENT COMMENT '主键',
|
||||
`source_name` varchar(50) DEFAULT NULL COMMENT '源站名',
|
||||
`crawl_rule` varchar(2048) DEFAULT NULL COMMENT '爬取规则(json串)',
|
||||
`crawl_rule` text COMMENT '爬取规则(json串)',
|
||||
`source_status` tinyint(1) DEFAULT '0' COMMENT '爬虫源状态,0:关闭,1:开启',
|
||||
`create_time` datetime DEFAULT NULL COMMENT '创建时间',
|
||||
`update_time` datetime DEFAULT NULL COMMENT '更新时间',
|
||||
PRIMARY KEY (`id`)
|
||||
) ENGINE=InnoDB AUTO_INCREMENT=3 DEFAULT CHARSET=utf8mb4 COMMENT='爬虫源表';
|
||||
) ENGINE=InnoDB AUTO_INCREMENT=4 DEFAULT CHARSET=utf8mb4 COMMENT='爬虫源表';
|
||||
|
||||
-- ----------------------------
|
||||
-- Records of crawl_source
|
||||
-- ----------------------------
|
||||
INSERT INTO `crawl_source` VALUES ('2', '百书斋', '{\r\n \"bookListUrl\": \"https://m.baishuzhai.com/blhb/{catId}/{page}.html\",\r\n \"catIdRule\": {\r\n \"catId1\": \"1\",\r\n \"catId2\": \"2\",\r\n \"catId3\": \"3\",\r\n \"catId4\": \"4\",\r\n \"catId5\": \"5\",\r\n \"catId6\": \"6\",\r\n \"catId7\": \"7\"\r\n },\r\n \"bookIdPatten\": \"href=\\\"/ibook/(\\\\d+/\\\\d+)/\\\"\",\r\n \"pagePatten\": \"value=\\\"(\\\\d+)/\\\\d+\\\"\",\r\n \"totalPagePatten\": \"value=\\\"\\\\d+/(\\\\d+)\\\"\",\r\n \"bookDetailUrl\": \"https://m.baishuzhai.com/ibook/{bookId}/\",\r\n \"bookNamePatten\": \"<span class=\\\"title\\\">([^/]+)</span>\",\r\n \"authorNamePatten\": \">作者:([^/]+)<\",\r\n \"picUrlPatten\": \"<img src=\\\"([^>]+)\\\"\\\\s+onerror=\\\"this.src=\",\r\n \"statusPatten\": \"状态:([^/]+)</li>\",\r\n \"bookStatusRule\": {\r\n \"连载\": 0,\r\n \"完成\": 1\r\n },\r\n \"scorePatten\": \"<div\\\\s+class=\\\"score\\\">(\\\\d+\\\\.\\\\d+)分</div>\",\r\n \"descStart\": \"<p class=\\\"review\\\">\",\r\n \"descEnd\": \"</p>\",\r\n \"upadateTimePatten\": \"更新:(\\\\d+-\\\\d+-\\\\d+)</li>\",\r\n \"upadateTimeFormatPatten\": \"yy-MM-dd\",\r\n \"bookIndexUrl\": \"https://m.baishuzhai.com/ibook/{bookId}/all.html\",\r\n \"indexIdPatten\": \"<a\\\\s+style=\\\"\\\"\\\\s+href=\\\"/ibook/\\\\d+/\\\\d+/(\\\\d+)\\\\.html\\\">[^/]+</a>\",\r\n \"indexNamePatten\": \"<a\\\\s+style=\\\"\\\"\\\\s+href=\\\"/ibook/\\\\d+/\\\\d+/\\\\d+\\\\.html\\\">([^/]+)</a>\",\r\n \"bookContentUrl\": \"https://baishuzhai.com/ibook/{bookId}/{indexId}.html\",\r\n \"contentStart\": \"id=\\\"content\\\">\",\r\n \"contentEnd\": \"<script>\"\r\n}', '1', '2020-05-01 14:22:50', '2020-05-01 14:22:50');
|
||||
INSERT INTO `crawl_source` VALUES ('2', '百书斋', '{\r\n \"bookListUrl\": \"https://m.baishuzhai.com/blhb/{catId}/{page}.html\",\r\n \"catIdRule\": {\r\n \"catId1\": \"1\",\r\n \"catId2\": \"2\",\r\n \"catId3\": \"3\",\r\n \"catId4\": \"4\",\r\n \"catId5\": \"5\",\r\n \"catId6\": \"6\",\r\n \"catId7\": \"7\"\r\n },\r\n \"bookIdPatten\": \"href=\\\"/ibook/(\\\\d+/\\\\d+)/\\\"\",\r\n \"pagePatten\": \"value=\\\"(\\\\d+)/\\\\d+\\\"\",\r\n \"totalPagePatten\": \"value=\\\"\\\\d+/(\\\\d+)\\\"\",\r\n \"bookDetailUrl\": \"https://m.baishuzhai.com/ibook/{bookId}/\",\r\n \"bookNamePatten\": \"<span class=\\\"title\\\">([^/]+)</span>\",\r\n \"authorNamePatten\": \">作者:([^/]+)<\",\r\n \"picUrlPatten\": \"<img src=\\\"([^>]+)\\\"\\\\s+onerror=\\\"this.src=\",\r\n \"statusPatten\": \"状态:([^/]+)</li>\",\r\n \"bookStatusRule\": {\r\n \"连载\": 0,\r\n \"完成\": 1\r\n },\r\n \"scorePatten\": \"<em>([^<]+)</em>\",\r\n \"descStart\": \"<p class=\\\"review\\\">\",\r\n \"descEnd\": \"</p>\",\r\n \"upadateTimePatten\": \"更新:(\\\\d+-\\\\d+-\\\\d+)</li>\",\r\n \"upadateTimeFormatPatten\": \"yy-MM-dd\",\r\n \"bookIndexUrl\": \"https://m.baishuzhai.com/ibook/{bookId}/all.html\",\r\n \"indexIdPatten\": \"<a\\\\s+style=\\\"\\\"\\\\s+href=\\\"/ibook/\\\\d+/\\\\d+/(\\\\d+)\\\\.html\\\">[^/]+</a>\",\r\n \"indexNamePatten\": \"<a\\\\s+style=\\\"\\\"\\\\s+href=\\\"/ibook/\\\\d+/\\\\d+/\\\\d+\\\\.html\\\">([^/]+)</a>\",\r\n \"bookContentUrl\": \"https://baishuzhai.com/ibook/{bookId}/{indexId}.html\",\r\n \"contentStart\": \"id=\\\"content\\\">\",\r\n \"contentEnd\": \"<script>\"\r\n}', '0', '2020-05-01 14:22:50', '2020-05-01 14:22:50');
|
||||
INSERT INTO `crawl_source` VALUES ('3', '书包网', '{\r\n \"bookListUrl\": \"https://www.bookbao8.com/booklist-p_{page}-c_{catId}-t_0-o_0.html\",\r\n \"catIdRule\": {\r\n \"catId1\": \"5\",\r\n \"catId2\": \"4\",\r\n \"catId3\": \"8\",\r\n \"catId4\": \"9\",\r\n \"catId5\": \"3\",\r\n \"catId6\": \"7\"\r\n },\r\n \"bookIdPatten\": \"href=\\\"/book/(\\\\d+/\\\\d+/id_[^.]+).html\\\"\",\r\n \"pagePatten\": \"<span\\\\s+class=\\\"current\\\">([^<]+)</span>\",\r\n \"totalPagePatten\": \"/共(\\\\d+)页\",\r\n \"bookDetailUrl\": \"https://www.bookbao8.com/book/{bookId}.html\",\r\n \"bookNamePatten\": \"<div\\\\s+id=\\\"info\\\">\\\\s*<h1>([^<]+)</h1>\",\r\n \"authorNamePatten\": \"<p>作者:<a\\\\s+href=\\\"/Search/[^\\\"]+\\\"\\\\s+target=\\\"_blank\\\">([^<]+)</a></p>\",\r\n \"picUrlPatten\": \"<div\\\\s+id=\\\"fmimg\\\">\\\\s*<img\\\\s+alt=\\\"[^\\\"]+\\\"\\\\s+src=\\\"([^\\\"]+)\\\"\",\r\n \"statusPatten\": \"<p>状态:([^<]+)</p>\",\r\n \"bookStatusRule\": {\r\n \"连载中\": 0,\r\n \"已完结\": 1\r\n },\r\n \"visitCountPatten\": \"<em\\\\s+id=\\\"hits\\\">(\\\\d+)</em>\",\r\n \"descStart\": \"<div class=\\\"infocontent\\\">\",\r\n \"descEnd\": \"</div>\",\r\n \"upadateTimePatten\": \"<p>更新时间:(\\\\d+-\\\\d+-\\\\d+\\\\s\\\\d+:\\\\d+:\\\\d+)</p>\",\r\n \"upadateTimeFormatPatten\": \"yyyy-MM-dd HH:mm:ss\",\r\n \"bookIndexUrl\": \"https://www.bookbao8.com/book/{bookId}.html\",\r\n \"indexIdPatten\": \"<li>\\\\s*<a\\\\s+href=\\\"/views/\\\\d+/\\\\d+/id_[^_]+_(\\\\d+).html\\\"\\\\s+target=\\\"_blank\\\">\",\r\n \"indexNamePatten\": \"<li>\\\\s*<a\\\\s+href=\\\"/views/\\\\d+/\\\\d+/id_[^_]+_\\\\d+.html\\\"\\\\s+target=\\\"_blank\\\">([^<]+)</a>\",\r\n \"bookContentUrl\": \"https://www.bookbao8.com/views/{bookId}_{indexId}.html\",\r\n \"contentStart\": \"<dd id=\\\"contents\\\">\",\r\n \"contentEnd\": \"</dd>\"\r\n}', '0', '2020-05-04 17:42:22', '2020-05-04 17:42:22');
|
||||
|
||||
-- ----------------------------
|
||||
-- Table structure for friend_link
|
||||
|
Loading…
x
Reference in New Issue
Block a user