From 93c2c8262c8ab8c89cdd4936997948deb15a7914 Mon Sep 17 00:00:00 2001 From: xxy <773861846@qq.com> Date: Mon, 4 May 2020 18:40:38 +0800 Subject: [PATCH] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E4=B9=A6=E5=8C=85=E7=BD=91?= =?UTF-8?q?=E4=B9=A6=E6=BA=90?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../novel/service/impl/CrawlServiceImpl.java | 107 +++++++++--------- .../templates/crawl/crawlSource_add.html | 2 +- sql/novel_plus.sql | 7 +- 3 files changed, 60 insertions(+), 56 deletions(-) diff --git a/novel-crawl/src/main/java/com/java2nb/novel/service/impl/CrawlServiceImpl.java b/novel-crawl/src/main/java/com/java2nb/novel/service/impl/CrawlServiceImpl.java index 1c6e0b3..36d90bc 100644 --- a/novel-crawl/src/main/java/com/java2nb/novel/service/impl/CrawlServiceImpl.java +++ b/novel-crawl/src/main/java/com/java2nb/novel/service/impl/CrawlServiceImpl.java @@ -153,66 +153,69 @@ public class CrawlServiceImpl implements CrawlService { while (page <= totalPage) { try { - //拼接分类URL - String catBookListUrl = ruleBean.getBookListUrl() - .replace("{catId}", ruleBean.getCatIdRule().get("catId" + catId)) - .replace("{page}", page + ""); - String bookListHtml = getByHttpClient(catBookListUrl); - if (bookListHtml != null) { - Pattern bookIdPatten = Pattern.compile(ruleBean.getBookIdPatten()); - Matcher bookIdMatcher = bookIdPatten.matcher(bookListHtml); - boolean isFindBookId = bookIdMatcher.find(); - while (isFindBookId) { - try { - String bookId = bookIdMatcher.group(1); - Book book = CrawlParser.parseBook(ruleBean, bookId); - //这里只做新书入库,查询是否存在这本书 - Book existBook = bookService.queryBookByBookNameAndAuthorName(book.getBookName(), book.getAuthorName()); - //如果该小说不存在,则可以解析入库,但是标记该小说正在入库,30分钟之后才允许再次入库 - if (existBook == null) { - //没有该书,可以入库 - book.setCatId(catId); - //根据分类ID查询分类 - book.setCatName(bookService.queryCatNameByCatId(catId)); - if (catId == 7) { - //女频 - book.setWorkDirection((byte) 1); + if(StringUtils.isNotBlank(ruleBean.getCatIdRule().get("catId" + catId))) { + //拼接分类URL + String catBookListUrl = ruleBean.getBookListUrl() + .replace("{catId}", ruleBean.getCatIdRule().get("catId" + catId)) + .replace("{page}", page + ""); + + String bookListHtml = getByHttpClient(catBookListUrl); + if (bookListHtml != null) { + Pattern bookIdPatten = Pattern.compile(ruleBean.getBookIdPatten()); + Matcher bookIdMatcher = bookIdPatten.matcher(bookListHtml); + boolean isFindBookId = bookIdMatcher.find(); + while (isFindBookId) { + try { + String bookId = bookIdMatcher.group(1); + Book book = CrawlParser.parseBook(ruleBean, bookId); + //这里只做新书入库,查询是否存在这本书 + Book existBook = bookService.queryBookByBookNameAndAuthorName(book.getBookName(), book.getAuthorName()); + //如果该小说不存在,则可以解析入库,但是标记该小说正在入库,30分钟之后才允许再次入库 + if (existBook == null) { + //没有该书,可以入库 + book.setCatId(catId); + //根据分类ID查询分类 + book.setCatName(bookService.queryCatNameByCatId(catId)); + if (catId == 7) { + //女频 + book.setWorkDirection((byte) 1); + } else { + //男频 + book.setWorkDirection((byte) 0); + } + book.setCrawlBookId(bookId); + book.setCrawlSourceId(sourceId); + book.setCrawlLastTime(new Date()); + book.setId(new IdWorker().nextId()); + //解析章节目录 + Map indexAndContentList = CrawlParser.parseBookIndexAndContent(bookId, book, ruleBean, new HashMap<>(0)); + + bookService.saveBookAndIndexAndContent(book, (List) indexAndContentList.get(CrawlParser.BOOK_INDEX_LIST_KEY), (List) indexAndContentList.get(CrawlParser.BOOK_CONTENT_LIST_KEY)); + } else { - //男频 - book.setWorkDirection((byte) 0); + //只更新书籍的爬虫相关字段 + bookService.updateCrawlProperties(existBook.getId(), sourceId, bookId); } - book.setCrawlBookId(bookId); - book.setCrawlSourceId(sourceId); - book.setCrawlLastTime(new Date()); - book.setId(new IdWorker().nextId()); - //解析章节目录 - Map indexAndContentList = CrawlParser.parseBookIndexAndContent(bookId,book, ruleBean, new HashMap<>(0)); - - bookService.saveBookAndIndexAndContent(book, (List) indexAndContentList.get(CrawlParser.BOOK_INDEX_LIST_KEY), (List) indexAndContentList.get(CrawlParser.BOOK_CONTENT_LIST_KEY)); - - } else { - //只更新书籍的爬虫相关字段 - bookService.updateCrawlProperties(existBook.getId(),sourceId, bookId); + } catch (Exception e) { + log.error(e.getMessage(), e); } - }catch (Exception e){ - log.error(e.getMessage(),e); + + + isFindBookId = bookIdMatcher.find(); + } + + Pattern totalPagePatten = Pattern.compile(ruleBean.getTotalPagePatten()); + Matcher totalPageMatcher = totalPagePatten.matcher(bookListHtml); + boolean isFindTotalPage = totalPageMatcher.find(); + if (isFindTotalPage) { + + totalPage = Integer.parseInt(totalPageMatcher.group(1)); + } - isFindBookId = bookIdMatcher.find(); } - - Pattern totalPagePatten = Pattern.compile(ruleBean.getTotalPagePatten()); - Matcher totalPageMatcher = totalPagePatten.matcher(bookListHtml); - boolean isFindTotalPage = totalPageMatcher.find(); - if (isFindTotalPage) { - - totalPage = Integer.parseInt(totalPageMatcher.group(1)); - - } - - } }catch (Exception e){ log.error(e.getMessage(),e); diff --git a/novel-crawl/src/main/resources/templates/crawl/crawlSource_add.html b/novel-crawl/src/main/resources/templates/crawl/crawlSource_add.html index 0fb6eb3..5af6e69 100644 --- a/novel-crawl/src/main/resources/templates/crawl/crawlSource_add.html +++ b/novel-crawl/src/main/resources/templates/crawl/crawlSource_add.html @@ -239,7 +239,7 @@ crawlRule.pagePatten = pagePatten; } - var totalPagePatten = $("#pagePatten").val(); + var totalPagePatten = $("#totalPagePatten").val(); if (totalPagePatten.length > 0) { crawlRule.totalPagePatten = totalPagePatten; diff --git a/sql/novel_plus.sql b/sql/novel_plus.sql index 372d0b5..86d510c 100644 --- a/sql/novel_plus.sql +++ b/sql/novel_plus.sql @@ -298,17 +298,18 @@ DROP TABLE IF EXISTS `crawl_source`; CREATE TABLE `crawl_source` ( `id` int(11) NOT NULL AUTO_INCREMENT COMMENT '主键', `source_name` varchar(50) DEFAULT NULL COMMENT '源站名', - `crawl_rule` varchar(2048) DEFAULT NULL COMMENT '爬取规则(json串)', + `crawl_rule` text COMMENT '爬取规则(json串)', `source_status` tinyint(1) DEFAULT '0' COMMENT '爬虫源状态,0:关闭,1:开启', `create_time` datetime DEFAULT NULL COMMENT '创建时间', `update_time` datetime DEFAULT NULL COMMENT '更新时间', PRIMARY KEY (`id`) -) ENGINE=InnoDB AUTO_INCREMENT=3 DEFAULT CHARSET=utf8mb4 COMMENT='爬虫源表'; +) ENGINE=InnoDB AUTO_INCREMENT=4 DEFAULT CHARSET=utf8mb4 COMMENT='爬虫源表'; -- ---------------------------- -- Records of crawl_source -- ---------------------------- -INSERT INTO `crawl_source` VALUES ('2', '百书斋', '{\r\n \"bookListUrl\": \"https://m.baishuzhai.com/blhb/{catId}/{page}.html\",\r\n \"catIdRule\": {\r\n \"catId1\": \"1\",\r\n \"catId2\": \"2\",\r\n \"catId3\": \"3\",\r\n \"catId4\": \"4\",\r\n \"catId5\": \"5\",\r\n \"catId6\": \"6\",\r\n \"catId7\": \"7\"\r\n },\r\n \"bookIdPatten\": \"href=\\\"/ibook/(\\\\d+/\\\\d+)/\\\"\",\r\n \"pagePatten\": \"value=\\\"(\\\\d+)/\\\\d+\\\"\",\r\n \"totalPagePatten\": \"value=\\\"\\\\d+/(\\\\d+)\\\"\",\r\n \"bookDetailUrl\": \"https://m.baishuzhai.com/ibook/{bookId}/\",\r\n \"bookNamePatten\": \"([^/]+)\",\r\n \"authorNamePatten\": \">作者:([^/]+)<\",\r\n \"picUrlPatten\": \"]+)\\\"\\\\s+onerror=\\\"this.src=\",\r\n \"statusPatten\": \"状态:([^/]+)\",\r\n \"bookStatusRule\": {\r\n \"连载\": 0,\r\n \"完成\": 1\r\n },\r\n \"scorePatten\": \"(\\\\d+\\\\.\\\\d+)分\",\r\n \"descStart\": \"

\",\r\n \"descEnd\": \"

\",\r\n \"upadateTimePatten\": \"更新:(\\\\d+-\\\\d+-\\\\d+)\",\r\n \"upadateTimeFormatPatten\": \"yy-MM-dd\",\r\n \"bookIndexUrl\": \"https://m.baishuzhai.com/ibook/{bookId}/all.html\",\r\n \"indexIdPatten\": \"[^/]+\",\r\n \"indexNamePatten\": \"([^/]+)\",\r\n \"bookContentUrl\": \"https://baishuzhai.com/ibook/{bookId}/{indexId}.html\",\r\n \"contentStart\": \"id=\\\"content\\\">\",\r\n \"contentEnd\": \"