From 5e1611988090992f386a5540bd589ba4fb39b570 Mon Sep 17 00:00:00 2001 From: xiongxiaoyang <773861846@qq.com> Date: Mon, 18 May 2020 17:08:43 +0800 Subject: [PATCH] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E7=AC=94=E8=B6=A3=E9=98=81?= =?UTF-8?q?=E4=B9=A6=E6=BA=90?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../java2nb/novel/core/crawl/CrawlParser.java | 35 ++++++++++++++----- .../com/java2nb/novel/utils/Constants.java | 5 +++ sql/20200518.sql | 1 + 3 files changed, 32 insertions(+), 9 deletions(-) diff --git a/novel-crawl/src/main/java/com/java2nb/novel/core/crawl/CrawlParser.java b/novel-crawl/src/main/java/com/java2nb/novel/core/crawl/CrawlParser.java index 8ba13ee..76a8188 100644 --- a/novel-crawl/src/main/java/com/java2nb/novel/core/crawl/CrawlParser.java +++ b/novel-crawl/src/main/java/com/java2nb/novel/core/crawl/CrawlParser.java @@ -36,6 +36,8 @@ public class CrawlParser { private static RestTemplate restTemplate = RestTemplateUtil.getInstance("utf-8"); + private static ThreadLocal retryCount = new ThreadLocal<>(); + @SneakyThrows public static Book parseBook(RuleBean ruleBean, String bookId) { Book book = new Book(); @@ -148,10 +150,12 @@ public class CrawlParser { //读取目录 String indexListUrl = ruleBean.getBookIndexUrl().replace("{bookId}", sourceBookId); String indexListHtml = getByHttpClient(indexListUrl); - if(StringUtils.isNotBlank(ruleBean.getBookIndexStart())){ - indexListHtml = indexListHtml.substring(indexListHtml.indexOf(ruleBean.getBookIndexStart()) + ruleBean.getBookIndexStart().length()); - } + if (indexListHtml != null) { + if(StringUtils.isNotBlank(ruleBean.getBookIndexStart())){ + indexListHtml = indexListHtml.substring(indexListHtml.indexOf(ruleBean.getBookIndexStart()) + ruleBean.getBookIndexStart().length()); + } + Pattern indexIdPatten = compile(ruleBean.getIndexIdPatten()); Matcher indexIdMatch = indexIdPatten.matcher(indexListHtml); @@ -177,7 +181,7 @@ public class CrawlParser { //查询章节内容 String contentHtml = getByHttpClient(contentUrl); - if (contentHtml != null) { + if (contentHtml != null && !contentHtml.contains("正在手打中")) { String content = contentHtml.substring(contentHtml.indexOf(ruleBean.getContentStart()) + ruleBean.getContentStart().length()); content = content.substring(0, content.indexOf(ruleBean.getContentEnd())); //TODO插入章节目录和章节内容 @@ -259,17 +263,30 @@ public class CrawlParser { log.debug("body长度:"+body.length()); if(body.length() < Constants.INVALID_HTML_LENGTH){ log.debug("获取html页面内容失败"); - Thread.sleep( new Random().nextInt(10*1000)); - return getByHttpClient(url); + return processErrorHttpResult(url); } + //成功获得html内容 return body; - } else { - return null; } } catch (Exception e) { e.printStackTrace(); - return null; } + return processErrorHttpResult(url); + + } + + @SneakyThrows + private static String processErrorHttpResult(String url){ + Integer count = retryCount.get(); + if(count == null){ + count = 0; + } + if(count < Constants.HTTP_FAIL_RETRY_COUNT){ + Thread.sleep( new Random().nextInt(10*1000)); + retryCount.set(++count); + return getByHttpClient(url); + } + return null; } diff --git a/novel-crawl/src/main/java/com/java2nb/novel/utils/Constants.java b/novel-crawl/src/main/java/com/java2nb/novel/utils/Constants.java index 753ca11..d278c19 100644 --- a/novel-crawl/src/main/java/com/java2nb/novel/utils/Constants.java +++ b/novel-crawl/src/main/java/com/java2nb/novel/utils/Constants.java @@ -19,4 +19,9 @@ public class Constants { * 爬取小说http请求中无效的内容长度 */ public static final int INVALID_HTML_LENGTH = 1500; + + /** + * 爬取小说http请求失败重试次数 + */ + public static final Integer HTTP_FAIL_RETRY_COUNT = 5; } diff --git a/sql/20200518.sql b/sql/20200518.sql index c5fe46d..320b348 100644 --- a/sql/20200518.sql +++ b/sql/20200518.sql @@ -1 +1,2 @@ INSERT INTO `novel_plus`.`crawl_source` (`id`, `source_name`, `crawl_rule`, `source_status`, `create_time`, `update_time`) VALUES ('4', '书趣阁', '{\r\n \"bookListUrl\": \"http://m.shuquge.com/sort/{catId}/0_{page}.html\",\r\n \"catIdRule\": {\r\n \"catId1\": \"1\",\r\n \"catId2\": \"2\",\r\n \"catId3\": \"3\",\r\n \"catId4\": \"4\",\r\n \"catId5\": \"7\",\r\n \"catId6\": \"6\",\r\n \"catId7\": \"8\"\r\n },\r\n \"bookIdPatten\": \"href=\\\"/s/(\\\\d+)\\\\.html\\\"\",\r\n \"pagePatten\": \"第(\\\\d+)/\\\\d+页\",\r\n \"totalPagePatten\": \"第\\\\d+/(\\\\d+)页\",\r\n \"bookDetailUrl\": \"http://m.shuquge.com/s/{bookId}.html\",\r\n \"bookNamePatten\": \"

([^/]+)

\",\r\n \"authorNamePatten\": \"

作者:([^/]+)

\",\r\n \"picUrlPatten\": \"src=\\\"(http://www.shuquge.com/files/article/image/\\\\d+/\\\\d+/\\\\d+s\\\\.jpg)\\\"\",\r\n \"statusPatten\": \"

状态:([^/]+)

\",\r\n \"bookStatusRule\": {\r\n \"连载中\": 0,\r\n \"完本\": 1\r\n },\r\n \"descStart\": \"
\",\r\n \"descEnd\": \"最新章节推荐地址\",\r\n \"bookIndexUrl\": \"http://www.shuquge.com/txt/{bookId}/index.html\",\r\n \"bookIndexStart\": \"》正文\",\r\n \"indexIdPatten\": \"
[^/]+
\",\r\n \"indexNamePatten\": \"
([^/]+)
\",\r\n \"bookContentUrl\": \"http://www.shuquge.com/txt/{bookId}/{indexId}.html\",\r\n \"contentStart\": \"
\",\r\n \"contentEnd\": \"http://www.shuquge.com\"\r\n}', '1', '2020-05-18 12:02:34', '2020-05-18 12:02:34'); +INSERT INTO `novel_plus`.`crawl_source` (`id`, `source_name`, `crawl_rule`, `source_status`, `create_time`, `update_time`) VALUES ('5', '笔趣阁', '{\"bookListUrl\":\"http://m.mcmssc.com/xclass/{catId}/{page}.html\",\"catIdRule\":{\"catId1\":\"1\",\"catId2\":\"2\",\"catId3\":\"3\",\"catId4\":\"4\",\"catId5\":\"5\",\"catId6\":\"6\",\"catId7\":\"7\"},\"bookIdPatten\":\"href=\\\"/(\\\\d+_\\\\d+)/\\\"\",\"pagePatten\":\"class=\\\"page_txt\\\"\\\\s+value=\\\"(\\\\d+)/\\\\d+\\\"\\\\s+size=\",\"totalPagePatten\":\"class=\\\"page_txt\\\"\\\\s+value=\\\"\\\\d+/(\\\\d+)\\\"\\\\s+size=\",\"bookDetailUrl\":\"http://m.mcmssc.com/{bookId}/\",\"bookNamePatten\":\"([^/]+)\",\"authorNamePatten\":\"([^/]+)\",\"picUrlPatten\":\"]+)\\\"\\\\s+onerror=\",\"picUrlPrefix\":\"http://m.mcmssc.com/\",\"statusPatten\":\">状态:([^/]+)<\",\"bookStatusRule\":{\"连载\":0,\"全本\":1},\"visitCountPatten\":\">点击:(\\\\d+)<\",\"descStart\":\"

\",\"descEnd\":\"

\",\"bookIndexUrl\":\"http://m.mcmssc.com/{bookId}/all.html\",\"indexIdPatten\":\"[^/]+\",\"indexNamePatten\":\"([^/]+)\",\"bookContentUrl\":\"http://www.mcmssc.com/{bookId}/{indexId}.html\",\"contentStart\":\"

\",\"contentEnd\":\"
\"}', '1', '2020-05-18 15:57:41', '2020-05-18 15:57:41');