mirror of
https://github.com/201206030/novel-plus.git
synced 2025-04-27 01:30:51 +00:00
增加笔趣阁书源
This commit is contained in:
parent
231b94f1da
commit
5e16119880
novel-crawl/src/main/java/com/java2nb/novel
sql
@ -36,6 +36,8 @@ public class CrawlParser {
|
||||
|
||||
private static RestTemplate restTemplate = RestTemplateUtil.getInstance("utf-8");
|
||||
|
||||
private static ThreadLocal <Integer> retryCount = new ThreadLocal<>();
|
||||
|
||||
@SneakyThrows
|
||||
public static Book parseBook(RuleBean ruleBean, String bookId) {
|
||||
Book book = new Book();
|
||||
@ -148,10 +150,12 @@ public class CrawlParser {
|
||||
//读取目录
|
||||
String indexListUrl = ruleBean.getBookIndexUrl().replace("{bookId}", sourceBookId);
|
||||
String indexListHtml = getByHttpClient(indexListUrl);
|
||||
if(StringUtils.isNotBlank(ruleBean.getBookIndexStart())){
|
||||
indexListHtml = indexListHtml.substring(indexListHtml.indexOf(ruleBean.getBookIndexStart()) + ruleBean.getBookIndexStart().length());
|
||||
}
|
||||
|
||||
if (indexListHtml != null) {
|
||||
if(StringUtils.isNotBlank(ruleBean.getBookIndexStart())){
|
||||
indexListHtml = indexListHtml.substring(indexListHtml.indexOf(ruleBean.getBookIndexStart()) + ruleBean.getBookIndexStart().length());
|
||||
}
|
||||
|
||||
Pattern indexIdPatten = compile(ruleBean.getIndexIdPatten());
|
||||
Matcher indexIdMatch = indexIdPatten.matcher(indexListHtml);
|
||||
|
||||
@ -177,7 +181,7 @@ public class CrawlParser {
|
||||
|
||||
//查询章节内容
|
||||
String contentHtml = getByHttpClient(contentUrl);
|
||||
if (contentHtml != null) {
|
||||
if (contentHtml != null && !contentHtml.contains("正在手打中")) {
|
||||
String content = contentHtml.substring(contentHtml.indexOf(ruleBean.getContentStart()) + ruleBean.getContentStart().length());
|
||||
content = content.substring(0, content.indexOf(ruleBean.getContentEnd()));
|
||||
//TODO插入章节目录和章节内容
|
||||
@ -259,17 +263,30 @@ public class CrawlParser {
|
||||
log.debug("body长度:"+body.length());
|
||||
if(body.length() < Constants.INVALID_HTML_LENGTH){
|
||||
log.debug("获取html页面内容失败");
|
||||
Thread.sleep( new Random().nextInt(10*1000));
|
||||
return getByHttpClient(url);
|
||||
return processErrorHttpResult(url);
|
||||
}
|
||||
//成功获得html内容
|
||||
return body;
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
} catch (Exception e) {
|
||||
e.printStackTrace();
|
||||
return null;
|
||||
}
|
||||
return processErrorHttpResult(url);
|
||||
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
private static String processErrorHttpResult(String url){
|
||||
Integer count = retryCount.get();
|
||||
if(count == null){
|
||||
count = 0;
|
||||
}
|
||||
if(count < Constants.HTTP_FAIL_RETRY_COUNT){
|
||||
Thread.sleep( new Random().nextInt(10*1000));
|
||||
retryCount.set(++count);
|
||||
return getByHttpClient(url);
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
|
||||
|
@ -19,4 +19,9 @@ public class Constants {
|
||||
* 爬取小说http请求中无效的内容长度
|
||||
*/
|
||||
public static final int INVALID_HTML_LENGTH = 1500;
|
||||
|
||||
/**
|
||||
* Number of times a failed novel-crawling HTTP request is retried.
|
||||
*/
|
||||
public static final Integer HTTP_FAIL_RETRY_COUNT = 5;
|
||||
}
|
||||
|
@ -1 +1,2 @@
|
||||
INSERT INTO `novel_plus`.`crawl_source` (`id`, `source_name`, `crawl_rule`, `source_status`, `create_time`, `update_time`) VALUES ('4', '书趣阁', '{\r\n \"bookListUrl\": \"http://m.shuquge.com/sort/{catId}/0_{page}.html\",\r\n \"catIdRule\": {\r\n \"catId1\": \"1\",\r\n \"catId2\": \"2\",\r\n \"catId3\": \"3\",\r\n \"catId4\": \"4\",\r\n \"catId5\": \"7\",\r\n \"catId6\": \"6\",\r\n \"catId7\": \"8\"\r\n },\r\n \"bookIdPatten\": \"href=\\\"/s/(\\\\d+)\\\\.html\\\"\",\r\n \"pagePatten\": \"第(\\\\d+)/\\\\d+页\",\r\n \"totalPagePatten\": \"第\\\\d+/(\\\\d+)页\",\r\n \"bookDetailUrl\": \"http://m.shuquge.com/s/{bookId}.html\",\r\n \"bookNamePatten\": \"<a\\\\s+href=\\\"/s/\\\\d+\\\\.html\\\"><h2>([^/]+)</h2></a>\",\r\n \"authorNamePatten\": \"<p>作者:([^/]+)</p>\",\r\n \"picUrlPatten\": \"src=\\\"(http://www.shuquge.com/files/article/image/\\\\d+/\\\\d+/\\\\d+s\\\\.jpg)\\\"\",\r\n \"statusPatten\": \"<p>状态:([^/]+)</p>\",\r\n \"bookStatusRule\": {\r\n \"连载中\": 0,\r\n \"完本\": 1\r\n },\r\n \"descStart\": \"<div class=\\\"intro_info\\\">\",\r\n \"descEnd\": \"最新章节推荐地址\",\r\n \"bookIndexUrl\": \"http://www.shuquge.com/txt/{bookId}/index.html\",\r\n \"bookIndexStart\": \"》正文\",\r\n \"indexIdPatten\": \"<dd><a\\\\s+href=\\\"(\\\\d+)\\\\.html\\\">[^/]+</a></dd>\",\r\n \"indexNamePatten\": \"<dd><a\\\\s+href=\\\"\\\\d+\\\\.html\\\">([^/]+)</a></dd>\",\r\n \"bookContentUrl\": \"http://www.shuquge.com/txt/{bookId}/{indexId}.html\",\r\n \"contentStart\": \"<div id=\\\"content\\\" class=\\\"showtxt\\\">\",\r\n \"contentEnd\": \"http://www.shuquge.com\"\r\n}', '1', '2020-05-18 12:02:34', '2020-05-18 12:02:34');
|
||||
INSERT INTO `novel_plus`.`crawl_source` (`id`, `source_name`, `crawl_rule`, `source_status`, `create_time`, `update_time`) VALUES ('5', '笔趣阁', '{\"bookListUrl\":\"http://m.mcmssc.com/xclass/{catId}/{page}.html\",\"catIdRule\":{\"catId1\":\"1\",\"catId2\":\"2\",\"catId3\":\"3\",\"catId4\":\"4\",\"catId5\":\"5\",\"catId6\":\"6\",\"catId7\":\"7\"},\"bookIdPatten\":\"href=\\\"/(\\\\d+_\\\\d+)/\\\"\",\"pagePatten\":\"class=\\\"page_txt\\\"\\\\s+value=\\\"(\\\\d+)/\\\\d+\\\"\\\\s+size=\",\"totalPagePatten\":\"class=\\\"page_txt\\\"\\\\s+value=\\\"\\\\d+/(\\\\d+)\\\"\\\\s+size=\",\"bookDetailUrl\":\"http://m.mcmssc.com/{bookId}/\",\"bookNamePatten\":\"<span\\\\s+class=\\\"title\\\">([^/]+)</span>\",\"authorNamePatten\":\"<a\\\\s+href=\\\"/author/\\\\d+/\\\">([^/]+)</a>\",\"picUrlPatten\":\"<img\\\\s+src=\\\"([^>]+)\\\"\\\\s+onerror=\",\"picUrlPrefix\":\"http://m.mcmssc.com/\",\"statusPatten\":\">状态:([^/]+)<\",\"bookStatusRule\":{\"连载\":0,\"全本\":1},\"visitCountPatten\":\">点击:(\\\\d+)<\",\"descStart\":\"<p class=\\\"review\\\">\",\"descEnd\":\"</p>\",\"bookIndexUrl\":\"http://m.mcmssc.com/{bookId}/all.html\",\"indexIdPatten\":\"<a\\\\s+href=\\\"/\\\\d+_\\\\d+/(\\\\d+)\\\\.html\\\">[^/]+</a>\",\"indexNamePatten\":\"<a\\\\s+href=\\\"/\\\\d+_\\\\d+/\\\\d+\\\\.html\\\">([^/]+)</a>\",\"bookContentUrl\":\"http://www.mcmssc.com/{bookId}/{indexId}.html\",\"contentStart\":\"</p>\",\"contentEnd\":\"<div align=\\\"center\\\">\"}', '1', '2020-05-18 15:57:41', '2020-05-18 15:57:41');
|
||||
|
Loading…
x
Reference in New Issue
Block a user