mirror of
https://github.com/201206030/novel-plus.git
synced 2025-06-24 12:46:38 +00:00
1.优化爬虫编写规则,兼容更多网站 2.新增书趣阁书源
This commit is contained in:
@ -3,12 +3,17 @@ package com.java2nb.novel.core.crawl;
|
||||
import com.java2nb.novel.core.utils.HttpUtil;
|
||||
import com.java2nb.novel.core.utils.IdWorker;
|
||||
import com.java2nb.novel.core.utils.RandomBookInfoUtil;
|
||||
import com.java2nb.novel.core.utils.RestTemplateUtil;
|
||||
import com.java2nb.novel.entity.Book;
|
||||
import com.java2nb.novel.entity.BookContent;
|
||||
import com.java2nb.novel.entity.BookIndex;
|
||||
import com.java2nb.novel.utils.Constants;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.extern.slf4j.Slf4j;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.springframework.http.HttpStatus;
|
||||
import org.springframework.http.ResponseEntity;
|
||||
import org.springframework.web.client.RestTemplate;
|
||||
|
||||
import java.text.SimpleDateFormat;
|
||||
import java.util.*;
|
||||
@ -22,17 +27,20 @@ import static java.util.regex.Pattern.compile;
|
||||
*
|
||||
* @author Administrator
|
||||
*/
|
||||
@Slf4j
|
||||
public class CrawlParser {
|
||||
|
||||
public static final Integer BOOK_INDEX_LIST_KEY = 1;
|
||||
|
||||
public static final Integer BOOK_CONTENT_LIST_KEY = 2;
|
||||
|
||||
private static RestTemplate restTemplate = RestTemplateUtil.getInstance("utf-8");
|
||||
|
||||
@SneakyThrows
|
||||
public static Book parseBook(RuleBean ruleBean, String bookId) {
|
||||
Book book = new Book();
|
||||
String bookDetailUrl = ruleBean.getBookDetailUrl().replace("{bookId}", bookId);
|
||||
String bookDetailHtml = HttpUtil.getByHttpClient(bookDetailUrl);
|
||||
String bookDetailHtml = getByHttpClient(bookDetailUrl);
|
||||
if (bookDetailHtml != null) {
|
||||
Pattern bookNamePatten = compile(ruleBean.getBookNamePatten());
|
||||
Matcher bookNameMatch = bookNamePatten.matcher(bookDetailHtml);
|
||||
@ -54,6 +62,9 @@ public class CrawlParser {
|
||||
boolean isFindPicUrl = picUrlMatch.find();
|
||||
if (isFindPicUrl) {
|
||||
String picUrl = picUrlMatch.group(1);
|
||||
if(StringUtils.isNotBlank(picUrl) && StringUtils.isNotBlank(ruleBean.getPicUrlPrefix())) {
|
||||
picUrl = ruleBean.getPicUrlPrefix() + picUrl;
|
||||
}
|
||||
//设置封面图片路径
|
||||
book.setPicUrl(picUrl);
|
||||
}
|
||||
@ -136,7 +147,10 @@ public class CrawlParser {
|
||||
List<BookContent> contentList = new ArrayList<>();
|
||||
//读取目录
|
||||
String indexListUrl = ruleBean.getBookIndexUrl().replace("{bookId}", sourceBookId);
|
||||
String indexListHtml = HttpUtil.getByHttpClient(indexListUrl);
|
||||
String indexListHtml = getByHttpClient(indexListUrl);
|
||||
if(StringUtils.isNotBlank(ruleBean.getBookIndexStart())){
|
||||
indexListHtml = indexListHtml.substring(indexListHtml.indexOf(ruleBean.getBookIndexStart()) + ruleBean.getBookIndexStart().length());
|
||||
}
|
||||
if (indexListHtml != null) {
|
||||
Pattern indexIdPatten = compile(ruleBean.getIndexIdPatten());
|
||||
Matcher indexIdMatch = indexIdPatten.matcher(indexListHtml);
|
||||
@ -162,7 +176,7 @@ public class CrawlParser {
|
||||
String contentUrl = ruleBean.getBookContentUrl().replace("{bookId}", sourceBookId).replace("{indexId}", indexIdMatch.group(1));
|
||||
|
||||
//查询章节内容
|
||||
String contentHtml = HttpUtil.getByHttpClient(contentUrl);
|
||||
String contentHtml = getByHttpClient(contentUrl);
|
||||
if (contentHtml != null) {
|
||||
String content = contentHtml.substring(contentHtml.indexOf(ruleBean.getContentStart()) + ruleBean.getContentStart().length());
|
||||
content = content.substring(0, content.indexOf(ruleBean.getContentEnd()));
|
||||
@ -237,4 +251,25 @@ public class CrawlParser {
|
||||
}
|
||||
|
||||
|
||||
private static String getByHttpClient(String url) {
|
||||
try {
|
||||
ResponseEntity<String> forEntity = restTemplate.getForEntity(url, String.class);
|
||||
if (forEntity.getStatusCode() == HttpStatus.OK) {
|
||||
String body = forEntity.getBody();
|
||||
if(body.length() < Constants.INVALID_HTML_LENGTH){
|
||||
log.debug("获取html页面内容失败");
|
||||
Thread.sleep(10 + new Random().nextInt(60));
|
||||
return getByHttpClient(url);
|
||||
}
|
||||
return body;
|
||||
} else {
|
||||
return null;
|
||||
}
|
||||
} catch (Exception e) {
|
||||
e.printStackTrace();
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
@ -11,6 +11,14 @@ import java.util.Map;
|
||||
@Data
|
||||
public class RuleBean {
|
||||
|
||||
/**
|
||||
* 小说更新列表url
|
||||
* */
|
||||
private String updateBookListUrl;
|
||||
|
||||
/**
|
||||
* 分类列表页URL规则
|
||||
* */
|
||||
private String bookListUrl;
|
||||
|
||||
private Map<String,String> catIdRule;
|
||||
@ -39,4 +47,9 @@ public class RuleBean {
|
||||
private String contentEnd;
|
||||
|
||||
|
||||
private String picUrlPrefix;
|
||||
|
||||
private String bookIndexStart;
|
||||
|
||||
|
||||
}
|
||||
|
@ -14,4 +14,9 @@ public class Constants {
|
||||
* 访问量默认值
|
||||
*/
|
||||
public static final Long VISIT_COUNT_DEFAULT = 100L;
|
||||
|
||||
/**
|
||||
* 爬取小说http请求中无效的内容长度
|
||||
*/
|
||||
public static final int INVALID_HTML_LENGTH = 1000;
|
||||
}
|
||||
|
@ -57,6 +57,9 @@
|
||||
<li><span id="LabErr"></span></li>
|
||||
示例:<b>新顶点小说网</b>
|
||||
<li><input type="text" id="sourceName" class="s_input icon_name" placeholder="源站名"></li>
|
||||
<!--示例:<b>https://m.xdingdiann.com/sort/0/1.html</b>
|
||||
<li><input type="text" id="updateBookListUrl" class="s_input icon_key"
|
||||
placeholder="小说更新列表url"></li>-->
|
||||
示例:<b>http://m.xdingdiann.com/sort/{catId}/{page}.html</b> ({catId}代表分类ID,{page}代表分页页码)
|
||||
<li><input type="text" id="bookListUrl" class="s_input icon_key"
|
||||
placeholder="分类列表页URL规则"></li>
|
||||
@ -95,6 +98,9 @@
|
||||
示例:<b><img src="([^>]+)"\s+onerror="this.src=</b>
|
||||
<li><input type="text" id="picUrlPatten" class="s_input icon_key"
|
||||
placeholder="小说图片路径的正则表达式:"></li>
|
||||
<b>可空,适用于图片路径为相对路径的源站,加上小说图片路径,则为完整的可访问的图片路径</b>
|
||||
<li><input type="text" id="picUrlPrefix" class="s_input icon_key"
|
||||
placeholder="小说图片访问路径前缀:"></li>
|
||||
示例:<b>状态:([^/]+)</li></b>
|
||||
<li><input type="text" id="statusPatten" class="s_input icon_key"
|
||||
placeholder="小说状态的正则表达式:"></li>
|
||||
@ -125,6 +131,9 @@
|
||||
示例:<b>http://m.xdingdiann.com/ddk{bookId}/all.html</b> (bookId代表小说ID)
|
||||
<li><input type="text" id="bookIndexUrl" class="s_input icon_key"
|
||||
placeholder="小说目录页的URL规则:"></li>
|
||||
<b>可空,适用于最新章节列表和全部章节列表在同一个页面的源站</b>
|
||||
<li><input type="text" id="bookIndexStart" class="s_input icon_key"
|
||||
placeholder="小说目录页内容开始截取字符串:"></li>
|
||||
示例:<b><a\s+style=""\s+href="/ddk\d+/(\d+)\.html">[^/]+</a></b>
|
||||
<li><input type="text" id="indexIdPatten" class="s_input icon_key"
|
||||
placeholder="目录页目录ID正则表达式:"></li>
|
||||
@ -278,6 +287,12 @@
|
||||
crawlRule.picUrlPatten = picUrlPatten;
|
||||
}
|
||||
|
||||
var picUrlPrefix = $("#picUrlPrefix").val();
|
||||
|
||||
if (picUrlPrefix.length > 0) {
|
||||
crawlRule.picUrlPrefix = picUrlPrefix;
|
||||
}
|
||||
|
||||
var statusPatten = $("#statusPatten").val();
|
||||
if (statusPatten.length > 0) {
|
||||
crawlRule.statusPatten = statusPatten;
|
||||
@ -345,6 +360,13 @@
|
||||
|
||||
crawlRule.bookIndexUrl = bookIndexUrl;
|
||||
|
||||
|
||||
var bookIndexStart = $("#bookIndexStart").val();
|
||||
|
||||
if (bookIndexStart.length > 0) {
|
||||
crawlRule.bookIndexStart = bookIndexStart;
|
||||
}
|
||||
|
||||
var indexIdPatten = $("#indexIdPatten").val();
|
||||
|
||||
if (indexIdPatten.length == 0) {
|
||||
|
Reference in New Issue
Block a user