mirror of https://github.com/201206030/novel-plus.git
synced 2025-04-27 01:30:51 +00:00

feat: 支持非utf-8编码的网站采集 (support crawling sites that are not UTF-8 encoded)

parent 85b64bbc10
commit 73502a279b

The crawler previously fetched every page through a single hard-coded UTF-8 RestTemplate. This commit threads a per-rule charset through HttpUtil, CrawlHttpClient, CrawlParser and CrawlServiceImpl, adds a charset field to RuleBean with a UTF-8 default defined in Constants, and exposes a 网站编码 (site encoding) input on the crawl-rule pages.
HttpUtil:

@@ -4,22 +4,35 @@ import lombok.extern.slf4j.Slf4j;
 import org.springframework.http.*;
 import org.springframework.web.client.RestTemplate;
 
+import java.nio.charset.Charset;
+import java.util.HashMap;
+import java.util.Map;
+
 /**
  * @author Administrator
  */
 @Slf4j
 public class HttpUtil {
 
-    private static final RestTemplate REST_TEMPLATE = RestTemplates.newInstance("utf-8");
+    private static final String DEFAULT_CHARSET = "utf-8";
 
-    public static String getByHttpClientWithChrome(String url) {
-        try {
+    private static final Map<String, RestTemplate> REST_TEMPLATE_MAP = new HashMap<>();
+
+    public static String getByHttpClientWithChrome(String url, String charset) {
        log.debug("Get url:{}", url);
+        if (!Charset.isSupported(charset)) {
+            log.error("字符编码{}无效!", charset);
+            return null;
+        }
+        RestTemplate restTemplate = REST_TEMPLATE_MAP.computeIfAbsent(charset,
+            k -> RestTemplates.newInstance(charset));
+        try {
            HttpHeaders headers = new HttpHeaders();
            headers.add("user-agent",
                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.67 Safari/537.36");
            HttpEntity<String> requestEntity = new HttpEntity<>(null, headers);
-            ResponseEntity<String> forEntity = REST_TEMPLATE.exchange(url, HttpMethod.GET, requestEntity, String.class);
+            ResponseEntity<String> forEntity = restTemplate.exchange(url, HttpMethod.GET, requestEntity,
+                String.class);
            log.debug("Response code:{}", forEntity.getStatusCode());
            if (forEntity.getStatusCode() == HttpStatus.OK) {
                return forEntity.getBody();
@@ -32,4 +45,8 @@ public class HttpUtil {
        }
    }
 
+    public static String getByHttpClientWithChrome(String url) {
+        return getByHttpClientWithChrome(url, DEFAULT_CHARSET);
+    }
+
 }
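The commit keys a cached RestTemplate per charset via RestTemplates.newInstance(charset), but that factory itself is not part of the diff. As a rough sketch, assuming it does nothing more than register a StringHttpMessageConverter for the requested charset (the body below is an assumption, not the project's actual code):

import org.springframework.http.converter.StringHttpMessageConverter;
import org.springframework.web.client.RestTemplate;

import java.nio.charset.Charset;

// Hypothetical sketch only; the real RestTemplates class is not shown in this commit.
public class RestTemplates {

    public static RestTemplate newInstance(String charset) {
        RestTemplate restTemplate = new RestTemplate();
        // Drop the default String converter (ISO-8859-1) and decode response
        // bodies with the charset configured on the crawl rule instead.
        restTemplate.getMessageConverters().removeIf(c -> c instanceof StringHttpMessageConverter);
        restTemplate.getMessageConverters().add(0, new StringHttpMessageConverter(Charset.forName(charset)));
        return restTemplate;
    }
}

Whatever the real factory looks like, caching one instance per charset in REST_TEMPLATE_MAP keeps the old single-template behaviour for UTF-8 rules while letting GBK or GB2312 sites get a correctly configured template.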
CrawlParser:

@@ -38,7 +38,7 @@ public class CrawlParser {
    public void parseBook(RuleBean ruleBean, String bookId, CrawlBookHandler handler) {
        Book book = new Book();
        String bookDetailUrl = ruleBean.getBookDetailUrl().replace("{bookId}", bookId);
-        String bookDetailHtml = crawlHttpClient.get(bookDetailUrl);
+        String bookDetailHtml = crawlHttpClient.get(bookDetailUrl, ruleBean.getCharset());
        if (bookDetailHtml != null) {
            Pattern bookNamePatten = PatternFactory.getPattern(ruleBean.getBookNamePatten());
            Matcher bookNameMatch = bookNamePatten.matcher(bookDetailHtml);
@@ -152,7 +152,7 @@ public class CrawlParser {
        List<BookContent> contentList = new ArrayList<>();
        //读取目录
        String indexListUrl = ruleBean.getBookIndexUrl().replace("{bookId}", sourceBookId);
-        String indexListHtml = crawlHttpClient.get(indexListUrl);
+        String indexListHtml = crawlHttpClient.get(indexListUrl, ruleBean.getCharset());
 
        if (indexListHtml != null) {
            if (StringUtils.isNotBlank(ruleBean.getBookIndexStart())) {
@@ -216,7 +216,7 @@ public class CrawlParser {
                .replace("{indexId}", sourceIndexId);
 
            //查询章节内容
-            String contentHtml = crawlHttpClient.get(contentUrl);
+            String contentHtml = crawlHttpClient.get(contentUrl, ruleBean.getCharset());
            if (contentHtml != null && !contentHtml.contains("正在手打中")) {
                String content = contentHtml.substring(
                    contentHtml.indexOf(ruleBean.getContentStart()) + ruleBean.getContentStart().length());
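A note on why the charset has to reach this parsing code: markers such as ruleBean.getContentStart() or the "正在手打中" check only match if the page bytes were decoded with the site's real encoding. A plain-JDK illustration, independent of the project:

import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;

public class CharsetMismatchDemo {

    public static void main(String[] args) {
        // Bytes as a GBK-encoded site would send them.
        byte[] gbkBytes = "正在手打中".getBytes(Charset.forName("GBK"));

        // Decoding with the old hard-coded UTF-8 produces mojibake,
        // so contains()/indexOf() checks against Chinese markers fail.
        System.out.println(new String(gbkBytes, StandardCharsets.UTF_8));

        // Decoding with the charset configured on the crawl rule recovers the text.
        System.out.println(new String(gbkBytes, Charset.forName("GBK")));
    }
}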
RuleBean:

@@ -1,5 +1,6 @@
 package com.java2nb.novel.core.crawl;
 
+import com.java2nb.novel.utils.Constants;
 import lombok.Data;
 
 import java.util.Map;
@@ -12,6 +13,12 @@ import java.util.Map;
 @Data
 public class RuleBean {
 
+    /**
+     * 网页字符编码
+     */
+    private String charset = Constants.CRAWL_DEFAULT_CHARSET;
+
+
    /**
     * 小说更新列表url
     */
CrawlServiceImpl:

@@ -271,7 +271,7 @@ public class CrawlServiceImpl implements CrawlService {
            }
            log.info("catBookListUrl:{}", catBookListUrl);
 
-            String bookListHtml = crawlHttpClient.get(catBookListUrl);
+            String bookListHtml = crawlHttpClient.get(catBookListUrl, ruleBean.getCharset());
            if (bookListHtml != null) {
                Pattern bookIdPatten = Pattern.compile(ruleBean.getBookIdPatten());
                Matcher bookIdMatcher = bookIdPatten.matcher(bookListHtml);
Constants:

@@ -24,4 +24,9 @@ public class Constants {
     * 爬取小说http请求失败重试次数
     */
    public static final Integer HTTP_FAIL_RETRY_COUNT = 3;
+
+    /**
+     * 爬虫默认编码
+     */
+    public static final String CRAWL_DEFAULT_CHARSET = "UTF-8";
 }
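To show how the RuleBean default and the Constants value fit together, a small sketch; the GBK value is an illustrative assumption, while the classes and accessors come from this diff (Lombok's @Data generates the getter and setter):

import com.java2nb.novel.core.crawl.RuleBean;

public class CharsetRuleExample {

    public static void main(String[] args) {
        // A rule that never sets a charset keeps the old behaviour:
        // the field is initialised from Constants.CRAWL_DEFAULT_CHARSET.
        RuleBean defaultRule = new RuleBean();
        System.out.println(defaultRule.getCharset()); // UTF-8

        // A rule for a GBK site simply overrides the field.
        RuleBean gbkRule = new RuleBean();
        gbkRule.setCharset("GBK");
        System.out.println(gbkRule.getCharset()); // GBK
    }
}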
CrawlHttpClient:

@@ -25,7 +25,7 @@ public class CrawlHttpClient {
 
    private static final ThreadLocal<Integer> RETRY_COUNT = new ThreadLocal<>();
 
-    public String get(String url) {
+    public String get(String url, String charset) {
        if (Objects.nonNull(intervalMin) && Objects.nonNull(intervalMax) && intervalMax > intervalMin) {
            try {
                Thread.sleep(random.nextInt(intervalMax - intervalMin + 1) + intervalMin);
@@ -33,22 +33,22 @@ public class CrawlHttpClient {
                log.error(e.getMessage(), e);
            }
        }
-        String body = HttpUtil.getByHttpClientWithChrome(url);
+        String body = HttpUtil.getByHttpClientWithChrome(url, charset);
        if (Objects.isNull(body) || body.length() < Constants.INVALID_HTML_LENGTH) {
-            return processErrorHttpResult(url);
+            return processErrorHttpResult(url, charset);
        }
        //成功获得html内容
        return body;
    }
 
-    private String processErrorHttpResult(String url) {
+    private String processErrorHttpResult(String url, String charset) {
        Integer count = RETRY_COUNT.get();
        if (count == null) {
            count = 0;
        }
        if (count < Constants.HTTP_FAIL_RETRY_COUNT) {
            RETRY_COUNT.set(++count);
-            return get(url);
+            return get(url, charset);
        }
        RETRY_COUNT.remove();
        return null;
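On the caller side the change stays mechanical, as the CrawlParser and CrawlServiceImpl hunks above show. A minimal sketch of that calling pattern; the method and class names here are illustrative, and CrawlHttpClient's import is omitted because its package does not appear in this diff:

import com.java2nb.novel.core.crawl.RuleBean;

public class CharsetAwareFetchSketch {

    // Mirrors how CrawlParser now requests a book detail page.
    public static String fetchBookDetail(CrawlHttpClient crawlHttpClient, RuleBean ruleBean, String bookId) {
        String bookDetailUrl = ruleBean.getBookDetailUrl().replace("{bookId}", bookId);
        // The rule's charset rides along on every request; bad responses are
        // retried up to Constants.HTTP_FAIL_RETRY_COUNT times inside the client.
        return crawlHttpClient.get(bookDetailUrl, ruleBean.getCharset());
    }
}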
Crawl-rule form templates (the same charset input and submit handling are added to two pages; the second one, below, additionally pre-fills the field when an existing rule is loaded):

@@ -54,6 +54,8 @@
            <!--示例:<b>https://m.xdingdiann.com/sort/0/1.html</b>
            <li><input type="text" id="updateBookListUrl" class="s_input icon_key"
                       placeholder="小说更新列表url"></li>-->
+            示例:<b>utf-8</b>
+            <li><input type="text" id="charset" class="s_input icon_name" placeholder="网站编码"></li>
            示例:<b>http://m.xdingdiann.com/sort/{catId}/{page}.html</b> ({catId}代表分类ID,{page}代表分页页码)
            <li><input type="text" id="bookListUrl" class="s_input icon_key"
                       placeholder="分类列表页URL规则"></li>
@@ -411,6 +413,11 @@
            var filterContent = $("#filterContent").val();
            crawlRule.filterContent = filterContent;
 
+            var charset = $('#charset').val();
+            if (charset) {
+                crawlRule.charset = charset;
+            }
+
 
            $.ajax({
                type: "POST",
@@ -55,6 +55,8 @@
            <!--示例:<b>https://m.xdingdiann.com/sort/0/1.html</b>
            <li><input type="text" id="updateBookListUrl" class="s_input icon_key"
                       placeholder="小说更新列表url"></li>-->
+            示例:<b>utf-8</b>
+            <li><input type="text" id="charset" class="s_input icon_name" placeholder="网站编码"></li>
            示例:<b>http://m.xdingdiann.com/sort/{catId}/{page}.html</b> ({catId}代表分类ID,{page}代表分页页码)
            <li><input type="text" id="bookListUrl" class="s_input icon_key"
                       placeholder="分类列表页URL规则"></li>
@@ -274,6 +276,7 @@
            $("#contentStart").val(crawlRule.contentStart);
            $("#contentEnd").val(crawlRule.contentEnd);
            $("#filterContent").val(crawlRule.filterContent);
+            $("#charset").val(crawlRule.charset);
 
        }
    }
@@ -496,6 +499,11 @@
            var filterContent = $("#filterContent").val();
            crawlRule.filterContent = filterContent;
 
+            var charset = $('#charset').val();
+            if (charset) {
+                crawlRule.charset = charset;
+            }
+
 
            $.ajax({
                type: "POST",