mirror of
https://github.com/201206030/novel-plus.git
synced 2025-04-26 17:20:52 +00:00
feat: 支持非utf-8编码的网站采集
This commit is contained in:
parent
85b64bbc10
commit
73502a279b
@ -4,22 +4,35 @@ import lombok.extern.slf4j.Slf4j;
|
||||
import org.springframework.http.*;
|
||||
import org.springframework.web.client.RestTemplate;
|
||||
|
||||
import java.nio.charset.Charset;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* @author Administrator
|
||||
*/
|
||||
@Slf4j
|
||||
public class HttpUtil {
|
||||
|
||||
private static final RestTemplate REST_TEMPLATE = RestTemplates.newInstance("utf-8");
|
||||
private static final String DEFAULT_CHARSET = "utf-8";
|
||||
|
||||
public static String getByHttpClientWithChrome(String url) {
|
||||
private static final Map<String, RestTemplate> REST_TEMPLATE_MAP = new HashMap<>();
|
||||
|
||||
public static String getByHttpClientWithChrome(String url, String charset) {
|
||||
log.debug("Get url:{}", url);
|
||||
if (!Charset.isSupported(charset)) {
|
||||
log.error("字符编码{}无效!", charset);
|
||||
return null;
|
||||
}
|
||||
RestTemplate restTemplate = REST_TEMPLATE_MAP.computeIfAbsent(charset,
|
||||
k -> RestTemplates.newInstance(charset));
|
||||
try {
|
||||
log.debug("Get url:{}", url);
|
||||
HttpHeaders headers = new HttpHeaders();
|
||||
headers.add("user-agent",
|
||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.67 Safari/537.36");
|
||||
HttpEntity<String> requestEntity = new HttpEntity<>(null, headers);
|
||||
ResponseEntity<String> forEntity = REST_TEMPLATE.exchange(url, HttpMethod.GET, requestEntity, String.class);
|
||||
ResponseEntity<String> forEntity = restTemplate.exchange(url, HttpMethod.GET, requestEntity,
|
||||
String.class);
|
||||
log.debug("Response code:{}", forEntity.getStatusCode());
|
||||
if (forEntity.getStatusCode() == HttpStatus.OK) {
|
||||
return forEntity.getBody();
|
||||
@ -32,4 +45,8 @@ public class HttpUtil {
|
||||
}
|
||||
}
|
||||
|
||||
public static String getByHttpClientWithChrome(String url) {
|
||||
return getByHttpClientWithChrome(url, DEFAULT_CHARSET);
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -38,7 +38,7 @@ public class CrawlParser {
|
||||
public void parseBook(RuleBean ruleBean, String bookId, CrawlBookHandler handler) {
|
||||
Book book = new Book();
|
||||
String bookDetailUrl = ruleBean.getBookDetailUrl().replace("{bookId}", bookId);
|
||||
String bookDetailHtml = crawlHttpClient.get(bookDetailUrl);
|
||||
String bookDetailHtml = crawlHttpClient.get(bookDetailUrl, ruleBean.getCharset());
|
||||
if (bookDetailHtml != null) {
|
||||
Pattern bookNamePatten = PatternFactory.getPattern(ruleBean.getBookNamePatten());
|
||||
Matcher bookNameMatch = bookNamePatten.matcher(bookDetailHtml);
|
||||
@ -152,7 +152,7 @@ public class CrawlParser {
|
||||
List<BookContent> contentList = new ArrayList<>();
|
||||
//读取目录
|
||||
String indexListUrl = ruleBean.getBookIndexUrl().replace("{bookId}", sourceBookId);
|
||||
String indexListHtml = crawlHttpClient.get(indexListUrl);
|
||||
String indexListHtml = crawlHttpClient.get(indexListUrl, ruleBean.getCharset());
|
||||
|
||||
if (indexListHtml != null) {
|
||||
if (StringUtils.isNotBlank(ruleBean.getBookIndexStart())) {
|
||||
@ -216,7 +216,7 @@ public class CrawlParser {
|
||||
.replace("{indexId}", sourceIndexId);
|
||||
|
||||
//查询章节内容
|
||||
String contentHtml = crawlHttpClient.get(contentUrl);
|
||||
String contentHtml = crawlHttpClient.get(contentUrl, ruleBean.getCharset());
|
||||
if (contentHtml != null && !contentHtml.contains("正在手打中")) {
|
||||
String content = contentHtml.substring(
|
||||
contentHtml.indexOf(ruleBean.getContentStart()) + ruleBean.getContentStart().length());
|
||||
|
@ -1,5 +1,6 @@
|
||||
package com.java2nb.novel.core.crawl;
|
||||
|
||||
import com.java2nb.novel.utils.Constants;
|
||||
import lombok.Data;
|
||||
|
||||
import java.util.Map;
|
||||
@ -12,6 +13,12 @@ import java.util.Map;
|
||||
@Data
|
||||
public class RuleBean {
|
||||
|
||||
/**
|
||||
* 网页字符编码
|
||||
*/
|
||||
private String charset = Constants.CRAWL_DEFAULT_CHARSET;
|
||||
|
||||
|
||||
/**
|
||||
* 小说更新列表url
|
||||
*/
|
||||
|
@ -271,7 +271,7 @@ public class CrawlServiceImpl implements CrawlService {
|
||||
}
|
||||
log.info("catBookListUrl:{}", catBookListUrl);
|
||||
|
||||
String bookListHtml = crawlHttpClient.get(catBookListUrl);
|
||||
String bookListHtml = crawlHttpClient.get(catBookListUrl, ruleBean.getCharset());
|
||||
if (bookListHtml != null) {
|
||||
Pattern bookIdPatten = Pattern.compile(ruleBean.getBookIdPatten());
|
||||
Matcher bookIdMatcher = bookIdPatten.matcher(bookListHtml);
|
||||
|
@ -24,4 +24,9 @@ public class Constants {
|
||||
* 爬取小说http请求失败重试次数
|
||||
*/
|
||||
public static final Integer HTTP_FAIL_RETRY_COUNT = 3;
|
||||
|
||||
/**
|
||||
* 爬虫默认编码
|
||||
*/
|
||||
public static final String CRAWL_DEFAULT_CHARSET = "UTF-8";
|
||||
}
|
||||
|
@ -25,7 +25,7 @@ public class CrawlHttpClient {
|
||||
|
||||
private static final ThreadLocal<Integer> RETRY_COUNT = new ThreadLocal<>();
|
||||
|
||||
public String get(String url) {
|
||||
public String get(String url, String charset) {
|
||||
if (Objects.nonNull(intervalMin) && Objects.nonNull(intervalMax) && intervalMax > intervalMin) {
|
||||
try {
|
||||
Thread.sleep(random.nextInt(intervalMax - intervalMin + 1) + intervalMin);
|
||||
@ -33,22 +33,22 @@ public class CrawlHttpClient {
|
||||
log.error(e.getMessage(), e);
|
||||
}
|
||||
}
|
||||
String body = HttpUtil.getByHttpClientWithChrome(url);
|
||||
String body = HttpUtil.getByHttpClientWithChrome(url, charset);
|
||||
if (Objects.isNull(body) || body.length() < Constants.INVALID_HTML_LENGTH) {
|
||||
return processErrorHttpResult(url);
|
||||
return processErrorHttpResult(url, charset);
|
||||
}
|
||||
//成功获得html内容
|
||||
return body;
|
||||
}
|
||||
|
||||
private String processErrorHttpResult(String url) {
|
||||
private String processErrorHttpResult(String url, String charset) {
|
||||
Integer count = RETRY_COUNT.get();
|
||||
if (count == null) {
|
||||
count = 0;
|
||||
}
|
||||
if (count < Constants.HTTP_FAIL_RETRY_COUNT) {
|
||||
RETRY_COUNT.set(++count);
|
||||
return get(url);
|
||||
return get(url, charset);
|
||||
}
|
||||
RETRY_COUNT.remove();
|
||||
return null;
|
||||
|
@ -54,6 +54,8 @@
|
||||
<!--示例:<b>https://m.xdingdiann.com/sort/0/1.html</b>
|
||||
<li><input type="text" id="updateBookListUrl" class="s_input icon_key"
|
||||
placeholder="小说更新列表url"></li>-->
|
||||
示例:<b>utf-8</b>
|
||||
<li><input type="text" id="charset" class="s_input icon_name" placeholder="网站编码"></li>
|
||||
示例:<b>http://m.xdingdiann.com/sort/{catId}/{page}.html</b> ({catId}代表分类ID,{page}代表分页页码)
|
||||
<li><input type="text" id="bookListUrl" class="s_input icon_key"
|
||||
placeholder="分类列表页URL规则"></li>
|
||||
@ -411,6 +413,11 @@
|
||||
var filterContent = $("#filterContent").val();
|
||||
crawlRule.filterContent = filterContent;
|
||||
|
||||
var charset = $('#charset').val();
|
||||
if (charset) {
|
||||
crawlRule.charset = charset;
|
||||
}
|
||||
|
||||
|
||||
$.ajax({
|
||||
type: "POST",
|
||||
|
@ -55,6 +55,8 @@
|
||||
<!--示例:<b>https://m.xdingdiann.com/sort/0/1.html</b>
|
||||
<li><input type="text" id="updateBookListUrl" class="s_input icon_key"
|
||||
placeholder="小说更新列表url"></li>-->
|
||||
示例:<b>utf-8</b>
|
||||
<li><input type="text" id="charset" class="s_input icon_name" placeholder="网站编码"></li>
|
||||
示例:<b>http://m.xdingdiann.com/sort/{catId}/{page}.html</b> ({catId}代表分类ID,{page}代表分页页码)
|
||||
<li><input type="text" id="bookListUrl" class="s_input icon_key"
|
||||
placeholder="分类列表页URL规则"></li>
|
||||
@ -274,6 +276,7 @@
|
||||
$("#contentStart").val(crawlRule.contentStart);
|
||||
$("#contentEnd").val(crawlRule.contentEnd);
|
||||
$("#filterContent").val(crawlRule.filterContent);
|
||||
$("#charset").val(crawlRule.charset);
|
||||
|
||||
}
|
||||
}
|
||||
@ -496,6 +499,11 @@
|
||||
var filterContent = $("#filterContent").val();
|
||||
crawlRule.filterContent = filterContent;
|
||||
|
||||
var charset = $('#charset').val();
|
||||
if (charset) {
|
||||
crawlRule.charset = charset;
|
||||
}
|
||||
|
||||
|
||||
$.ajax({
|
||||
type: "POST",
|
||||
|
Loading…
x
Reference in New Issue
Block a user