feat: 支持非utf-8编码的网站采集

This commit is contained in:
xiongxiaoyang 2025-03-14 20:39:57 +08:00
parent 85b64bbc10
commit 73502a279b
8 changed files with 57 additions and 13 deletions

View File

@ -4,22 +4,35 @@ import lombok.extern.slf4j.Slf4j;
import org.springframework.http.*;
import org.springframework.web.client.RestTemplate;
import java.nio.charset.Charset;
import java.util.HashMap;
import java.util.Map;
/**
* @author Administrator
*/
@Slf4j
public class HttpUtil {
private static final RestTemplate REST_TEMPLATE = RestTemplates.newInstance("utf-8");
private static final String DEFAULT_CHARSET = "utf-8";
public static String getByHttpClientWithChrome(String url) {
try {
/**
* Cache of RestTemplate instances, keyed by the charset name each one was created for.
* A ConcurrentHashMap is used so that computeIfAbsent remains atomic if this static
* utility is called from more than one thread; a plain HashMap is not safe under
* concurrent structural modification.
*/
private static final Map<String, RestTemplate> REST_TEMPLATE_MAP = new java.util.concurrent.ConcurrentHashMap<>();
public static String getByHttpClientWithChrome(String url, String charset) {
log.debug("Get url{}", url);
if (!Charset.isSupported(charset)) {
log.error("字符编码{}无效!", charset);
return null;
}
RestTemplate restTemplate = REST_TEMPLATE_MAP.computeIfAbsent(charset,
k -> RestTemplates.newInstance(charset));
try {
HttpHeaders headers = new HttpHeaders();
headers.add("user-agent",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.67 Safari/537.36");
HttpEntity<String> requestEntity = new HttpEntity<>(null, headers);
ResponseEntity<String> forEntity = REST_TEMPLATE.exchange(url, HttpMethod.GET, requestEntity, String.class);
ResponseEntity<String> forEntity = restTemplate.exchange(url, HttpMethod.GET, requestEntity,
String.class);
log.debug("Response code{}", forEntity.getStatusCode());
if (forEntity.getStatusCode() == HttpStatus.OK) {
return forEntity.getBody();
@ -32,4 +45,8 @@ public class HttpUtil {
}
}
/**
 * Fetches the given url using the default charset ({@code DEFAULT_CHARSET}).
 * Convenience overload that delegates to
 * {@code getByHttpClientWithChrome(String, String)}.
 *
 * @param url address to request
 * @return whatever the two-argument overload returns for this url
 */
public static String getByHttpClientWithChrome(String url) {
    return HttpUtil.getByHttpClientWithChrome(url, HttpUtil.DEFAULT_CHARSET);
}
}

View File

@ -38,7 +38,7 @@ public class CrawlParser {
public void parseBook(RuleBean ruleBean, String bookId, CrawlBookHandler handler) {
Book book = new Book();
String bookDetailUrl = ruleBean.getBookDetailUrl().replace("{bookId}", bookId);
String bookDetailHtml = crawlHttpClient.get(bookDetailUrl);
String bookDetailHtml = crawlHttpClient.get(bookDetailUrl, ruleBean.getCharset());
if (bookDetailHtml != null) {
Pattern bookNamePatten = PatternFactory.getPattern(ruleBean.getBookNamePatten());
Matcher bookNameMatch = bookNamePatten.matcher(bookDetailHtml);
@ -152,7 +152,7 @@ public class CrawlParser {
List<BookContent> contentList = new ArrayList<>();
//读取目录
String indexListUrl = ruleBean.getBookIndexUrl().replace("{bookId}", sourceBookId);
String indexListHtml = crawlHttpClient.get(indexListUrl);
String indexListHtml = crawlHttpClient.get(indexListUrl, ruleBean.getCharset());
if (indexListHtml != null) {
if (StringUtils.isNotBlank(ruleBean.getBookIndexStart())) {
@ -216,7 +216,7 @@ public class CrawlParser {
.replace("{indexId}", sourceIndexId);
//查询章节内容
String contentHtml = crawlHttpClient.get(contentUrl);
String contentHtml = crawlHttpClient.get(contentUrl, ruleBean.getCharset());
if (contentHtml != null && !contentHtml.contains("正在手打中")) {
String content = contentHtml.substring(
contentHtml.indexOf(ruleBean.getContentStart()) + ruleBean.getContentStart().length());

View File

@ -1,5 +1,6 @@
package com.java2nb.novel.core.crawl;
import com.java2nb.novel.utils.Constants;
import lombok.Data;
import java.util.Map;
@ -12,6 +13,12 @@ import java.util.Map;
@Data
public class RuleBean {
/**
* Character encoding of the crawled web pages; defaults to Constants.CRAWL_DEFAULT_CHARSET
*/
private String charset = Constants.CRAWL_DEFAULT_CHARSET;
/**
* 小说更新列表url
*/

View File

@ -271,7 +271,7 @@ public class CrawlServiceImpl implements CrawlService {
}
log.info("catBookListUrl{}", catBookListUrl);
String bookListHtml = crawlHttpClient.get(catBookListUrl);
String bookListHtml = crawlHttpClient.get(catBookListUrl, ruleBean.getCharset());
if (bookListHtml != null) {
Pattern bookIdPatten = Pattern.compile(ruleBean.getBookIdPatten());
Matcher bookIdMatcher = bookIdPatten.matcher(bookListHtml);

View File

@ -24,4 +24,9 @@ public class Constants {
* 爬取小说http请求失败重试次数
*/
public static final Integer HTTP_FAIL_RETRY_COUNT = 3;
/**
* Default character encoding used by the crawler
*/
public static final String CRAWL_DEFAULT_CHARSET = "UTF-8";
}

View File

@ -25,7 +25,7 @@ public class CrawlHttpClient {
private static final ThreadLocal<Integer> RETRY_COUNT = new ThreadLocal<>();
public String get(String url) {
public String get(String url, String charset) {
if (Objects.nonNull(intervalMin) && Objects.nonNull(intervalMax) && intervalMax > intervalMin) {
try {
Thread.sleep(random.nextInt(intervalMax - intervalMin + 1) + intervalMin);
@ -33,22 +33,22 @@ public class CrawlHttpClient {
log.error(e.getMessage(), e);
}
}
String body = HttpUtil.getByHttpClientWithChrome(url);
String body = HttpUtil.getByHttpClientWithChrome(url, charset);
if (Objects.isNull(body) || body.length() < Constants.INVALID_HTML_LENGTH) {
return processErrorHttpResult(url);
return processErrorHttpResult(url, charset);
}
//成功获得html内容
return body;
}
private String processErrorHttpResult(String url) {
private String processErrorHttpResult(String url, String charset) {
Integer count = RETRY_COUNT.get();
if (count == null) {
count = 0;
}
if (count < Constants.HTTP_FAIL_RETRY_COUNT) {
RETRY_COUNT.set(++count);
return get(url);
return get(url, charset);
}
RETRY_COUNT.remove();
return null;

View File

@ -54,6 +54,8 @@
<!--示例<b>https://m.xdingdiann.com/sort/0/1.html</b>
<li><input type="text" id="updateBookListUrl" class="s_input icon_key"
placeholder="小说更新列表url"></li>-->
示例<b>utf-8</b>
<li><input type="text" id="charset" class="s_input icon_name" placeholder="网站编码"></li>
示例<b>http://m.xdingdiann.com/sort/{catId}/{page}.html</b> ({catId}代表分类ID{page}代表分页页码)
<li><input type="text" id="bookListUrl" class="s_input icon_key"
placeholder="分类列表页URL规则"></li>
@ -411,6 +413,11 @@
var filterContent = $("#filterContent").val();
crawlRule.filterContent = filterContent;
var charset = $('#charset').val();
if (charset) {
crawlRule.charset = charset;
}
$.ajax({
type: "POST",

View File

@ -55,6 +55,8 @@
<!--示例<b>https://m.xdingdiann.com/sort/0/1.html</b>
<li><input type="text" id="updateBookListUrl" class="s_input icon_key"
placeholder="小说更新列表url"></li>-->
示例<b>utf-8</b>
<li><input type="text" id="charset" class="s_input icon_name" placeholder="网站编码"></li>
示例<b>http://m.xdingdiann.com/sort/{catId}/{page}.html</b> ({catId}代表分类ID{page}代表分页页码)
<li><input type="text" id="bookListUrl" class="s_input icon_key"
placeholder="分类列表页URL规则"></li>
@ -274,6 +276,7 @@
$("#contentStart").val(crawlRule.contentStart);
$("#contentEnd").val(crawlRule.contentEnd);
$("#filterContent").val(crawlRule.filterContent);
$("#charset").val(crawlRule.charset);
}
}
@ -496,6 +499,11 @@
var filterContent = $("#filterContent").val();
crawlRule.filterContent = filterContent;
var charset = $('#charset').val();
if (charset) {
crawlRule.charset = charset;
}
$.ajax({
type: "POST",