feat: support crawling websites with non-UTF-8 encodings

xiongxiaoyang
2025-03-14 20:39:57 +08:00
parent 85b64bbc10
commit 73502a279b
8 changed files with 57 additions and 13 deletions

View File

@@ -38,7 +38,7 @@ public class CrawlParser {
public void parseBook(RuleBean ruleBean, String bookId, CrawlBookHandler handler) {
Book book = new Book();
String bookDetailUrl = ruleBean.getBookDetailUrl().replace("{bookId}", bookId);
-String bookDetailHtml = crawlHttpClient.get(bookDetailUrl);
+String bookDetailHtml = crawlHttpClient.get(bookDetailUrl, ruleBean.getCharset());
if (bookDetailHtml != null) {
Pattern bookNamePatten = PatternFactory.getPattern(ruleBean.getBookNamePatten());
Matcher bookNameMatch = bookNamePatten.matcher(bookDetailHtml);
@@ -152,7 +152,7 @@ public class CrawlParser {
List<BookContent> contentList = new ArrayList<>();
//Read the chapter list
String indexListUrl = ruleBean.getBookIndexUrl().replace("{bookId}", sourceBookId);
-String indexListHtml = crawlHttpClient.get(indexListUrl);
+String indexListHtml = crawlHttpClient.get(indexListUrl, ruleBean.getCharset());
if (indexListHtml != null) {
if (StringUtils.isNotBlank(ruleBean.getBookIndexStart())) {
@@ -216,7 +216,7 @@
.replace("{indexId}", sourceIndexId);
//Fetch the chapter content
-String contentHtml = crawlHttpClient.get(contentUrl);
+String contentHtml = crawlHttpClient.get(contentUrl, ruleBean.getCharset());
if (contentHtml != null && !contentHtml.contains("正在手打中")) {
String content = contentHtml.substring(
contentHtml.indexOf(ruleBean.getContentStart()) + ruleBean.getContentStart().length());
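All three fetches in CrawlParser (book detail, chapter index, chapter content) now forward the rule's charset to CrawlHttpClient, so a rule can describe a GBK or GB2312 source without garbling the HTML. A minimal sketch of how such a rule would drive the new two-argument get(); the URL pattern, book id, and wrapper method below are illustrative only, and imports are omitted because the file paths are not shown in this view:

```java
// Illustrative sketch: a crawl rule for a site served as GBK instead of UTF-8.
// The URL pattern and book id are made-up placeholders, not values from this commit.
static String fetchDetail(CrawlHttpClient crawlHttpClient) {
    RuleBean ruleBean = new RuleBean();
    ruleBean.setCharset("GBK"); // overrides the UTF-8 default from Constants.CRAWL_DEFAULT_CHARSET
    ruleBean.setBookDetailUrl("https://example.com/book/{bookId}.html");

    String bookDetailUrl = ruleBean.getBookDetailUrl().replace("{bookId}", "12345");
    // Same call shape as parseBook(): the rule's charset travels with the request.
    return crawlHttpClient.get(bookDetailUrl, ruleBean.getCharset());
}
```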

View File

@@ -1,5 +1,6 @@
package com.java2nb.novel.core.crawl;
+import com.java2nb.novel.utils.Constants;
import lombok.Data;
import java.util.Map;
@@ -12,6 +13,12 @@ import java.util.Map;
@Data
public class RuleBean {
+/**
+* Character encoding of the target web pages
+*/
+private String charset = Constants.CRAWL_DEFAULT_CHARSET;
/**
* Novel update list URL
*/
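Because charset is initialized to Constants.CRAWL_DEFAULT_CHARSET, rules created before this change, or posted without a charset, keep behaving exactly as before. A quick sketch of the expected defaulting, assuming the usual Lombok-generated accessors from @Data:

```java
RuleBean rule = new RuleBean();
// No charset supplied: the crawler-wide default applies.
assert "UTF-8".equals(rule.getCharset());

// A rule for a non-UTF-8 site overrides it explicitly.
rule.setCharset("GBK");
assert "GBK".equals(rule.getCharset());
```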

View File

@@ -271,7 +271,7 @@ public class CrawlServiceImpl implements CrawlService {
}
log.info("catBookListUrl{}", catBookListUrl);
-String bookListHtml = crawlHttpClient.get(catBookListUrl);
+String bookListHtml = crawlHttpClient.get(catBookListUrl, ruleBean.getCharset());
if (bookListHtml != null) {
Pattern bookIdPatten = Pattern.compile(ruleBean.getBookIdPatten());
Matcher bookIdMatcher = bookIdPatten.matcher(bookListHtml);

View File

@@ -24,4 +24,9 @@ public class Constants {
* Number of retries when a novel-crawling HTTP request fails
*/
public static final Integer HTTP_FAIL_RETRY_COUNT = 3;
+/**
+* Default charset for the crawler
+*/
+public static final String CRAWL_DEFAULT_CHARSET = "UTF-8";
}
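The charset is passed around as a plain string, so it may be worth checking a user-supplied value against the charsets the JVM actually supports before saving a rule. A hypothetical helper, not part of this commit, sketched with java.nio.charset:

```java
import java.nio.charset.Charset;

import com.java2nb.novel.utils.Constants;

// Hypothetical helper, not part of this commit: fall back to the crawler default
// when the supplied charset name is empty, unknown, or syntactically invalid.
public final class CharsetSupport {

    public static String orDefault(String charsetName) {
        if (charsetName == null || charsetName.trim().isEmpty()) {
            return Constants.CRAWL_DEFAULT_CHARSET;
        }
        try {
            return Charset.isSupported(charsetName) ? charsetName : Constants.CRAWL_DEFAULT_CHARSET;
        } catch (IllegalArgumentException e) {
            // Charset.isSupported throws for illegal names such as "utf 8".
            return Constants.CRAWL_DEFAULT_CHARSET;
        }
    }
}
```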

View File

@@ -25,7 +25,7 @@ public class CrawlHttpClient {
private static final ThreadLocal<Integer> RETRY_COUNT = new ThreadLocal<>();
-public String get(String url) {
+public String get(String url, String charset) {
if (Objects.nonNull(intervalMin) && Objects.nonNull(intervalMax) && intervalMax > intervalMin) {
try {
Thread.sleep(random.nextInt(intervalMax - intervalMin + 1) + intervalMin);
@@ -33,22 +33,22 @@
log.error(e.getMessage(), e);
}
}
-String body = HttpUtil.getByHttpClientWithChrome(url);
+String body = HttpUtil.getByHttpClientWithChrome(url, charset);
if (Objects.isNull(body) || body.length() < Constants.INVALID_HTML_LENGTH) {
-return processErrorHttpResult(url);
+return processErrorHttpResult(url, charset);
}
//Successfully fetched the HTML content
return body;
}
-private String processErrorHttpResult(String url) {
+private String processErrorHttpResult(String url, String charset) {
Integer count = RETRY_COUNT.get();
if (count == null) {
count = 0;
}
if (count < Constants.HTTP_FAIL_RETRY_COUNT) {
RETRY_COUNT.set(++count);
-return get(url);
+return get(url, charset);
}
RETRY_COUNT.remove();
return null;
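HttpUtil.getByHttpClientWithChrome itself is not part of this diff, so how it applies the charset is not visible here; presumably it decodes the response bytes with the caller-supplied encoding instead of a hard-coded UTF-8. A rough standalone sketch of that idea with java.net, using made-up names rather than the project's actual HttpUtil API:

```java
import java.io.ByteArrayOutputStream;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.Charset;

// Rough sketch only: class and method names are illustrative, not the project's HttpUtil.
public final class CharsetAwareFetch {

    public static String get(String url, String charset) throws Exception {
        HttpURLConnection conn = (HttpURLConnection) new URL(url).openConnection();
        // The "...WithChrome" name suggests a desktop Chrome user agent is sent.
        conn.setRequestProperty("User-Agent",
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 Chrome/120.0 Safari/537.36");
        try (InputStream in = conn.getInputStream();
             ByteArrayOutputStream buffer = new ByteArrayOutputStream()) {
            byte[] chunk = new byte[8192];
            int read;
            while ((read = in.read(chunk)) != -1) {
                buffer.write(chunk, 0, read);
            }
            // The crucial change: decode with the site's declared charset (e.g. GBK), not a fixed UTF-8.
            return new String(buffer.toByteArray(), Charset.forName(charset));
        } finally {
            conn.disconnect();
        }
    }
}
```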

View File

@@ -54,6 +54,8 @@
<!--示例:<b>https://m.xdingdiann.com/sort/0/1.html</b>
<li><input type="text" id="updateBookListUrl" class="s_input icon_key"
placeholder="小说更新列表url"></li>-->
+示例:<b>utf-8</b>
+<li><input type="text" id="charset" class="s_input icon_name" placeholder="网站编码"></li>
示例:<b>http://m.xdingdiann.com/sort/{catId}/{page}.html</b> ({catId}代表分类ID{page}代表分页页码)
<li><input type="text" id="bookListUrl" class="s_input icon_key"
placeholder="分类列表页URL规则"></li>
@@ -411,6 +413,11 @@
var filterContent = $("#filterContent").val();
crawlRule.filterContent = filterContent;
+var charset = $('#charset').val();
+if (charset) {
+crawlRule.charset = charset;
+}
$.ajax({
type: "POST",

View File

@@ -55,6 +55,8 @@
<!--示例:<b>https://m.xdingdiann.com/sort/0/1.html</b>
<li><input type="text" id="updateBookListUrl" class="s_input icon_key"
placeholder="小说更新列表url"></li>-->
+示例:<b>utf-8</b>
+<li><input type="text" id="charset" class="s_input icon_name" placeholder="网站编码"></li>
示例:<b>http://m.xdingdiann.com/sort/{catId}/{page}.html</b> ({catId}代表分类ID{page}代表分页页码)
<li><input type="text" id="bookListUrl" class="s_input icon_key"
placeholder="分类列表页URL规则"></li>
@@ -274,6 +276,7 @@
$("#contentStart").val(crawlRule.contentStart);
$("#contentEnd").val(crawlRule.contentEnd);
$("#filterContent").val(crawlRule.filterContent);
$("#charset").val(crawlRule.charset);
}
}
@@ -496,6 +499,11 @@
var filterContent = $("#filterContent").val();
crawlRule.filterContent = filterContent;
+var charset = $('#charset').val();
+if (charset) {
+crawlRule.charset = charset;
+}
$.ajax({
type: "POST",