From 73502a279bfae1e5db2f82a1d5c02d5fd5f93153 Mon Sep 17 00:00:00 2001
From: xiongxiaoyang <1179705413@qq.com>
Date: Fri, 14 Mar 2025 20:39:57 +0800
Subject: [PATCH] =?UTF-8?q?feat:=20=E6=94=AF=E6=8C=81=E9=9D=9Eutf-8?=
 =?UTF-8?q?=E7=BC=96=E7=A0=81=E7=9A=84=E7=BD=91=E7=AB=99=E9=87=87=E9=9B=86?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../java2nb/novel/core/utils/HttpUtil.java    | 25 ++++++++++++++++---
 .../java2nb/novel/core/crawl/CrawlParser.java |  6 ++---
 .../java2nb/novel/core/crawl/RuleBean.java    |  7 ++++++
 .../novel/service/impl/CrawlServiceImpl.java  |  2 +-
 .../com/java2nb/novel/utils/Constants.java    |  5 ++++
 .../java2nb/novel/utils/CrawlHttpClient.java  | 10 ++++----
 .../templates/crawl/crawlSource_add.html      |  7 ++++++
 .../templates/crawl/crawlSource_update.html   |  8 ++++++
 8 files changed, 57 insertions(+), 13 deletions(-)

diff --git a/novel-common/src/main/java/com/java2nb/novel/core/utils/HttpUtil.java b/novel-common/src/main/java/com/java2nb/novel/core/utils/HttpUtil.java
index 47f900a..daabff1 100644
--- a/novel-common/src/main/java/com/java2nb/novel/core/utils/HttpUtil.java
+++ b/novel-common/src/main/java/com/java2nb/novel/core/utils/HttpUtil.java
@@ -4,22 +4,35 @@
 import lombok.extern.slf4j.Slf4j;
 import org.springframework.http.*;
 import org.springframework.web.client.RestTemplate;
 
+import java.nio.charset.Charset;
+import java.util.HashMap;
+import java.util.Map;
+
 /**
  * @author Administrator
  */
 @Slf4j
 public class HttpUtil {
 
-    private static final RestTemplate REST_TEMPLATE = RestTemplates.newInstance("utf-8");
+    private static final String DEFAULT_CHARSET = "utf-8";
 
-    public static String getByHttpClientWithChrome(String url) {
+    private static final Map<String, RestTemplate> REST_TEMPLATE_MAP = new HashMap<>();
+
+    public static String getByHttpClientWithChrome(String url, String charset) {
+        log.debug("Get url:{}", url);
+        if (!Charset.isSupported(charset)) {
+            log.error("字符编码{}无效!", charset);
+            return null;
+        }
+        RestTemplate restTemplate = REST_TEMPLATE_MAP.computeIfAbsent(charset,
+            k -> RestTemplates.newInstance(charset));
         try {
-            log.debug("Get url:{}", url);
             HttpHeaders headers = new HttpHeaders();
             headers.add("user-agent",
                 "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.67 Safari/537.36");
             HttpEntity<String> requestEntity = new HttpEntity<>(null, headers);
-            ResponseEntity<String> forEntity = REST_TEMPLATE.exchange(url, HttpMethod.GET, requestEntity, String.class);
+            ResponseEntity<String> forEntity = restTemplate.exchange(url, HttpMethod.GET, requestEntity,
+                String.class);
             log.debug("Response code:{}", forEntity.getStatusCode());
             if (forEntity.getStatusCode() == HttpStatus.OK) {
                 return forEntity.getBody();
@@ -32,4 +45,8 @@ public class HttpUtil {
         }
     }
 
+    public static String getByHttpClientWithChrome(String url) {
+        return getByHttpClientWithChrome(url, DEFAULT_CHARSET);
+    }
+
 }
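The per-charset cache above delegates to RestTemplates.newInstance(charset), which this patch does not show. A minimal sketch of what such a charset-aware factory usually looks like in Spring follows; it is an assumption about that helper's shape, not the project's actual implementation.

import java.nio.charset.Charset;
import java.util.List;

import org.springframework.http.converter.HttpMessageConverter;
import org.springframework.http.converter.StringHttpMessageConverter;
import org.springframework.web.client.RestTemplate;

public final class RestTemplatesSketch {

    /**
     * Hypothetical stand-in for RestTemplates.newInstance(charset): builds a RestTemplate
     * whose String converter decodes response bodies with the given charset (e.g. "gbk").
     */
    public static RestTemplate newInstance(String charset) {
        RestTemplate restTemplate = new RestTemplate();
        List<HttpMessageConverter<?>> converters = restTemplate.getMessageConverters();
        // Drop the default String converter (ISO-8859-1) and install one for the requested charset.
        converters.removeIf(c -> c instanceof StringHttpMessageConverter);
        converters.add(0, new StringHttpMessageConverter(Charset.forName(charset)));
        return restTemplate;
    }

    private RestTemplatesSketch() {
    }
}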
diff --git a/novel-crawl/src/main/java/com/java2nb/novel/core/crawl/CrawlParser.java b/novel-crawl/src/main/java/com/java2nb/novel/core/crawl/CrawlParser.java
index 3068ffe..312ca43 100644
--- a/novel-crawl/src/main/java/com/java2nb/novel/core/crawl/CrawlParser.java
+++ b/novel-crawl/src/main/java/com/java2nb/novel/core/crawl/CrawlParser.java
@@ -38,7 +38,7 @@ public class CrawlParser {
     public void parseBook(RuleBean ruleBean, String bookId, CrawlBookHandler handler) {
         Book book = new Book();
         String bookDetailUrl = ruleBean.getBookDetailUrl().replace("{bookId}", bookId);
-        String bookDetailHtml = crawlHttpClient.get(bookDetailUrl);
+        String bookDetailHtml = crawlHttpClient.get(bookDetailUrl, ruleBean.getCharset());
         if (bookDetailHtml != null) {
             Pattern bookNamePatten = PatternFactory.getPattern(ruleBean.getBookNamePatten());
             Matcher bookNameMatch = bookNamePatten.matcher(bookDetailHtml);
@@ -152,7 +152,7 @@ public class CrawlParser {
         List contentList = new ArrayList<>();
         //读取目录
         String indexListUrl = ruleBean.getBookIndexUrl().replace("{bookId}", sourceBookId);
-        String indexListHtml = crawlHttpClient.get(indexListUrl);
+        String indexListHtml = crawlHttpClient.get(indexListUrl, ruleBean.getCharset());
         if (indexListHtml != null) {
 
             if (StringUtils.isNotBlank(ruleBean.getBookIndexStart())) {
@@ -216,7 +216,7 @@ public class CrawlParser {
                 .replace("{indexId}", sourceIndexId);
 
             //查询章节内容
-            String contentHtml = crawlHttpClient.get(contentUrl);
+            String contentHtml = crawlHttpClient.get(contentUrl, ruleBean.getCharset());
             if (contentHtml != null && !contentHtml.contains("正在手打中")) {
                 String content = contentHtml.substring(
                     contentHtml.indexOf(ruleBean.getContentStart()) + ruleBean.getContentStart().length());
diff --git a/novel-crawl/src/main/java/com/java2nb/novel/core/crawl/RuleBean.java b/novel-crawl/src/main/java/com/java2nb/novel/core/crawl/RuleBean.java
index 58a4efb..e6f134d 100644
--- a/novel-crawl/src/main/java/com/java2nb/novel/core/crawl/RuleBean.java
+++ b/novel-crawl/src/main/java/com/java2nb/novel/core/crawl/RuleBean.java
@@ -1,5 +1,6 @@
 package com.java2nb.novel.core.crawl;
 
+import com.java2nb.novel.utils.Constants;
 import lombok.Data;
 
 import java.util.Map;
@@ -12,6 +13,12 @@ import java.util.Map;
 @Data
 public class RuleBean {
 
+    /**
+     * 网页字符编码
+     */
+    private String charset = Constants.CRAWL_DEFAULT_CHARSET;
+
+
     /**
      * 小说更新列表url
      */
diff --git a/novel-crawl/src/main/java/com/java2nb/novel/service/impl/CrawlServiceImpl.java b/novel-crawl/src/main/java/com/java2nb/novel/service/impl/CrawlServiceImpl.java
index 0a30be5..0e6f849 100644
--- a/novel-crawl/src/main/java/com/java2nb/novel/service/impl/CrawlServiceImpl.java
+++ b/novel-crawl/src/main/java/com/java2nb/novel/service/impl/CrawlServiceImpl.java
@@ -271,7 +271,7 @@ public class CrawlServiceImpl implements CrawlService {
 
         }
         log.info("catBookListUrl:{}", catBookListUrl);
-        String bookListHtml = crawlHttpClient.get(catBookListUrl);
+        String bookListHtml = crawlHttpClient.get(catBookListUrl, ruleBean.getCharset());
         if (bookListHtml != null) {
             Pattern bookIdPatten = Pattern.compile(ruleBean.getBookIdPatten());
             Matcher bookIdMatcher = bookIdPatten.matcher(bookListHtml);
diff --git a/novel-crawl/src/main/java/com/java2nb/novel/utils/Constants.java b/novel-crawl/src/main/java/com/java2nb/novel/utils/Constants.java
index 5d3c59c..b84f91a 100644
--- a/novel-crawl/src/main/java/com/java2nb/novel/utils/Constants.java
+++ b/novel-crawl/src/main/java/com/java2nb/novel/utils/Constants.java
@@ -24,4 +24,9 @@ public class Constants {
      * 爬取小说http请求失败重试次数
      */
     public static final Integer HTTP_FAIL_RETRY_COUNT = 3;
+
+    /**
+     * 爬虫默认编码
+     */
+    public static final String CRAWL_DEFAULT_CHARSET = "UTF-8";
 }
diff --git a/novel-crawl/src/main/java/com/java2nb/novel/utils/CrawlHttpClient.java b/novel-crawl/src/main/java/com/java2nb/novel/utils/CrawlHttpClient.java
index 515f005..38e492b 100644
--- a/novel-crawl/src/main/java/com/java2nb/novel/utils/CrawlHttpClient.java
+++ b/novel-crawl/src/main/java/com/java2nb/novel/utils/CrawlHttpClient.java
@@ -25,7 +25,7 @@ public class CrawlHttpClient {
 
     private static final ThreadLocal<Integer> RETRY_COUNT = new ThreadLocal<>();
 
-    public String get(String url) {
+    public String get(String url, String charset) {
         if (Objects.nonNull(intervalMin) && Objects.nonNull(intervalMax) && intervalMax > intervalMin) {
             try {
                 Thread.sleep(random.nextInt(intervalMax - intervalMin + 1) + intervalMin);
@@ -33,22 +33,22 @@
                 log.error(e.getMessage(), e);
             }
         }
-        String body = HttpUtil.getByHttpClientWithChrome(url);
+        String body = HttpUtil.getByHttpClientWithChrome(url, charset);
         if (Objects.isNull(body) || body.length() < Constants.INVALID_HTML_LENGTH) {
-            return processErrorHttpResult(url);
+            return processErrorHttpResult(url, charset);
         }
         //成功获得html内容
         return body;
     }
 
-    private String processErrorHttpResult(String url) {
+    private String processErrorHttpResult(String url, String charset) {
         Integer count = RETRY_COUNT.get();
         if (count == null) {
             count = 0;
         }
         if (count < Constants.HTTP_FAIL_RETRY_COUNT) {
             RETRY_COUNT.set(++count);
-            return get(url);
+            return get(url, charset);
         }
         RETRY_COUNT.remove();
         return null;
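With the overloads above in place, a caller can rely on the utf-8 default or pass an explicit charset per source. A small usage sketch, assuming it runs inside the project where HttpUtil is on the classpath; the URLs are placeholders.

import com.java2nb.novel.core.utils.HttpUtil;

public class CharsetFetchExample {

    public static void main(String[] args) {
        // Single-argument overload: falls back to DEFAULT_CHARSET ("utf-8").
        String utf8Page = HttpUtil.getByHttpClientWithChrome("https://example.com/utf8-page");

        // Two-argument overload: the response body is decoded by a RestTemplate
        // cached per charset in REST_TEMPLATE_MAP (created on first use).
        String gbkPage = HttpUtil.getByHttpClientWithChrome("https://example.com/gbk-page", "gbk");

        // An unsupported charset name is rejected up front: the method logs an error and returns null.
        String rejected = HttpUtil.getByHttpClientWithChrome("https://example.com/page", "no-such-charset");

        System.out.println(utf8Page != null);
        System.out.println(gbkPage != null);
        System.out.println(rejected == null); // always true
    }
}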
diff --git a/novel-crawl/src/main/resources/templates/crawl/crawlSource_add.html b/novel-crawl/src/main/resources/templates/crawl/crawlSource_add.html
index 5162663..4b35329 100644
--- a/novel-crawl/src/main/resources/templates/crawl/crawlSource_add.html
+++ b/novel-crawl/src/main/resources/templates/crawl/crawlSource_add.html
@@ -54,6 +54,8 @@
 [the HTML markup of this hunk was lost in conversion to plain text; the surviving text shows it adds a page-encoding input (id="charset", read by the script further down) with the hint below]
+示例:utf-8
 示例:http://m.xdingdiann.com/sort/{catId}/{page}.html ({catId}代表分类ID,{page}代表分页页码)
@@ -411,6 +413,11 @@
        var filterContent = $("#filterContent").val();
        crawlRule.filterContent = filterContent;
 
+       var charset = $('#charset').val();
+       if (charset) {
+           crawlRule.charset = charset;
+       }
+
        $.ajax({
            type: "POST",
diff --git a/novel-crawl/src/main/resources/templates/crawl/crawlSource_update.html b/novel-crawl/src/main/resources/templates/crawl/crawlSource_update.html
index 08dd048..eec916b 100644
--- a/novel-crawl/src/main/resources/templates/crawl/crawlSource_update.html
+++ b/novel-crawl/src/main/resources/templates/crawl/crawlSource_update.html
@@ -55,6 +55,8 @@
 [the HTML markup of this hunk was lost in conversion to plain text; it adds the same page-encoding input (id="charset") as crawlSource_add.html, with the hint below]
+示例:utf-8
 示例:http://m.xdingdiann.com/sort/{catId}/{page}.html ({catId}代表分类ID,{page}代表分页页码)
@@ -274,6 +276,7 @@
            $("#contentStart").val(crawlRule.contentStart);
            $("#contentEnd").val(crawlRule.contentEnd);
            $("#filterContent").val(crawlRule.filterContent);
+           $("#charset").val(crawlRule.charset);
        }
 }
@@ -496,6 +499,11 @@
        var filterContent = $("#filterContent").val();
        crawlRule.filterContent = filterContent;
 
+       var charset = $('#charset').val();
+       if (charset) {
+           crawlRule.charset = charset;
+       }
+
        $.ajax({
            type: "POST",
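Taken together, the charset entered on the crawl-source form ends up on RuleBean and is handed to every crawlHttpClient.get(...) call. A minimal sketch of that flow under the new default; the URL is a placeholder, not a real source.

import com.java2nb.novel.core.crawl.RuleBean;
import com.java2nb.novel.core.utils.HttpUtil;

public class CharsetRuleExample {

    public static void main(String[] args) {
        RuleBean rule = new RuleBean();
        // A rule that never sets charset keeps the default from Constants.CRAWL_DEFAULT_CHARSET.
        System.out.println(rule.getCharset()); // UTF-8

        // A source that serves GBK pages only needs its rule to carry the charset;
        // the crawler then decodes every detail/index/content page with it.
        rule.setCharset("gbk");
        String html = HttpUtil.getByHttpClientWithChrome("https://example.com/book/1.html", rule.getCharset());
        System.out.println(html != null);
    }
}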