feat: 支持非utf-8编码的网站采集

This commit is contained in:
xiongxiaoyang
2025-03-14 20:39:57 +08:00
parent 85b64bbc10
commit 73502a279b
8 changed files with 57 additions and 13 deletions

View File

@ -4,22 +4,35 @@ import lombok.extern.slf4j.Slf4j;
import org.springframework.http.*;
import org.springframework.web.client.RestTemplate;
import java.nio.charset.Charset;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
/**
* @author Administrator
*/
@Slf4j
public class HttpUtil {
private static final RestTemplate REST_TEMPLATE = RestTemplates.newInstance("utf-8");
private static final String DEFAULT_CHARSET = "utf-8";
public static String getByHttpClientWithChrome(String url) {
/**
 * Cache of RestTemplate instances keyed by the charset name passed by the caller.
 * Entries are created lazily through computeIfAbsent in getByHttpClientWithChrome.
 * This is static state mutated from a static method without any locking, so a
 * ConcurrentHashMap is used: a plain HashMap is not safe when several threads
 * call computeIfAbsent at the same time.
 */
private static final Map<String, RestTemplate> REST_TEMPLATE_MAP = new ConcurrentHashMap<>();
public static String getByHttpClientWithChrome(String url, String charset) {
log.debug("Get url{}", url);
if (!Charset.isSupported(charset)) {
log.error("字符编码{}无效!", charset);
return null;
}
RestTemplate restTemplate = REST_TEMPLATE_MAP.computeIfAbsent(charset,
k -> RestTemplates.newInstance(charset));
try {
log.debug("Get url{}", url);
HttpHeaders headers = new HttpHeaders();
headers.add("user-agent",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.67 Safari/537.36");
HttpEntity<String> requestEntity = new HttpEntity<>(null, headers);
ResponseEntity<String> forEntity = REST_TEMPLATE.exchange(url, HttpMethod.GET, requestEntity, String.class);
ResponseEntity<String> forEntity = restTemplate.exchange(url, HttpMethod.GET, requestEntity,
String.class);
log.debug("Response code{}", forEntity.getStatusCode());
if (forEntity.getStatusCode() == HttpStatus.OK) {
return forEntity.getBody();
@ -32,4 +45,8 @@ public class HttpUtil {
}
}
/**
 * Fetches the given URL using the default charset.
 * Convenience overload that forwards to
 * {@link #getByHttpClientWithChrome(String, String)} with {@code DEFAULT_CHARSET}.
 *
 * @param url the address to request
 * @return whatever the two-argument overload returns for this URL
 */
public static String getByHttpClientWithChrome(String url) {
    final String charsetName = DEFAULT_CHARSET;
    return getByHttpClientWithChrome(url, charsetName);
}
}