perf: 爬虫采集流程优化

This commit is contained in:
xiongxiaoyang 2025-03-14 19:27:46 +08:00
parent 6d0ab33757
commit 85b64bbc10
2 changed files with 6 additions and 1 deletion

View File

@ -14,12 +14,13 @@ public class HttpUtil {
public static String getByHttpClientWithChrome(String url) {
try {
log.debug("Get url{}", url);
HttpHeaders headers = new HttpHeaders();
headers.add("user-agent",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.67 Safari/537.36");
HttpEntity<String> requestEntity = new HttpEntity<>(null, headers);
ResponseEntity<String> forEntity = REST_TEMPLATE.exchange(url, HttpMethod.GET, requestEntity, String.class);
log.debug("Response code{}", forEntity.getStatusCode());
if (forEntity.getStatusCode() == HttpStatus.OK) {
return forEntity.getBody();
} else {

View File

@ -309,6 +309,10 @@ public class CrawlServiceImpl implements CrawlService {
} catch (Exception e) {
log.error(e.getMessage(), e);
}
if (page == totalPage) {
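// First pass finished on the last page: reset so the increment below wraps back to page 1 and a second pass begins. Intended for "recently updated" lists that have only a few pages.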
page = 0;
}
page += 1;
}