xiongxiaoyang 2024-09-21 10:15:45 +08:00
parent 5c35f7af0a
commit a8219253e9
9 changed files with 175 additions and 158 deletions

View File

@@ -14,7 +14,10 @@ import org.springframework.http.ResponseEntity;
import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.*;
import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.Date;
import java.util.Objects;
@@ -37,10 +40,13 @@ public class FileUtil {
// Save the image locally
HttpHeaders headers = new HttpHeaders();
HttpEntity<String> requestEntity = new HttpEntity<>(null, headers);
ResponseEntity<Resource> resEntity = RestTemplateUtil.getInstance(Charsets.ISO_8859_1.name()).exchange(picSrc, HttpMethod.GET, requestEntity, Resource.class);
ResponseEntity<Resource> resEntity = RestTemplates.newInstance(Charsets.ISO_8859_1.name())
.exchange(picSrc, HttpMethod.GET, requestEntity, Resource.class);
input = Objects.requireNonNull(resEntity.getBody()).getInputStream();
Date currentDate = new Date();
picSrc = visitPrefix + DateUtils.formatDate(currentDate, "yyyy") + "/" + DateUtils.formatDate(currentDate, "MM") + "/" + DateUtils.formatDate(currentDate, "dd") + "/"
picSrc =
visitPrefix + DateUtils.formatDate(currentDate, "yyyy") + "/" + DateUtils.formatDate(currentDate, "MM")
+ "/" + DateUtils.formatDate(currentDate, "dd") + "/"
+ UUIDUtil.getUUID32()
+ picSrc.substring(picSrc.lastIndexOf("."));
File picFile = new File(picSavePath + picSrc);
@@ -67,7 +73,6 @@ public class FileUtil {
closeStream(input, out);
}
return picSrc;
}
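For orientation, here is a minimal standalone sketch of the path-building logic in this hunk, with DateUtils and UUIDUtil swapped for JDK equivalents; visitPrefix and the source URL are assumed values for illustration, not taken from the project:

import java.time.LocalDate;
import java.time.format.DateTimeFormatter;
import java.util.UUID;

public class LocalPicPathSketch {
    public static void main(String[] args) {
        // Assumed values for illustration only
        String visitPrefix = "/localPic/";
        String remotePicSrc = "https://example.com/cover/123.jpg";
        LocalDate now = LocalDate.now();
        // Mirrors the yyyy/MM/dd + 32-char UUID + original extension naming used above
        String picSrc = visitPrefix
            + now.format(DateTimeFormatter.ofPattern("yyyy")) + "/"
            + now.format(DateTimeFormatter.ofPattern("MM")) + "/"
            + now.format(DateTimeFormatter.ofPattern("dd")) + "/"
            + UUID.randomUUID().toString().replace("-", "")
            + remotePicSrc.substring(remotePicSrc.lastIndexOf("."));
        // e.g. /localPic/2024/09/21/<32-char-uuid>.jpg; the bytes are then written to picSavePath + picSrc
        System.out.println(picSrc);
    }
}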

View File

@@ -1,38 +1,24 @@
package com.java2nb.novel.core.utils;
import lombok.extern.slf4j.Slf4j;
import org.springframework.http.*;
import org.springframework.web.client.RestTemplate;
/**
* @author Administrator
*/
@Slf4j
public class HttpUtil {
private static RestTemplate restTemplate = RestTemplateUtil.getInstance("utf-8");
public static String getByHttpClient(String url) {
try {
ResponseEntity<String> forEntity = restTemplate.getForEntity(url, String.class);
if (forEntity.getStatusCode() == HttpStatus.OK) {
return forEntity.getBody();
} else {
return null;
}
} catch (Exception e) {
e.printStackTrace();
return null;
}
}
private static final RestTemplate REST_TEMPLATE = RestTemplates.newInstance("utf-8");
public static String getByHttpClientWithChrome(String url) {
try {
HttpHeaders headers = new HttpHeaders();
headers.add("user-agent","Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.67 Safari/537.36");
headers.add("user-agent",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.67 Safari/537.36");
HttpEntity<String> requestEntity = new HttpEntity<>(null, headers);
ResponseEntity<String> forEntity = restTemplate.exchange(url.toString(), HttpMethod.GET, requestEntity, String.class);
ResponseEntity<String> forEntity = REST_TEMPLATE.exchange(url, HttpMethod.GET, requestEntity, String.class);
if (forEntity.getStatusCode() == HttpStatus.OK) {
return forEntity.getBody();
@@ -40,8 +26,9 @@ public class HttpUtil {
return null;
}
} catch (Exception e) {
e.printStackTrace();
log.error(e.getMessage(), e);
return null;
}
}
}
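A minimal usage sketch for the utility above, assuming only the public method shown in this diff; the URL is hypothetical:

import com.java2nb.novel.core.utils.HttpUtil;

public class HttpUtilUsageSketch {
    public static void main(String[] args) {
        // Returns the response body on HTTP 200, or null on any other status or exception
        String html = HttpUtil.getByHttpClientWithChrome("https://example.com/book/1");
        if (html != null) {
            System.out.println("fetched " + html.length() + " characters");
        }
    }
}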

View File

@@ -26,16 +26,16 @@ import java.util.List;
import java.util.Objects;
@Component
public class RestTemplateUtil {
public class RestTemplates {
private static HttpProxyProperties httpProxyProperties;
RestTemplateUtil(HttpProxyProperties properties) {
RestTemplates(HttpProxyProperties properties) {
httpProxyProperties = properties;
}
@SneakyThrows
public static RestTemplate getInstance(String charset) {
public static RestTemplate newInstance(String charset) {
TrustStrategy acceptingTrustStrategy = (X509Certificate[] chain, String authType) -> true;
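A brief sketch of how the renamed factory is meant to be consumed after this commit; the package is assumed to match the old RestTemplateUtil, and the target URL is hypothetical. In the running application Spring constructs the RestTemplates component first, so HttpProxyProperties is already captured before newInstance is called:

import com.java2nb.novel.core.utils.RestTemplates; // package assumed to match the old RestTemplateUtil
import org.springframework.web.client.RestTemplate;

public class RestTemplatesSketch {
    public static void main(String[] args) {
        // newInstance builds a RestTemplate for the given response charset;
        // proxy settings come from the HttpProxyProperties captured by the @Component constructor.
        RestTemplate template = RestTemplates.newInstance("utf-8");
        String body = template.getForObject("https://example.com", String.class); // hypothetical URL
        System.out.println(body == null ? 0 : body.length());
    }
}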

View File

@@ -1,23 +1,23 @@
package com.java2nb.novel.core.crawl;
import com.java2nb.novel.core.utils.HttpUtil;
import com.java2nb.novel.core.utils.RandomBookInfoUtil;
import com.java2nb.novel.core.utils.RestTemplateUtil;
import com.java2nb.novel.core.utils.StringUtil;
import com.java2nb.novel.entity.Book;
import com.java2nb.novel.entity.BookContent;
import com.java2nb.novel.entity.BookIndex;
import com.java2nb.novel.utils.Constants;
import com.java2nb.novel.utils.CrawlHttpClient;
import io.github.xxyopen.util.IdWorker;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.springframework.http.HttpStatus;
import org.springframework.http.ResponseEntity;
import org.springframework.web.client.RestTemplate;
import org.springframework.stereotype.Component;
import java.text.SimpleDateFormat;
import java.util.*;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@@ -26,20 +26,19 @@ import java.util.regex.Pattern;
*
* @author Administrator
*/
@Slf4j
@Component
@RequiredArgsConstructor
public class CrawlParser {
private static final IdWorker idWorker = IdWorker.INSTANCE;
private final IdWorker ID_WORKER = IdWorker.INSTANCE;
private static final RestTemplate restTemplate = RestTemplateUtil.getInstance("utf-8");
private static final ThreadLocal<Integer> retryCount = new ThreadLocal<>();
private final CrawlHttpClient crawlHttpClient;
@SneakyThrows
public static void parseBook(RuleBean ruleBean, String bookId, CrawlBookHandler handler) {
public void parseBook(RuleBean ruleBean, String bookId, CrawlBookHandler handler) {
Book book = new Book();
String bookDetailUrl = ruleBean.getBookDetailUrl().replace("{bookId}", bookId);
String bookDetailHtml = getByHttpClientWithChrome(bookDetailUrl);
String bookDetailHtml = crawlHttpClient.get(bookDetailUrl);
if (bookDetailHtml != null) {
Pattern bookNamePatten = PatternFactory.getPattern(ruleBean.getBookNamePatten());
Matcher bookNameMatch = bookNamePatten.matcher(bookDetailHtml);
@@ -144,7 +143,7 @@ public class CrawlParser {
handler.handle(book);
}
public static boolean parseBookIndexAndContent(String sourceBookId, Book book, RuleBean ruleBean,
public boolean parseBookIndexAndContent(String sourceBookId, Book book, RuleBean ruleBean,
Map<Integer, BookIndex> existBookIndexMap, CrawlBookChapterHandler handler) {
Date currentDate = new Date();
@@ -153,7 +152,7 @@ public class CrawlParser {
List<BookContent> contentList = new ArrayList<>();
// Read the chapter index
String indexListUrl = ruleBean.getBookIndexUrl().replace("{bookId}", sourceBookId);
String indexListHtml = getByHttpClientWithChrome(indexListUrl);
String indexListHtml = crawlHttpClient.get(indexListUrl);
if (indexListHtml != null) {
if (StringUtils.isNotBlank(ruleBean.getBookIndexStart())) {
@@ -217,7 +216,7 @@ public class CrawlParser {
.replace("{indexId}", sourceIndexId);
// Fetch the chapter content
String contentHtml = getByHttpClientWithChrome(contentUrl);
String contentHtml = crawlHttpClient.get(contentUrl);
if (contentHtml != null && !contentHtml.contains("正在手打中")) {
String content = contentHtml.substring(
contentHtml.indexOf(ruleBean.getContentStart()) + ruleBean.getContentStart().length());
@@ -254,7 +253,7 @@ public class CrawlParser {
} else {
// Insert a new chapter
// Set the index entry and chapter content
Long indexId = idWorker.nextId();
Long indexId = ID_WORKER.nextId();
bookIndex.setId(indexId);
bookIndex.setBookId(book.getId());
@@ -308,56 +307,4 @@ public class CrawlParser {
return false;
}
private static String getByHttpClient(String url) {
try {
ResponseEntity<String> forEntity = restTemplate.getForEntity(url, String.class);
if (forEntity.getStatusCode() == HttpStatus.OK) {
String body = forEntity.getBody();
assert body != null;
if (body.length() < Constants.INVALID_HTML_LENGTH) {
return processErrorHttpResult(url);
}
// HTML content fetched successfully
return body;
}
} catch (Exception e) {
e.printStackTrace();
}
return processErrorHttpResult(url);
}
private static String getByHttpClientWithChrome(String url) {
try {
String body = HttpUtil.getByHttpClientWithChrome(url);
if (body != null && body.length() < Constants.INVALID_HTML_LENGTH) {
return processErrorHttpResult(url);
}
// HTML content fetched successfully
return body;
} catch (Exception e) {
e.printStackTrace();
}
return processErrorHttpResult(url);
}
@SneakyThrows
private static String processErrorHttpResult(String url) {
Integer count = retryCount.get();
if (count == null) {
count = 0;
}
if (count < Constants.HTTP_FAIL_RETRY_COUNT) {
Thread.sleep(new Random().nextInt(10 * 1000));
retryCount.set(++count);
return getByHttpClient(url);
}
return null;
}
}
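The net effect of this file's changes is that CrawlParser stops being a static utility: it becomes a Spring @Component whose CrawlHttpClient is injected through the Lombok-generated constructor, and its parse methods turn into instance methods. A minimal caller sketch, where the service class and method names are hypothetical; the StarterListener and CrawlServiceImpl hunks below follow the same pattern:

import com.java2nb.novel.core.crawl.CrawlParser;
import com.java2nb.novel.core.crawl.RuleBean;
import lombok.RequiredArgsConstructor;
import org.springframework.stereotype.Service;

@Service
@RequiredArgsConstructor
public class BookRefreshService { // hypothetical caller

    private final CrawlParser crawlParser; // injected by Spring instead of invoked statically

    public void refresh(RuleBean ruleBean, String crawlBookId) {
        crawlParser.parseBook(ruleBean, crawlBookId, book -> {
            // handle the parsed Book, e.g. persist the updated fields
        });
    }
}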

View File

@@ -1,10 +1,12 @@
package com.java2nb.novel.core.listener;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.java2nb.novel.core.crawl.ChapterBean;
import com.java2nb.novel.core.crawl.CrawlParser;
import com.java2nb.novel.core.crawl.RuleBean;
import com.java2nb.novel.entity.*;
import com.java2nb.novel.entity.Book;
import com.java2nb.novel.entity.BookIndex;
import com.java2nb.novel.entity.CrawlSingleTask;
import com.java2nb.novel.entity.CrawlSource;
import com.java2nb.novel.service.BookService;
import com.java2nb.novel.service.CrawlService;
import com.java2nb.novel.utils.Constants;
@@ -33,6 +35,8 @@ public class StarterListener implements ServletContextListener {
private final CrawlService crawlService;
private final CrawlParser crawlParser;
@Value("${crawl.update.thread}")
private int updateThreadCount;
@@ -56,20 +60,24 @@ public class StarterListener implements ServletContextListener {
CrawlSource source = crawlService.queryCrawlSource(needUpdateBook.getCrawlSourceId());
RuleBean ruleBean = new ObjectMapper().readValue(source.getCrawlRule(), RuleBean.class);
// Parse the basic book info
CrawlParser.parseBook(ruleBean, needUpdateBook.getCrawlBookId(),book -> {
crawlParser.parseBook(ruleBean, needUpdateBook.getCrawlBookId(), book -> {
// Only existing books are updated here
book.setId(needUpdateBook.getId());
book.setWordCount(needUpdateBook.getWordCount());
if (needUpdateBook.getPicUrl() != null && needUpdateBook.getPicUrl().contains(Constants.LOCAL_PIC_PREFIX)) {
if (needUpdateBook.getPicUrl() != null && needUpdateBook.getPicUrl()
.contains(Constants.LOCAL_PIC_PREFIX)) {
// Skip the update if the cover is already a local image
book.setPicUrl(null);
}
// Query the chapters that already exist
Map<Integer, BookIndex> existBookIndexMap = bookService.queryExistBookIndexMap(needUpdateBook.getId());
Map<Integer, BookIndex> existBookIndexMap = bookService.queryExistBookIndexMap(
needUpdateBook.getId());
// Parse the chapter index
CrawlParser.parseBookIndexAndContent(needUpdateBook.getCrawlBookId(), book, ruleBean, existBookIndexMap,chapter -> {
bookService.updateBookAndIndexAndContent(book, chapter.getBookIndexList(), chapter.getBookContentList(), existBookIndexMap);
});
crawlParser.parseBookIndexAndContent(needUpdateBook.getCrawlBookId(), book,
ruleBean, existBookIndexMap, chapter -> {
bookService.updateBookAndIndexAndContent(book, chapter.getBookIndexList(),
chapter.getBookContentList(), existBookIndexMap);
});
});
} catch (Exception e) {
log.error(e.getMessage(), e);
@@ -88,7 +96,6 @@ public class StarterListener implements ServletContextListener {
}
new Thread(() -> {
log.info("程序启动,开始执行单本采集任务线程。。。");
while (true) {
@@ -103,7 +110,8 @@ public class StarterListener implements ServletContextListener {
CrawlSource source = crawlService.queryCrawlSource(task.getSourceId());
RuleBean ruleBean = new ObjectMapper().readValue(source.getCrawlRule(), RuleBean.class);
if (crawlService.parseBookAndSave(task.getCatId(), ruleBean, task.getSourceId(), task.getSourceBookId())) {
if (crawlService.parseBookAndSave(task.getCatId(), ruleBean, task.getSourceId(),
task.getSourceBookId())) {
// Crawl succeeded
crawlStatus = 1;
}

View File

@@ -2,17 +2,11 @@ package com.java2nb.novel.service.impl;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.github.pagehelper.PageHelper;
import io.github.xxyopen.model.page.PageBean;
import com.java2nb.novel.core.cache.CacheKey;
import com.java2nb.novel.core.cache.CacheService;
import com.java2nb.novel.core.crawl.CrawlParser;
import com.java2nb.novel.core.crawl.RuleBean;
import com.java2nb.novel.core.enums.ResponseStatus;
import io.github.xxyopen.model.page.builder.pagehelper.PageBuilder;
import io.github.xxyopen.util.IdWorker;
import io.github.xxyopen.util.ThreadUtil;
import io.github.xxyopen.web.exception.BusinessException;
import io.github.xxyopen.web.util.BeanUtil;
import com.java2nb.novel.entity.Book;
import com.java2nb.novel.entity.CrawlSingleTask;
import com.java2nb.novel.entity.CrawlSource;
@@ -24,6 +18,12 @@ import com.java2nb.novel.service.BookService;
import com.java2nb.novel.service.CrawlService;
import com.java2nb.novel.vo.CrawlSingleTaskVO;
import com.java2nb.novel.vo.CrawlSourceVO;
import io.github.xxyopen.model.page.PageBean;
import io.github.xxyopen.model.page.builder.pagehelper.PageBuilder;
import io.github.xxyopen.util.IdWorker;
import io.github.xxyopen.util.ThreadUtil;
import io.github.xxyopen.web.exception.BusinessException;
import io.github.xxyopen.web.util.BeanUtil;
import io.github.xxyopen.web.util.SpringUtil;
import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows;
@@ -51,6 +51,7 @@ import static org.mybatis.dynamic.sql.select.SelectDSL.select;
@Slf4j
public class CrawlServiceImpl implements CrawlService {
private final CrawlParser crawlParser;
private final CrawlSourceMapper crawlSourceMapper;
@@ -71,15 +72,16 @@ public class CrawlServiceImpl implements CrawlService {
crawlSourceMapper.insertSelective(source);
}
@Override
public void updateCrawlSource(CrawlSource source) {
if(source.getId()!=null){
Optional<CrawlSource> opt=crawlSourceMapper.selectByPrimaryKey(source.getId());
if(opt.isPresent()) {
CrawlSource crawlSource =opt.get();
if (source.getId() != null) {
Optional<CrawlSource> opt = crawlSourceMapper.selectByPrimaryKey(source.getId());
if (opt.isPresent()) {
CrawlSource crawlSource = opt.get();
if (crawlSource.getSourceStatus() == (byte) 1) {
// Close it first
openOrCloseCrawl(crawlSource.getId(),(byte)0);
openOrCloseCrawl(crawlSource.getId(), (byte) 0);
}
Date currentDate = new Date();
crawlSource.setUpdateTime(currentDate);
@@ -89,14 +91,15 @@ public class CrawlServiceImpl implements CrawlService {
}
}
}
@Override
public PageBean<CrawlSource> listCrawlByPage(int page, int pageSize) {
PageHelper.startPage(page, pageSize);
SelectStatementProvider render = select(id, sourceName, sourceStatus, createTime, updateTime)
.from(crawlSource)
.orderBy(updateTime)
.build()
.render(RenderingStrategies.MYBATIS3);
.from(crawlSource)
.orderBy(updateTime)
.build()
.render(RenderingStrategies.MYBATIS3);
List<CrawlSource> crawlSources = crawlSourceMapper.selectMany(render);
PageBean<CrawlSource> pageBean = PageBuilder.build(crawlSources);
pageBean.setList(BeanUtil.copyList(crawlSources, CrawlSourceVO.class));
@@ -113,7 +116,8 @@ public class CrawlServiceImpl implements CrawlService {
if (sourceStatus == (byte) 0) {
// Close: update the source status in the database first, then fetch the threads currently running for this crawl source and stop them all
SpringUtil.getBean(CrawlService.class).updateCrawlSourceStatus(sourceId, sourceStatus);
Set<Long> runningCrawlThreadId = (Set<Long>) cacheService.getObject(CacheKey.RUNNING_CRAWL_THREAD_KEY_PREFIX + sourceId);
Set<Long> runningCrawlThreadId = (Set<Long>) cacheService.getObject(
CacheKey.RUNNING_CRAWL_THREAD_KEY_PREFIX + sourceId);
if (runningCrawlThreadId != null) {
for (Long ThreadId : runningCrawlThreadId) {
Thread thread = ThreadUtil.findThread(ThreadId);
@@ -157,11 +161,12 @@ public class CrawlServiceImpl implements CrawlService {
@Override
public CrawlSource queryCrawlSource(Integer sourceId) {
SelectStatementProvider render = select(CrawlSourceDynamicSqlSupport.sourceStatus, CrawlSourceDynamicSqlSupport.crawlRule)
.from(crawlSource)
.where(id, isEqualTo(sourceId))
.build()
.render(RenderingStrategies.MYBATIS3);
SelectStatementProvider render = select(CrawlSourceDynamicSqlSupport.sourceStatus,
CrawlSourceDynamicSqlSupport.crawlRule)
.from(crawlSource)
.where(id, isEqualTo(sourceId))
.build()
.render(RenderingStrategies.MYBATIS3);
return crawlSourceMapper.selectMany(render).get(0);
}
@@ -182,10 +187,10 @@ public class CrawlServiceImpl implements CrawlService {
public PageBean<CrawlSingleTask> listCrawlSingleTaskByPage(int page, int pageSize) {
PageHelper.startPage(page, pageSize);
SelectStatementProvider render = select(CrawlSingleTaskDynamicSqlSupport.crawlSingleTask.allColumns())
.from(CrawlSingleTaskDynamicSqlSupport.crawlSingleTask)
.orderBy(CrawlSingleTaskDynamicSqlSupport.createTime.descending())
.build()
.render(RenderingStrategies.MYBATIS3);
.from(CrawlSingleTaskDynamicSqlSupport.crawlSingleTask)
.orderBy(CrawlSingleTaskDynamicSqlSupport.createTime.descending())
.build()
.render(RenderingStrategies.MYBATIS3);
List<CrawlSingleTask> crawlSingleTasks = crawlSingleTaskMapper.selectMany(render);
PageBean<CrawlSingleTask> pageBean = PageBuilder.build(crawlSingleTasks);
pageBean.setList(BeanUtil.copyList(crawlSingleTasks, CrawlSingleTaskVO.class));
@@ -200,7 +205,8 @@ public class CrawlServiceImpl implements CrawlService {
@Override
public CrawlSingleTask getCrawlSingleTask() {
List<CrawlSingleTask> list = crawlSingleTaskMapper.selectMany(select(CrawlSingleTaskDynamicSqlSupport.crawlSingleTask.allColumns())
List<CrawlSingleTask> list = crawlSingleTaskMapper.selectMany(
select(CrawlSingleTaskDynamicSqlSupport.crawlSingleTask.allColumns())
.from(CrawlSingleTaskDynamicSqlSupport.crawlSingleTask)
.where(CrawlSingleTaskDynamicSqlSupport.taskStatus, isEqualTo((byte) 2))
.orderBy(CrawlSingleTaskDynamicSqlSupport.createTime)
@@ -226,12 +232,12 @@ public class CrawlServiceImpl implements CrawlService {
@Override
public CrawlSource getCrawlSource(Integer id) {
Optional<CrawlSource> opt=crawlSourceMapper.selectByPrimaryKey(id);
if(opt.isPresent()) {
CrawlSource crawlSource =opt.get();
return crawlSource;
}
return null;
Optional<CrawlSource> opt = crawlSourceMapper.selectByPrimaryKey(id);
if (opt.isPresent()) {
CrawlSource crawlSource = opt.get();
return crawlSource;
}
return null;
}
/**
@@ -251,8 +257,8 @@ public class CrawlServiceImpl implements CrawlService {
if (StringUtils.isNotBlank(ruleBean.getCatIdRule().get("catId" + catId))) {
// Build the category list URL
String catBookListUrl = ruleBean.getBookListUrl()
.replace("{catId}", ruleBean.getCatIdRule().get("catId" + catId))
.replace("{page}", page + "");
.replace("{catId}", ruleBean.getCatIdRule().get("catId" + catId))
.replace("{page}", page + "");
String bookListHtml = getByHttpClientWithChrome(catBookListUrl);
if (bookListHtml != null) {
@@ -268,14 +274,12 @@ public class CrawlServiceImpl implements CrawlService {
return;
}
String bookId = bookIdMatcher.group(1);
parseBookAndSave(catId, ruleBean, sourceId, bookId);
} catch (Exception e) {
log.error(e.getMessage(), e);
}
isFindBookId = bookIdMatcher.find();
}
@@ -306,7 +310,7 @@ public class CrawlServiceImpl implements CrawlService {
final AtomicBoolean parseResult = new AtomicBoolean(false);
CrawlParser.parseBook(ruleBean, bookId, book -> {
crawlParser.parseBook(ruleBean, bookId, book -> {
if (book.getBookName() == null || book.getAuthorName() == null) {
return;
}
@@ -330,9 +334,11 @@ public class CrawlServiceImpl implements CrawlService {
book.setCrawlLastTime(new Date());
book.setId(idWorker.nextId());
// Parse the chapter index
boolean parseIndexContentResult = CrawlParser.parseBookIndexAndContent(bookId, book, ruleBean, new HashMap<>(0), chapter -> {
bookService.saveBookAndIndexAndContent(book, chapter.getBookIndexList(), chapter.getBookContentList());
});
boolean parseIndexContentResult = crawlParser.parseBookIndexAndContent(bookId, book, ruleBean,
new HashMap<>(0), chapter -> {
bookService.saveBookAndIndexAndContent(book, chapter.getBookIndexList(),
chapter.getBookContentList());
});
parseResult.set(parseIndexContentResult);
} else {
@@ -356,11 +362,12 @@ public class CrawlServiceImpl implements CrawlService {
@Override
public List<CrawlSource> queryCrawlSourceByStatus(Byte sourceStatus) {
SelectStatementProvider render = select(CrawlSourceDynamicSqlSupport.id, CrawlSourceDynamicSqlSupport.sourceStatus, CrawlSourceDynamicSqlSupport.crawlRule)
.from(crawlSource)
.where(CrawlSourceDynamicSqlSupport.sourceStatus, isEqualTo(sourceStatus))
.build()
.render(RenderingStrategies.MYBATIS3);
SelectStatementProvider render = select(CrawlSourceDynamicSqlSupport.id,
CrawlSourceDynamicSqlSupport.sourceStatus, CrawlSourceDynamicSqlSupport.crawlRule)
.from(crawlSource)
.where(CrawlSourceDynamicSqlSupport.sourceStatus, isEqualTo(sourceStatus))
.build()
.render(RenderingStrategies.MYBATIS3);
return crawlSourceMapper.selectMany(render);
}
}

View File

@@ -7,7 +7,7 @@ public class Constants {
/**
* Prefix for locally saved images
* */
*/
public static final String LOCAL_PIC_PREFIX = "/localPic/";
/**
@@ -23,5 +23,5 @@ public class Constants {
/**
* Number of retries after a failed HTTP request while crawling a novel
*/
public static final Integer HTTP_FAIL_RETRY_COUNT = 5;
public static final Integer HTTP_FAIL_RETRY_COUNT = 3;
}

View File

@@ -0,0 +1,57 @@
package com.java2nb.novel.utils;
import com.java2nb.novel.core.utils.HttpUtil;
import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.stereotype.Component;
import java.util.Objects;
import java.util.Random;
/**
* @author Administrator
*/
@Slf4j
@Component
public class CrawlHttpClient {
@Value("${crawl.interval.min}")
private Integer intervalMin;
@Value("${crawl.interval.max}")
private Integer intervalMax;
private final Random random = new Random();
private static final ThreadLocal<Integer> RETRY_COUNT = new ThreadLocal<>();
public String get(String url) {
if (Objects.nonNull(intervalMin) && Objects.nonNull(intervalMax) && intervalMax > intervalMin) {
try {
Thread.sleep(random.nextInt(intervalMax - intervalMin + 1) + intervalMin);
} catch (InterruptedException e) {
log.error(e.getMessage(), e);
}
}
String body = HttpUtil.getByHttpClientWithChrome(url);
if (Objects.isNull(body) || body.length() < Constants.INVALID_HTML_LENGTH) {
return processErrorHttpResult(url);
}
// HTML content fetched successfully
return body;
}
private String processErrorHttpResult(String url) {
Integer count = RETRY_COUNT.get();
if (count == null) {
count = 0;
}
if (count < Constants.HTTP_FAIL_RETRY_COUNT) {
RETRY_COUNT.set(++count);
return get(url);
}
RETRY_COUNT.remove();
return null;
}
}
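To make the behavior above concrete: each get() optionally sleeps a random interval between crawl.interval.min and crawl.interval.max, performs the request, and retries an empty or suspiciously short body up to Constants.HTTP_FAIL_RETRY_COUNT times before returning null. With the values in this commit (300-500 ms, 3 retries), a persistently failing URL costs at most four attempts and roughly 1.2 to 2 seconds of accumulated sleep. A minimal sketch of an injected caller, where the class name and URL are hypothetical:

import com.java2nb.novel.utils.CrawlHttpClient;
import lombok.RequiredArgsConstructor;
import org.springframework.stereotype.Service;

@Service
@RequiredArgsConstructor
public class ChapterFetcher { // hypothetical caller

    private final CrawlHttpClient crawlHttpClient; // the @Component defined above

    public void fetchIndex() {
        // Sleeps a random interval (per the configuration below), performs the request,
        // and retries internally on short or empty bodies before giving up with null.
        String html = crawlHttpClient.get("https://example.com/book/1/index.html"); // hypothetical URL
        if (html != null) {
            // parse the chapter list ...
        }
    }
}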

View File

@@ -14,12 +14,18 @@ admin:
username: admin
password: admin
#Number of threads for automatic crawler updates
#With few novels, or while the new-book import crawler is running, 1 is recommended
#Can be raised gradually as the number of novels grows, but should not exceed the CPU thread count
crawl:
update:
#Number of threads for automatic crawler updates
#With few novels, or while the new-book import crawler is running, 1 is recommended
#Can be raised gradually as the number of novels grows, but should not exceed the CPU thread count
thread: 1
# Crawl interval, in milliseconds
interval:
min: 300
max: 500
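As a side note (not part of this commit), the CPU thread count that the comments above suggest as an upper bound for crawl.update.thread can be read from the JVM directly:

public class CpuCountSketch {
    public static void main(String[] args) {
        // Logical processors visible to the JVM, the suggested ceiling for crawl.update.thread
        System.out.println(Runtime.getRuntime().availableProcessors());
    }
}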