diff --git a/novel-common/src/main/java/com/java2nb/novel/core/utils/FileUtil.java b/novel-common/src/main/java/com/java2nb/novel/core/utils/FileUtil.java
index f423fe0..cbe1b06 100644
--- a/novel-common/src/main/java/com/java2nb/novel/core/utils/FileUtil.java
+++ b/novel-common/src/main/java/com/java2nb/novel/core/utils/FileUtil.java
@@ -14,7 +14,10 @@
 import org.springframework.http.ResponseEntity;
 
 import javax.imageio.ImageIO;
 import java.awt.image.BufferedImage;
-import java.io.*;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.InputStream;
+import java.io.OutputStream;
 import java.util.Date;
 import java.util.Objects;
 
@@ -37,10 +40,13 @@ public class FileUtil {
             //本地图片保存
             HttpHeaders headers = new HttpHeaders();
             HttpEntity requestEntity = new HttpEntity<>(null, headers);
-            ResponseEntity resEntity = RestTemplateUtil.getInstance(Charsets.ISO_8859_1.name()).exchange(picSrc, HttpMethod.GET, requestEntity, Resource.class);
+            ResponseEntity resEntity = RestTemplates.newInstance(Charsets.ISO_8859_1.name())
+                .exchange(picSrc, HttpMethod.GET, requestEntity, Resource.class);
             input = Objects.requireNonNull(resEntity.getBody()).getInputStream();
             Date currentDate = new Date();
-            picSrc = visitPrefix + DateUtils.formatDate(currentDate, "yyyy") + "/" + DateUtils.formatDate(currentDate, "MM") + "/" + DateUtils.formatDate(currentDate, "dd") + "/"
+            picSrc =
+                visitPrefix + DateUtils.formatDate(currentDate, "yyyy") + "/" + DateUtils.formatDate(currentDate, "MM")
+                    + "/" + DateUtils.formatDate(currentDate, "dd") + "/"
                 + UUIDUtil.getUUID32() + picSrc.substring(picSrc.lastIndexOf("."));
 
             File picFile = new File(picSavePath + picSrc);
@@ -67,7 +73,6 @@ public class FileUtil {
             closeStream(input, out);
         }
-
         return picSrc;
     }
 
diff --git a/novel-common/src/main/java/com/java2nb/novel/core/utils/HttpUtil.java b/novel-common/src/main/java/com/java2nb/novel/core/utils/HttpUtil.java
index d1d65a1..d85f9c4 100644
--- a/novel-common/src/main/java/com/java2nb/novel/core/utils/HttpUtil.java
+++ b/novel-common/src/main/java/com/java2nb/novel/core/utils/HttpUtil.java
@@ -1,38 +1,24 @@
 package com.java2nb.novel.core.utils;
 
+import lombok.extern.slf4j.Slf4j;
 import org.springframework.http.*;
 import org.springframework.web.client.RestTemplate;
 
 /**
  * @author Administrator
  */
+@Slf4j
 public class HttpUtil {
 
-    private static RestTemplate restTemplate = RestTemplateUtil.getInstance("utf-8");
-
-
-    public static String getByHttpClient(String url) {
-        try {
-
-            ResponseEntity forEntity = restTemplate.getForEntity(url, String.class);
-            if (forEntity.getStatusCode() == HttpStatus.OK) {
-                return forEntity.getBody();
-            } else {
-                return null;
-            }
-        } catch (Exception e) {
-            e.printStackTrace();
-            return null;
-        }
-    }
+    private static final RestTemplate REST_TEMPLATE = RestTemplates.newInstance("utf-8");
 
     public static String getByHttpClientWithChrome(String url) {
         try {
-
             HttpHeaders headers = new HttpHeaders();
-            headers.add("user-agent","Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.67 Safari/537.36");
+            headers.add("user-agent",
+                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.67 Safari/537.36");
             HttpEntity requestEntity = new HttpEntity<>(null, headers);
-            ResponseEntity forEntity = restTemplate.exchange(url.toString(), HttpMethod.GET, requestEntity, String.class);
+            ResponseEntity forEntity = REST_TEMPLATE.exchange(url, HttpMethod.GET, requestEntity, String.class);
             if (forEntity.getStatusCode() == HttpStatus.OK) {
                 return forEntity.getBody();
@@ -40,8 +26,9 @@ public class HttpUtil {
                 return null;
             }
         } catch (Exception e) {
-            e.printStackTrace();
+            log.error(e.getMessage(), e);
             return null;
         }
     }
+
 }
diff --git a/novel-common/src/main/java/com/java2nb/novel/core/utils/RestTemplateUtil.java b/novel-common/src/main/java/com/java2nb/novel/core/utils/RestTemplates.java
similarity index 96%
rename from novel-common/src/main/java/com/java2nb/novel/core/utils/RestTemplateUtil.java
rename to novel-common/src/main/java/com/java2nb/novel/core/utils/RestTemplates.java
index d4d2684..80575ab 100644
--- a/novel-common/src/main/java/com/java2nb/novel/core/utils/RestTemplateUtil.java
+++ b/novel-common/src/main/java/com/java2nb/novel/core/utils/RestTemplates.java
@@ -26,16 +26,16 @@
 import java.util.List;
 import java.util.Objects;
 
 @Component
-public class RestTemplateUtil {
+public class RestTemplates {
 
     private static HttpProxyProperties httpProxyProperties;
 
-    RestTemplateUtil(HttpProxyProperties properties) {
+    RestTemplates(HttpProxyProperties properties) {
         httpProxyProperties = properties;
     }
 
     @SneakyThrows
-    public static RestTemplate getInstance(String charset) {
+    public static RestTemplate newInstance(String charset) {
 
         TrustStrategy acceptingTrustStrategy = (X509Certificate[] chain, String authType) -> true;
diff --git a/novel-crawl/src/main/java/com/java2nb/novel/core/crawl/CrawlParser.java b/novel-crawl/src/main/java/com/java2nb/novel/core/crawl/CrawlParser.java
index 7a76987..3068ffe 100644
--- a/novel-crawl/src/main/java/com/java2nb/novel/core/crawl/CrawlParser.java
+++ b/novel-crawl/src/main/java/com/java2nb/novel/core/crawl/CrawlParser.java
@@ -1,23 +1,23 @@
 package com.java2nb.novel.core.crawl;
 
-import com.java2nb.novel.core.utils.HttpUtil;
 import com.java2nb.novel.core.utils.RandomBookInfoUtil;
-import com.java2nb.novel.core.utils.RestTemplateUtil;
 import com.java2nb.novel.core.utils.StringUtil;
 import com.java2nb.novel.entity.Book;
 import com.java2nb.novel.entity.BookContent;
 import com.java2nb.novel.entity.BookIndex;
 import com.java2nb.novel.utils.Constants;
+import com.java2nb.novel.utils.CrawlHttpClient;
 import io.github.xxyopen.util.IdWorker;
+import lombok.RequiredArgsConstructor;
 import lombok.SneakyThrows;
-import lombok.extern.slf4j.Slf4j;
 import org.apache.commons.lang3.StringUtils;
-import org.springframework.http.HttpStatus;
-import org.springframework.http.ResponseEntity;
-import org.springframework.web.client.RestTemplate;
+import org.springframework.stereotype.Component;
 
 import java.text.SimpleDateFormat;
-import java.util.*;
+import java.util.ArrayList;
+import java.util.Date;
+import java.util.List;
+import java.util.Map;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
@@ -26,20 +26,19 @@
  *
  * @author Administrator
  */
-@Slf4j
+@Component
+@RequiredArgsConstructor
 public class CrawlParser {
 
-    private static final IdWorker idWorker = IdWorker.INSTANCE;
+    private final IdWorker ID_WORKER = IdWorker.INSTANCE;
 
-    private static final RestTemplate restTemplate = RestTemplateUtil.getInstance("utf-8");
-
-    private static final ThreadLocal retryCount = new ThreadLocal<>();
+    private final CrawlHttpClient crawlHttpClient;
 
     @SneakyThrows
-    public static void parseBook(RuleBean ruleBean, String bookId, CrawlBookHandler handler) {
+    public void parseBook(RuleBean ruleBean, String bookId, CrawlBookHandler handler) {
         Book book = new Book();
         String bookDetailUrl = ruleBean.getBookDetailUrl().replace("{bookId}", bookId);
-        String bookDetailHtml = getByHttpClientWithChrome(bookDetailUrl);
+        String bookDetailHtml = crawlHttpClient.get(bookDetailUrl);
         if (bookDetailHtml != null) {
             Pattern bookNamePatten = PatternFactory.getPattern(ruleBean.getBookNamePatten());
             Matcher bookNameMatch = bookNamePatten.matcher(bookDetailHtml);
@@ -144,7 +143,7 @@ public class CrawlParser {
         handler.handle(book);
     }
 
-    public static boolean parseBookIndexAndContent(String sourceBookId, Book book, RuleBean ruleBean,
+    public boolean parseBookIndexAndContent(String sourceBookId, Book book, RuleBean ruleBean,
         Map existBookIndexMap, CrawlBookChapterHandler handler) {
 
         Date currentDate = new Date();
@@ -153,7 +152,7 @@ public class CrawlParser {
         List contentList = new ArrayList<>();
         //读取目录
         String indexListUrl = ruleBean.getBookIndexUrl().replace("{bookId}", sourceBookId);
-        String indexListHtml = getByHttpClientWithChrome(indexListUrl);
+        String indexListHtml = crawlHttpClient.get(indexListUrl);
         if (indexListHtml != null) {
 
             if (StringUtils.isNotBlank(ruleBean.getBookIndexStart())) {
@@ -217,7 +216,7 @@ public class CrawlParser {
                     .replace("{indexId}", sourceIndexId);
 
                 //查询章节内容
-                String contentHtml = getByHttpClientWithChrome(contentUrl);
+                String contentHtml = crawlHttpClient.get(contentUrl);
                 if (contentHtml != null && !contentHtml.contains("正在手打中")) {
                     String content = contentHtml.substring(
                         contentHtml.indexOf(ruleBean.getContentStart()) + ruleBean.getContentStart().length());
@@ -254,7 +253,7 @@ public class CrawlParser {
                     } else {
                         //章节插入
                         //设置目录和章节内容
-                        Long indexId = idWorker.nextId();
+                        Long indexId = ID_WORKER.nextId();
                         bookIndex.setId(indexId);
                         bookIndex.setBookId(book.getId());
@@ -308,56 +307,4 @@ public class CrawlParser {
         return false;
     }
-
-
-    private static String getByHttpClient(String url) {
-        try {
-            ResponseEntity forEntity = restTemplate.getForEntity(url, String.class);
-            if (forEntity.getStatusCode() == HttpStatus.OK) {
-                String body = forEntity.getBody();
-                assert body != null;
-                if (body.length() < Constants.INVALID_HTML_LENGTH) {
-                    return processErrorHttpResult(url);
-                }
-                //成功获得html内容
-                return body;
-            }
-        } catch (Exception e) {
-            e.printStackTrace();
-        }
-        return processErrorHttpResult(url);
-
-    }
-
-    private static String getByHttpClientWithChrome(String url) {
-        try {
-
-            String body = HttpUtil.getByHttpClientWithChrome(url);
-            if (body != null && body.length() < Constants.INVALID_HTML_LENGTH) {
-                return processErrorHttpResult(url);
-            }
-            //成功获得html内容
-            return body;
-        } catch (Exception e) {
-            e.printStackTrace();
-        }
-        return processErrorHttpResult(url);
-
-    }
-
-    @SneakyThrows
-    private static String processErrorHttpResult(String url) {
-        Integer count = retryCount.get();
-        if (count == null) {
-            count = 0;
-        }
-        if (count < Constants.HTTP_FAIL_RETRY_COUNT) {
-            Thread.sleep(new Random().nextInt(10 * 1000));
-            retryCount.set(++count);
-            return getByHttpClient(url);
-        }
-        return null;
-    }
-
-
 }
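Note on the removed helpers above: the old private retry path slept a random 0 to 10 s (Thread.sleep(new Random().nextInt(10 * 1000))) before each of up to five retries, whereas the CrawlHttpClient component introduced later in this patch throttles every request through the new crawl.interval.min/max properties and caps retries at three (the updated HTTP_FAIL_RETRY_COUNT). CrawlParser now receives that client by constructor injection via @RequiredArgsConstructor. A minimal sketch of how a caller is wired under Spring; the class below is illustrative only and not part of the patch:

    import com.java2nb.novel.utils.CrawlHttpClient;
    import lombok.RequiredArgsConstructor;
    import org.springframework.stereotype.Component;

    // Hypothetical caller, shown only to illustrate the new injection-based usage.
    @Component
    @RequiredArgsConstructor
    public class ExampleCrawlCaller {

        private final CrawlHttpClient crawlHttpClient; // filled in by the Lombok-generated constructor

        public String fetch(String url) {
            // Sleeps a random crawl.interval.min..max ms (when both are configured), fetches the page
            // with the Chrome user-agent, and retries an unusable response up to HTTP_FAIL_RETRY_COUNT times.
            return crawlHttpClient.get(url);
        }
    }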
diff --git a/novel-crawl/src/main/java/com/java2nb/novel/core/listener/StarterListener.java b/novel-crawl/src/main/java/com/java2nb/novel/core/listener/StarterListener.java
index ad57357..1fd998c 100644
--- a/novel-crawl/src/main/java/com/java2nb/novel/core/listener/StarterListener.java
+++ b/novel-crawl/src/main/java/com/java2nb/novel/core/listener/StarterListener.java
@@ -1,10 +1,12 @@
 package com.java2nb.novel.core.listener;
 
 import com.fasterxml.jackson.databind.ObjectMapper;
-import com.java2nb.novel.core.crawl.ChapterBean;
 import com.java2nb.novel.core.crawl.CrawlParser;
 import com.java2nb.novel.core.crawl.RuleBean;
-import com.java2nb.novel.entity.*;
+import com.java2nb.novel.entity.Book;
+import com.java2nb.novel.entity.BookIndex;
+import com.java2nb.novel.entity.CrawlSingleTask;
+import com.java2nb.novel.entity.CrawlSource;
 import com.java2nb.novel.service.BookService;
 import com.java2nb.novel.service.CrawlService;
 import com.java2nb.novel.utils.Constants;
@@ -33,6 +35,8 @@ public class StarterListener implements ServletContextListener {
 
     private final CrawlService crawlService;
 
+    private final CrawlParser crawlParser;
+
     @Value("${crawl.update.thread}")
     private int updateThreadCount;
 
@@ -56,20 +60,24 @@ public class StarterListener implements ServletContextListener {
                         CrawlSource source = crawlService.queryCrawlSource(needUpdateBook.getCrawlSourceId());
                         RuleBean ruleBean = new ObjectMapper().readValue(source.getCrawlRule(), RuleBean.class);
                         //解析小说基本信息
-                        CrawlParser.parseBook(ruleBean, needUpdateBook.getCrawlBookId(),book -> {
+                        crawlParser.parseBook(ruleBean, needUpdateBook.getCrawlBookId(), book -> {
                             //这里只做老书更新
                             book.setId(needUpdateBook.getId());
                             book.setWordCount(needUpdateBook.getWordCount());
-                            if (needUpdateBook.getPicUrl() != null && needUpdateBook.getPicUrl().contains(Constants.LOCAL_PIC_PREFIX)) {
+                            if (needUpdateBook.getPicUrl() != null && needUpdateBook.getPicUrl()
+                                .contains(Constants.LOCAL_PIC_PREFIX)) {
                                 //本地图片则不更新
                                 book.setPicUrl(null);
                             }
                             //查询已存在的章节
-                            Map existBookIndexMap = bookService.queryExistBookIndexMap(needUpdateBook.getId());
+                            Map existBookIndexMap = bookService.queryExistBookIndexMap(
+                                needUpdateBook.getId());
                             //解析章节目录
-                            CrawlParser.parseBookIndexAndContent(needUpdateBook.getCrawlBookId(), book, ruleBean, existBookIndexMap,chapter -> {
-                                bookService.updateBookAndIndexAndContent(book, chapter.getBookIndexList(), chapter.getBookContentList(), existBookIndexMap);
-                            });
+                            crawlParser.parseBookIndexAndContent(needUpdateBook.getCrawlBookId(), book,
+                                ruleBean, existBookIndexMap, chapter -> {
+                                    bookService.updateBookAndIndexAndContent(book, chapter.getBookIndexList(),
+                                        chapter.getBookContentList(), existBookIndexMap);
+                                });
                         });
                     } catch (Exception e) {
                         log.error(e.getMessage(), e);
@@ -88,7 +96,6 @@ public class StarterListener implements ServletContextListener {
             }
 
-
         new Thread(() -> {
             log.info("程序启动,开始执行单本采集任务线程。。。");
             while (true) {
@@ -103,7 +110,8 @@ public class StarterListener implements ServletContextListener {
                         CrawlSource source = crawlService.queryCrawlSource(task.getSourceId());
                         RuleBean ruleBean = new ObjectMapper().readValue(source.getCrawlRule(), RuleBean.class);
 
-                        if (crawlService.parseBookAndSave(task.getCatId(), ruleBean, task.getSourceId(), task.getSourceBookId())) {
+                        if (crawlService.parseBookAndSave(task.getCatId(), ruleBean, task.getSourceId(),
+                            task.getSourceBookId())) {
                             //采集成功
                             crawlStatus = 1;
                         }
diff --git a/novel-crawl/src/main/java/com/java2nb/novel/service/impl/CrawlServiceImpl.java b/novel-crawl/src/main/java/com/java2nb/novel/service/impl/CrawlServiceImpl.java
index d8b02b8..9341b98 100644
--- a/novel-crawl/src/main/java/com/java2nb/novel/service/impl/CrawlServiceImpl.java
+++ b/novel-crawl/src/main/java/com/java2nb/novel/service/impl/CrawlServiceImpl.java
@@ -2,17 +2,11 @@ package com.java2nb.novel.service.impl;
 
 import com.fasterxml.jackson.databind.ObjectMapper;
 import com.github.pagehelper.PageHelper;
-import io.github.xxyopen.model.page.PageBean;
 import com.java2nb.novel.core.cache.CacheKey;
 import com.java2nb.novel.core.cache.CacheService;
 import com.java2nb.novel.core.crawl.CrawlParser;
 import com.java2nb.novel.core.crawl.RuleBean;
 import com.java2nb.novel.core.enums.ResponseStatus;
-import io.github.xxyopen.model.page.builder.pagehelper.PageBuilder;
-import io.github.xxyopen.util.IdWorker;
-import io.github.xxyopen.util.ThreadUtil;
-import io.github.xxyopen.web.exception.BusinessException;
-import io.github.xxyopen.web.util.BeanUtil;
 import com.java2nb.novel.entity.Book;
 import com.java2nb.novel.entity.CrawlSingleTask;
 import com.java2nb.novel.entity.CrawlSource;
@@ -24,6 +18,12 @@ import com.java2nb.novel.service.BookService;
 import com.java2nb.novel.service.CrawlService;
 import com.java2nb.novel.vo.CrawlSingleTaskVO;
 import com.java2nb.novel.vo.CrawlSourceVO;
+import io.github.xxyopen.model.page.PageBean;
+import io.github.xxyopen.model.page.builder.pagehelper.PageBuilder;
+import io.github.xxyopen.util.IdWorker;
+import io.github.xxyopen.util.ThreadUtil;
+import io.github.xxyopen.web.exception.BusinessException;
+import io.github.xxyopen.web.util.BeanUtil;
 import io.github.xxyopen.web.util.SpringUtil;
 import lombok.RequiredArgsConstructor;
 import lombok.SneakyThrows;
@@ -51,6 +51,7 @@ import static org.mybatis.dynamic.sql.select.SelectDSL.select;
 @Slf4j
 public class CrawlServiceImpl implements CrawlService {
 
+    private final CrawlParser crawlParser;
 
     private final CrawlSourceMapper crawlSourceMapper;
@@ -71,15 +72,16 @@ public class CrawlServiceImpl implements CrawlService {
         crawlSourceMapper.insertSelective(source);
     }
 
+    @Override
     public void updateCrawlSource(CrawlSource source) {
-        if(source.getId()!=null){
-            Optional opt=crawlSourceMapper.selectByPrimaryKey(source.getId());
-            if(opt.isPresent()) {
-                CrawlSource crawlSource =opt.get();
+        if (source.getId() != null) {
+            Optional opt = crawlSourceMapper.selectByPrimaryKey(source.getId());
+            if (opt.isPresent()) {
+                CrawlSource crawlSource = opt.get();
                 if (crawlSource.getSourceStatus() == (byte) 1) {
                     //关闭
-                    openOrCloseCrawl(crawlSource.getId(),(byte)0);
+                    openOrCloseCrawl(crawlSource.getId(), (byte) 0);
                 }
                 Date currentDate = new Date();
                 crawlSource.setUpdateTime(currentDate);
@@ -89,14 +91,15 @@ public class CrawlServiceImpl implements CrawlService {
             }
         }
     }
 
+    @Override
     public PageBean listCrawlByPage(int page, int pageSize) {
         PageHelper.startPage(page, pageSize);
         SelectStatementProvider render = select(id, sourceName, sourceStatus, createTime, updateTime)
-                .from(crawlSource)
-                .orderBy(updateTime)
-                .build()
-                .render(RenderingStrategies.MYBATIS3);
+            .from(crawlSource)
+            .orderBy(updateTime)
+            .build()
+            .render(RenderingStrategies.MYBATIS3);
         List crawlSources = crawlSourceMapper.selectMany(render);
         PageBean pageBean = PageBuilder.build(crawlSources);
         pageBean.setList(BeanUtil.copyList(crawlSources, CrawlSourceVO.class));
@@ -113,7 +116,8 @@ public class CrawlServiceImpl implements CrawlService {
         if (sourceStatus == (byte) 0) {
             //关闭,直接修改数据库状态,并直接修改数据库状态后获取该爬虫正在运行的线程集合全部停止
             SpringUtil.getBean(CrawlService.class).updateCrawlSourceStatus(sourceId, sourceStatus);
-            Set runningCrawlThreadId = (Set) cacheService.getObject(CacheKey.RUNNING_CRAWL_THREAD_KEY_PREFIX + sourceId);
+            Set runningCrawlThreadId = (Set) cacheService.getObject(
+                CacheKey.RUNNING_CRAWL_THREAD_KEY_PREFIX + sourceId);
             if (runningCrawlThreadId != null) {
                 for (Long ThreadId : runningCrawlThreadId) {
                     Thread thread = ThreadUtil.findThread(ThreadId);
@@ -157,11 +161,12 @@ public class CrawlServiceImpl implements CrawlService {
 
     @Override
     public CrawlSource queryCrawlSource(Integer sourceId) {
-        SelectStatementProvider render = select(CrawlSourceDynamicSqlSupport.sourceStatus, CrawlSourceDynamicSqlSupport.crawlRule)
-                .from(crawlSource)
-                .where(id, isEqualTo(sourceId))
-                .build()
-                .render(RenderingStrategies.MYBATIS3);
+        SelectStatementProvider render = select(CrawlSourceDynamicSqlSupport.sourceStatus,
+            CrawlSourceDynamicSqlSupport.crawlRule)
+            .from(crawlSource)
+            .where(id, isEqualTo(sourceId))
+            .build()
+            .render(RenderingStrategies.MYBATIS3);
         return crawlSourceMapper.selectMany(render).get(0);
     }
 
@@ -182,10 +187,10 @@ public class CrawlServiceImpl implements CrawlService {
     public PageBean listCrawlSingleTaskByPage(int page, int pageSize) {
         PageHelper.startPage(page, pageSize);
         SelectStatementProvider render = select(CrawlSingleTaskDynamicSqlSupport.crawlSingleTask.allColumns())
-                .from(CrawlSingleTaskDynamicSqlSupport.crawlSingleTask)
-                .orderBy(CrawlSingleTaskDynamicSqlSupport.createTime.descending())
-                .build()
-                .render(RenderingStrategies.MYBATIS3);
+            .from(CrawlSingleTaskDynamicSqlSupport.crawlSingleTask)
+            .orderBy(CrawlSingleTaskDynamicSqlSupport.createTime.descending())
+            .build()
+            .render(RenderingStrategies.MYBATIS3);
         List crawlSingleTasks = crawlSingleTaskMapper.selectMany(render);
         PageBean pageBean = PageBuilder.build(crawlSingleTasks);
         pageBean.setList(BeanUtil.copyList(crawlSingleTasks, CrawlSingleTaskVO.class));
@@ -200,7 +205,8 @@ public class CrawlServiceImpl implements CrawlService {
 
     @Override
     public CrawlSingleTask getCrawlSingleTask() {
-        List list = crawlSingleTaskMapper.selectMany(select(CrawlSingleTaskDynamicSqlSupport.crawlSingleTask.allColumns())
+        List list = crawlSingleTaskMapper.selectMany(
+            select(CrawlSingleTaskDynamicSqlSupport.crawlSingleTask.allColumns())
                 .from(CrawlSingleTaskDynamicSqlSupport.crawlSingleTask)
                 .where(CrawlSingleTaskDynamicSqlSupport.taskStatus, isEqualTo((byte) 2))
                 .orderBy(CrawlSingleTaskDynamicSqlSupport.createTime)
@@ -226,12 +232,12 @@ public class CrawlServiceImpl implements CrawlService {
 
     @Override
     public CrawlSource getCrawlSource(Integer id) {
-        Optional opt=crawlSourceMapper.selectByPrimaryKey(id);
-        if(opt.isPresent()) {
-            CrawlSource crawlSource =opt.get();
-            return crawlSource;
-        }
-        return null;
+        Optional opt = crawlSourceMapper.selectByPrimaryKey(id);
+        if (opt.isPresent()) {
+            CrawlSource crawlSource = opt.get();
+            return crawlSource;
+        }
+        return null;
     }
 
     /**
@@ -251,8 +257,8 @@ public class CrawlServiceImpl implements CrawlService {
             if (StringUtils.isNotBlank(ruleBean.getCatIdRule().get("catId" + catId))) {
                 //拼接分类URL
                 String catBookListUrl = ruleBean.getBookListUrl()
-                        .replace("{catId}", ruleBean.getCatIdRule().get("catId" + catId))
-                        .replace("{page}", page + "");
+                    .replace("{catId}", ruleBean.getCatIdRule().get("catId" + catId))
+                    .replace("{page}", page + "");
                 String bookListHtml = getByHttpClientWithChrome(catBookListUrl);
 
                 if (bookListHtml != null) {
@@ -268,14 +274,12 @@ public class CrawlServiceImpl implements CrawlService {
                                 return;
                             }
 
-
                             String bookId = bookIdMatcher.group(1);
                             parseBookAndSave(catId, ruleBean, sourceId, bookId);
                         } catch (Exception e) {
                             log.error(e.getMessage(), e);
                         }
-
                         isFindBookId = bookIdMatcher.find();
                     }
@@ -306,7 +310,7 @@ public class CrawlServiceImpl implements CrawlService {
 
         final AtomicBoolean parseResult = new AtomicBoolean(false);
 
-        CrawlParser.parseBook(ruleBean, bookId, book -> {
+        crawlParser.parseBook(ruleBean, bookId, book -> {
             if (book.getBookName() == null || book.getAuthorName() == null) {
                 return;
             }
@@ -330,9 +334,11 @@ public class CrawlServiceImpl implements CrawlService {
                 book.setCrawlLastTime(new Date());
                 book.setId(idWorker.nextId());
                 //解析章节目录
-                boolean parseIndexContentResult = CrawlParser.parseBookIndexAndContent(bookId, book, ruleBean, new HashMap<>(0), chapter -> {
-                    bookService.saveBookAndIndexAndContent(book, chapter.getBookIndexList(), chapter.getBookContentList());
-                });
+                boolean parseIndexContentResult = crawlParser.parseBookIndexAndContent(bookId, book, ruleBean,
+                    new HashMap<>(0), chapter -> {
+                        bookService.saveBookAndIndexAndContent(book, chapter.getBookIndexList(),
+                            chapter.getBookContentList());
+                    });
                 parseResult.set(parseIndexContentResult);
 
             } else {
@@ -356,11 +362,12 @@ public class CrawlServiceImpl implements CrawlService {
 
     @Override
     public List queryCrawlSourceByStatus(Byte sourceStatus) {
-        SelectStatementProvider render = select(CrawlSourceDynamicSqlSupport.id, CrawlSourceDynamicSqlSupport.sourceStatus, CrawlSourceDynamicSqlSupport.crawlRule)
-                .from(crawlSource)
-                .where(CrawlSourceDynamicSqlSupport.sourceStatus, isEqualTo(sourceStatus))
-                .build()
-                .render(RenderingStrategies.MYBATIS3);
+        SelectStatementProvider render = select(CrawlSourceDynamicSqlSupport.id,
+            CrawlSourceDynamicSqlSupport.sourceStatus, CrawlSourceDynamicSqlSupport.crawlRule)
+            .from(crawlSource)
+            .where(CrawlSourceDynamicSqlSupport.sourceStatus, isEqualTo(sourceStatus))
+            .build()
+            .render(RenderingStrategies.MYBATIS3);
         return crawlSourceMapper.selectMany(render);
     }
 }
diff --git a/novel-crawl/src/main/java/com/java2nb/novel/utils/Constants.java b/novel-crawl/src/main/java/com/java2nb/novel/utils/Constants.java
index d278c19..5d3c59c 100644
--- a/novel-crawl/src/main/java/com/java2nb/novel/utils/Constants.java
+++ b/novel-crawl/src/main/java/com/java2nb/novel/utils/Constants.java
@@ -7,7 +7,7 @@ public class Constants {
 
     /**
      * 本地图片保存前缀
-     * */
+     */
    public static final String LOCAL_PIC_PREFIX = "/localPic/";
 
     /**
@@ -23,5 +23,5 @@ public class Constants {
     /**
     * 爬取小说http请求失败重试次数
     */
-    public static final Integer HTTP_FAIL_RETRY_COUNT = 5;
+    public static final Integer HTTP_FAIL_RETRY_COUNT = 3;
 }
diff --git a/novel-crawl/src/main/java/com/java2nb/novel/utils/CrawlHttpClient.java b/novel-crawl/src/main/java/com/java2nb/novel/utils/CrawlHttpClient.java
new file mode 100644
index 0000000..515f005
--- /dev/null
+++ b/novel-crawl/src/main/java/com/java2nb/novel/utils/CrawlHttpClient.java
@@ -0,0 +1,57 @@
+package com.java2nb.novel.utils;
+
+import com.java2nb.novel.core.utils.HttpUtil;
+import lombok.extern.slf4j.Slf4j;
+import org.springframework.beans.factory.annotation.Value;
+import org.springframework.stereotype.Component;
+
+import java.util.Objects;
+import java.util.Random;
+
+/**
+ * @author Administrator
+ */
+@Slf4j
+@Component
+public class CrawlHttpClient {
+
+    @Value("${crawl.interval.min}")
+    private Integer intervalMin;
+
+    @Value("${crawl.interval.max}")
+    private Integer intervalMax;
+
+    private final Random random = new Random();
+
+    private static final ThreadLocal RETRY_COUNT = new ThreadLocal<>();
+
+    public String get(String url) {
+        if (Objects.nonNull(intervalMin) && Objects.nonNull(intervalMax) && intervalMax > intervalMin) {
+            try {
+                Thread.sleep(random.nextInt(intervalMax - intervalMin + 1) + intervalMin);
+            } catch (InterruptedException e) {
+                log.error(e.getMessage(), e);
+            }
+        }
+        String body = HttpUtil.getByHttpClientWithChrome(url);
+        if (Objects.isNull(body) || body.length() < Constants.INVALID_HTML_LENGTH) {
+            return processErrorHttpResult(url);
+        }
+        //成功获得html内容
+        return body;
+    }
+
+    private String processErrorHttpResult(String url) {
+        Integer count = RETRY_COUNT.get();
+        if (count == null) {
+            count = 0;
+        }
+        if (count < Constants.HTTP_FAIL_RETRY_COUNT) {
+            RETRY_COUNT.set(++count);
+            return get(url);
+        }
+        RETRY_COUNT.remove();
+        return null;
+    }
+
+}
diff --git a/novel-crawl/src/main/resources/application.yml b/novel-crawl/src/main/resources/application.yml
index 132a14e..b4908fe 100644
--- a/novel-crawl/src/main/resources/application.yml
+++ b/novel-crawl/src/main/resources/application.yml
@@ -14,12 +14,18 @@
 admin:
   username: admin
   password: admin
 
-#爬虫自动更新的线程数
-#建议小说数量不多或者正在运行新书入库爬虫的情况下设置为1即可
-#随着小说数量的增多可以逐渐增加,但建议不要超出CPU的线程数
+
+
 crawl:
   update:
+    #爬虫自动更新的线程数
+    #建议小说数量不多或者正在运行新书入库爬虫的情况下设置为1即可
+    #随着小说数量的增多可以逐渐增加,但建议不要超出CPU的线程数
     thread: 1
+  # 采集间隔时间,单位:毫秒
+  interval:
+    min: 300
+    max: 500
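With the sample values above, every request that goes through CrawlHttpClient is preceded by a random pause of 300 to 500 ms, and a response that is null or shorter than Constants.INVALID_HTML_LENGTH is re-fetched up to HTTP_FAIL_RETRY_COUNT (now 3) more times, with the retry counter tracked per thread in a ThreadLocal. A rough worst case for a single URL under those settings:

    attempts       = 1 initial + 3 retries  = 4
    throttle delay = 4 x (300 to 500 ms)    = roughly 1.2 to 2.0 s
    (network time and failure detection are not included)

Raising crawl.interval.min/max therefore slows the crawler down proportionally without changing HTTP_FAIL_RETRY_COUNT.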