Mirror of https://github.com/201206030/novel-plus.git, synced 2025-04-26 17:20:52 +00:00

Commit a8219253e9 (parent 5c35f7af0a)

优化 (optimization): replace the static RestTemplateUtil/HttpUtil crawl helpers with a renamed RestTemplates factory and a new Spring-managed CrawlHttpClient that throttles and retries requests, turn CrawlParser into an injectable component, lower the HTTP retry count from 5 to 3, and clean up formatting across the crawler module.
FileUtil.java

@@ -14,7 +14,10 @@ import org.springframework.http.ResponseEntity;
 import javax.imageio.ImageIO;
 import java.awt.image.BufferedImage;
-import java.io.*;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.InputStream;
+import java.io.OutputStream;
 import java.util.Date;
 import java.util.Objects;
@@ -37,10 +40,13 @@ public class FileUtil {
            // save the image locally
            HttpHeaders headers = new HttpHeaders();
            HttpEntity<String> requestEntity = new HttpEntity<>(null, headers);
-           ResponseEntity<Resource> resEntity = RestTemplateUtil.getInstance(Charsets.ISO_8859_1.name()).exchange(picSrc, HttpMethod.GET, requestEntity, Resource.class);
+           ResponseEntity<Resource> resEntity = RestTemplates.newInstance(Charsets.ISO_8859_1.name())
+               .exchange(picSrc, HttpMethod.GET, requestEntity, Resource.class);
            input = Objects.requireNonNull(resEntity.getBody()).getInputStream();
            Date currentDate = new Date();
-           picSrc = visitPrefix + DateUtils.formatDate(currentDate, "yyyy") + "/" + DateUtils.formatDate(currentDate, "MM") + "/" + DateUtils.formatDate(currentDate, "dd") + "/"
+           picSrc =
+               visitPrefix + DateUtils.formatDate(currentDate, "yyyy") + "/" + DateUtils.formatDate(currentDate, "MM")
+                   + "/" + DateUtils.formatDate(currentDate, "dd") + "/"
                + UUIDUtil.getUUID32()
                + picSrc.substring(picSrc.lastIndexOf("."));
            File picFile = new File(picSavePath + picSrc);
@@ -67,7 +73,6 @@ public class FileUtil {
                closeStream(input, out);
            }
-
            return picSrc;
        }
HttpUtil.java

@@ -1,38 +1,24 @@
 package com.java2nb.novel.core.utils;

 import lombok.extern.slf4j.Slf4j;
 import org.springframework.http.*;
 import org.springframework.web.client.RestTemplate;

 /**
  * @author Administrator
  */
 @Slf4j
 public class HttpUtil {

-    private static RestTemplate restTemplate = RestTemplateUtil.getInstance("utf-8");
-
-    public static String getByHttpClient(String url) {
-        try {
-            ResponseEntity<String> forEntity = restTemplate.getForEntity(url, String.class);
-            if (forEntity.getStatusCode() == HttpStatus.OK) {
-                return forEntity.getBody();
-            } else {
-                return null;
-            }
-        } catch (Exception e) {
-            e.printStackTrace();
-            return null;
-        }
-    }
+    private static final RestTemplate REST_TEMPLATE = RestTemplates.newInstance("utf-8");

     public static String getByHttpClientWithChrome(String url) {
         try {
             HttpHeaders headers = new HttpHeaders();
-            headers.add("user-agent","Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.67 Safari/537.36");
+            headers.add("user-agent",
+                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.67 Safari/537.36");
             HttpEntity<String> requestEntity = new HttpEntity<>(null, headers);
-            ResponseEntity<String> forEntity = restTemplate.exchange(url.toString(), HttpMethod.GET, requestEntity, String.class);
+            ResponseEntity<String> forEntity = REST_TEMPLATE.exchange(url, HttpMethod.GET, requestEntity, String.class);

             if (forEntity.getStatusCode() == HttpStatus.OK) {
                 return forEntity.getBody();
@@ -40,8 +26,9 @@ public class HttpUtil {
                 return null;
             }
         } catch (Exception e) {
-            e.printStackTrace();
+            log.error(e.getMessage(), e);
             return null;
         }
     }

 }
RestTemplateUtil.java → RestTemplates.java

@@ -26,16 +26,16 @@ import java.util.List;
 import java.util.Objects;

 @Component
-public class RestTemplateUtil {
+public class RestTemplates {

     private static HttpProxyProperties httpProxyProperties;

-    RestTemplateUtil(HttpProxyProperties properties) {
+    RestTemplates(HttpProxyProperties properties) {
         httpProxyProperties = properties;
     }

     @SneakyThrows
-    public static RestTemplate getInstance(String charset) {
+    public static RestTemplate newInstance(String charset) {

         TrustStrategy acceptingTrustStrategy = (X509Certificate[] chain, String authType) -> true;
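Call sites accordingly move from RestTemplateUtil.getInstance(charset) to RestTemplates.newInstance(charset). For orientation, a minimal sketch of a charset-aware factory in the spirit of this class, assuming a plain request factory; the real newInstance also wires the trust-all TrustStrategy and HttpProxyProperties visible in the hunk:

```java
import org.springframework.http.client.SimpleClientHttpRequestFactory;
import org.springframework.http.converter.StringHttpMessageConverter;
import org.springframework.web.client.RestTemplate;

import java.nio.charset.Charset;

// Sketch only: a charset-aware RestTemplate factory resembling
// RestTemplates.newInstance(charset). SSL trust-all and proxy setup omitted.
public final class RestTemplatesSketch {

    public static RestTemplate newInstance(String charset) {
        RestTemplate template = new RestTemplate(new SimpleClientHttpRequestFactory());
        // Swap the default String converter so response bodies are decoded with
        // the caller-supplied charset (e.g. "utf-8", or ISO-8859-1 in FileUtil).
        template.getMessageConverters().replaceAll(converter ->
                converter instanceof StringHttpMessageConverter
                        ? new StringHttpMessageConverter(Charset.forName(charset))
                        : converter);
        return template;
    }
}
```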
CrawlParser.java

@@ -1,23 +1,23 @@
 package com.java2nb.novel.core.crawl;

-import com.java2nb.novel.core.utils.HttpUtil;
 import com.java2nb.novel.core.utils.RandomBookInfoUtil;
-import com.java2nb.novel.core.utils.RestTemplateUtil;
 import com.java2nb.novel.core.utils.StringUtil;
 import com.java2nb.novel.entity.Book;
 import com.java2nb.novel.entity.BookContent;
 import com.java2nb.novel.entity.BookIndex;
 import com.java2nb.novel.utils.Constants;
+import com.java2nb.novel.utils.CrawlHttpClient;
 import io.github.xxyopen.util.IdWorker;
+import lombok.RequiredArgsConstructor;
 import lombok.SneakyThrows;
 import lombok.extern.slf4j.Slf4j;
 import org.apache.commons.lang3.StringUtils;
-import org.springframework.http.HttpStatus;
-import org.springframework.http.ResponseEntity;
-import org.springframework.web.client.RestTemplate;
+import org.springframework.stereotype.Component;

 import java.text.SimpleDateFormat;
-import java.util.*;
+import java.util.ArrayList;
+import java.util.Date;
+import java.util.List;
+import java.util.Map;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
@@ -26,20 +26,19 @@ import java.util.regex.Pattern;
  *
  * @author Administrator
  */
 @Slf4j
+@Component
+@RequiredArgsConstructor
 public class CrawlParser {

-    private static final IdWorker idWorker = IdWorker.INSTANCE;
+    private final IdWorker ID_WORKER = IdWorker.INSTANCE;

-    private static final RestTemplate restTemplate = RestTemplateUtil.getInstance("utf-8");
-
-    private static final ThreadLocal<Integer> retryCount = new ThreadLocal<>();
+    private final CrawlHttpClient crawlHttpClient;

     @SneakyThrows
-    public static void parseBook(RuleBean ruleBean, String bookId, CrawlBookHandler handler) {
+    public void parseBook(RuleBean ruleBean, String bookId, CrawlBookHandler handler) {
         Book book = new Book();
         String bookDetailUrl = ruleBean.getBookDetailUrl().replace("{bookId}", bookId);
-        String bookDetailHtml = getByHttpClientWithChrome(bookDetailUrl);
+        String bookDetailHtml = crawlHttpClient.get(bookDetailUrl);
         if (bookDetailHtml != null) {
             Pattern bookNamePatten = PatternFactory.getPattern(ruleBean.getBookNamePatten());
             Matcher bookNameMatch = bookNamePatten.matcher(bookDetailHtml);
@@ -144,7 +143,7 @@ public class CrawlParser {
         handler.handle(book);
     }

-    public static boolean parseBookIndexAndContent(String sourceBookId, Book book, RuleBean ruleBean,
+    public boolean parseBookIndexAndContent(String sourceBookId, Book book, RuleBean ruleBean,
         Map<Integer, BookIndex> existBookIndexMap, CrawlBookChapterHandler handler) {

         Date currentDate = new Date();
@@ -153,7 +152,7 @@ public class CrawlParser {
         List<BookContent> contentList = new ArrayList<>();
         // read the chapter index
         String indexListUrl = ruleBean.getBookIndexUrl().replace("{bookId}", sourceBookId);
-        String indexListHtml = getByHttpClientWithChrome(indexListUrl);
+        String indexListHtml = crawlHttpClient.get(indexListUrl);

         if (indexListHtml != null) {
             if (StringUtils.isNotBlank(ruleBean.getBookIndexStart())) {
@@ -217,7 +216,7 @@ public class CrawlParser {
                 .replace("{indexId}", sourceIndexId);

             // fetch the chapter content
-            String contentHtml = getByHttpClientWithChrome(contentUrl);
+            String contentHtml = crawlHttpClient.get(contentUrl);
             if (contentHtml != null && !contentHtml.contains("正在手打中")) {
                 String content = contentHtml.substring(
                     contentHtml.indexOf(ruleBean.getContentStart()) + ruleBean.getContentStart().length());
@@ -254,7 +253,7 @@ public class CrawlParser {
             } else {
-                // insert the chapter
+                // set the index entry and chapter content
-                Long indexId = idWorker.nextId();
+                Long indexId = ID_WORKER.nextId();
                 bookIndex.setId(indexId);
                 bookIndex.setBookId(book.getId());
@@ -308,56 +307,4 @@ public class CrawlParser {
         return false;

     }

-    private static String getByHttpClient(String url) {
-        try {
-            ResponseEntity<String> forEntity = restTemplate.getForEntity(url, String.class);
-            if (forEntity.getStatusCode() == HttpStatus.OK) {
-                String body = forEntity.getBody();
-                assert body != null;
-                if (body.length() < Constants.INVALID_HTML_LENGTH) {
-                    return processErrorHttpResult(url);
-                }
-                // successfully fetched the HTML content
-                return body;
-            }
-        } catch (Exception e) {
-            e.printStackTrace();
-        }
-        return processErrorHttpResult(url);
-    }
-
-    private static String getByHttpClientWithChrome(String url) {
-        try {
-            String body = HttpUtil.getByHttpClientWithChrome(url);
-            if (body != null && body.length() < Constants.INVALID_HTML_LENGTH) {
-                return processErrorHttpResult(url);
-            }
-            // successfully fetched the HTML content
-            return body;
-        } catch (Exception e) {
-            e.printStackTrace();
-        }
-        return processErrorHttpResult(url);
-    }
-
-    @SneakyThrows
-    private static String processErrorHttpResult(String url) {
-        Integer count = retryCount.get();
-        if (count == null) {
-            count = 0;
-        }
-        if (count < Constants.HTTP_FAIL_RETRY_COUNT) {
-            Thread.sleep(new Random().nextInt(10 * 1000));
-            retryCount.set(++count);
-            return getByHttpClient(url);
-        }
-        return null;
-    }

 }
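Note the design change: CrawlParser stops being a static utility and becomes a Spring @Component whose HTTP fetching and retry logic move into the injected CrawlHttpClient (new file below). The callback types are not part of this diff; inferred from handler.handle(book) and the chapter -> ... lambdas, they presumably look like:

```java
import com.java2nb.novel.core.crawl.ChapterBean;
import com.java2nb.novel.entity.Book;

// Assumed shapes of the parser callbacks; the diff only shows them
// implemented as lambdas (book -> { ... }, chapter -> { ... }).
@FunctionalInterface
interface CrawlBookHandler {
    void handle(Book book);
}

@FunctionalInterface
interface CrawlBookChapterHandler {
    void handle(ChapterBean chapter);
}
```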
StarterListener.java

@@ -1,10 +1,12 @@
 package com.java2nb.novel.core.listener;

 import com.fasterxml.jackson.databind.ObjectMapper;
+import com.java2nb.novel.core.crawl.ChapterBean;
 import com.java2nb.novel.core.crawl.CrawlParser;
 import com.java2nb.novel.core.crawl.RuleBean;
-import com.java2nb.novel.entity.*;
+import com.java2nb.novel.entity.Book;
+import com.java2nb.novel.entity.BookIndex;
+import com.java2nb.novel.entity.CrawlSingleTask;
+import com.java2nb.novel.entity.CrawlSource;
 import com.java2nb.novel.service.BookService;
 import com.java2nb.novel.service.CrawlService;
 import com.java2nb.novel.utils.Constants;
@@ -33,6 +35,8 @@ public class StarterListener implements ServletContextListener {

     private final CrawlService crawlService;

+    private final CrawlParser crawlParser;
+
     @Value("${crawl.update.thread}")
     private int updateThreadCount;
@@ -56,20 +60,24 @@ public class StarterListener implements ServletContextListener {
                     CrawlSource source = crawlService.queryCrawlSource(needUpdateBook.getCrawlSourceId());
                     RuleBean ruleBean = new ObjectMapper().readValue(source.getCrawlRule(), RuleBean.class);
                     // parse the basic book info
-                    CrawlParser.parseBook(ruleBean, needUpdateBook.getCrawlBookId(),book -> {
+                    crawlParser.parseBook(ruleBean, needUpdateBook.getCrawlBookId(), book -> {
                         // only existing books are updated here
                         book.setId(needUpdateBook.getId());
                         book.setWordCount(needUpdateBook.getWordCount());
-                        if (needUpdateBook.getPicUrl() != null && needUpdateBook.getPicUrl().contains(Constants.LOCAL_PIC_PREFIX)) {
+                        if (needUpdateBook.getPicUrl() != null && needUpdateBook.getPicUrl()
+                            .contains(Constants.LOCAL_PIC_PREFIX)) {
                             // local images are not updated
                             book.setPicUrl(null);
                         }
                         // query the chapters that already exist
-                        Map<Integer, BookIndex> existBookIndexMap = bookService.queryExistBookIndexMap(needUpdateBook.getId());
+                        Map<Integer, BookIndex> existBookIndexMap = bookService.queryExistBookIndexMap(
+                            needUpdateBook.getId());
                         // parse the chapter index
-                        CrawlParser.parseBookIndexAndContent(needUpdateBook.getCrawlBookId(), book, ruleBean, existBookIndexMap,chapter -> {
-                            bookService.updateBookAndIndexAndContent(book, chapter.getBookIndexList(), chapter.getBookContentList(), existBookIndexMap);
-                        });
+                        crawlParser.parseBookIndexAndContent(needUpdateBook.getCrawlBookId(), book,
+                            ruleBean, existBookIndexMap, chapter -> {
+                                bookService.updateBookAndIndexAndContent(book, chapter.getBookIndexList(),
+                                    chapter.getBookContentList(), existBookIndexMap);
+                            });
                     });
                 } catch (Exception e) {
                     log.error(e.getMessage(), e);
@@ -88,7 +96,6 @@ public class StarterListener implements ServletContextListener {

         }
-
         new Thread(() -> {
             log.info("程序启动,开始执行单本采集任务线程。。。");
             while (true) {
@@ -103,7 +110,8 @@ public class StarterListener implements ServletContextListener {
                     CrawlSource source = crawlService.queryCrawlSource(task.getSourceId());
                     RuleBean ruleBean = new ObjectMapper().readValue(source.getCrawlRule(), RuleBean.class);

-                    if (crawlService.parseBookAndSave(task.getCatId(), ruleBean, task.getSourceId(), task.getSourceBookId())) {
+                    if (crawlService.parseBookAndSave(task.getCatId(), ruleBean, task.getSourceId(),
+                        task.getSourceBookId())) {
                         // crawl succeeded
                         crawlStatus = 1;
                     }
CrawlServiceImpl.java

@@ -2,17 +2,11 @@ package com.java2nb.novel.service.impl;

 import com.fasterxml.jackson.databind.ObjectMapper;
 import com.github.pagehelper.PageHelper;
-import io.github.xxyopen.model.page.PageBean;
 import com.java2nb.novel.core.cache.CacheKey;
 import com.java2nb.novel.core.cache.CacheService;
 import com.java2nb.novel.core.crawl.CrawlParser;
 import com.java2nb.novel.core.crawl.RuleBean;
 import com.java2nb.novel.core.enums.ResponseStatus;
-import io.github.xxyopen.model.page.builder.pagehelper.PageBuilder;
-import io.github.xxyopen.util.IdWorker;
-import io.github.xxyopen.util.ThreadUtil;
-import io.github.xxyopen.web.exception.BusinessException;
-import io.github.xxyopen.web.util.BeanUtil;
 import com.java2nb.novel.entity.Book;
 import com.java2nb.novel.entity.CrawlSingleTask;
 import com.java2nb.novel.entity.CrawlSource;
@@ -24,6 +18,12 @@ import com.java2nb.novel.service.BookService;
 import com.java2nb.novel.service.CrawlService;
 import com.java2nb.novel.vo.CrawlSingleTaskVO;
 import com.java2nb.novel.vo.CrawlSourceVO;
+import io.github.xxyopen.model.page.PageBean;
+import io.github.xxyopen.model.page.builder.pagehelper.PageBuilder;
+import io.github.xxyopen.util.IdWorker;
+import io.github.xxyopen.util.ThreadUtil;
+import io.github.xxyopen.web.exception.BusinessException;
+import io.github.xxyopen.web.util.BeanUtil;
 import io.github.xxyopen.web.util.SpringUtil;
 import lombok.RequiredArgsConstructor;
 import lombok.SneakyThrows;
@@ -51,6 +51,7 @@ import static org.mybatis.dynamic.sql.select.SelectDSL.select;
 @Slf4j
 public class CrawlServiceImpl implements CrawlService {

+    private final CrawlParser crawlParser;

     private final CrawlSourceMapper crawlSourceMapper;
@@ -71,15 +72,16 @@ public class CrawlServiceImpl implements CrawlService {
         crawlSourceMapper.insertSelective(source);

     }

     @Override
     public void updateCrawlSource(CrawlSource source) {
-        if(source.getId()!=null){
-            Optional<CrawlSource> opt=crawlSourceMapper.selectByPrimaryKey(source.getId());
-            if(opt.isPresent()) {
-                CrawlSource crawlSource =opt.get();
+        if (source.getId() != null) {
+            Optional<CrawlSource> opt = crawlSourceMapper.selectByPrimaryKey(source.getId());
+            if (opt.isPresent()) {
+                CrawlSource crawlSource = opt.get();
                 if (crawlSource.getSourceStatus() == (byte) 1) {
                     // close it first
-                    openOrCloseCrawl(crawlSource.getId(),(byte)0);
+                    openOrCloseCrawl(crawlSource.getId(), (byte) 0);
                 }
                 Date currentDate = new Date();
                 crawlSource.setUpdateTime(currentDate);
@@ -89,14 +91,15 @@ public class CrawlServiceImpl implements CrawlService {
             }
         }
     }

     @Override
     public PageBean<CrawlSource> listCrawlByPage(int page, int pageSize) {
         PageHelper.startPage(page, pageSize);
         SelectStatementProvider render = select(id, sourceName, sourceStatus, createTime, updateTime)
-                .from(crawlSource)
-                .orderBy(updateTime)
-                .build()
-                .render(RenderingStrategies.MYBATIS3);
+            .from(crawlSource)
+            .orderBy(updateTime)
+            .build()
+            .render(RenderingStrategies.MYBATIS3);
         List<CrawlSource> crawlSources = crawlSourceMapper.selectMany(render);
         PageBean<CrawlSource> pageBean = PageBuilder.build(crawlSources);
         pageBean.setList(BeanUtil.copyList(crawlSources, CrawlSourceVO.class));
@@ -113,7 +116,8 @@ public class CrawlServiceImpl implements CrawlService {
         if (sourceStatus == (byte) 0) {
             // closing: update the status in the database first, then fetch and stop all of this crawler's running threads
             SpringUtil.getBean(CrawlService.class).updateCrawlSourceStatus(sourceId, sourceStatus);
-            Set<Long> runningCrawlThreadId = (Set<Long>) cacheService.getObject(CacheKey.RUNNING_CRAWL_THREAD_KEY_PREFIX + sourceId);
+            Set<Long> runningCrawlThreadId = (Set<Long>) cacheService.getObject(
+                CacheKey.RUNNING_CRAWL_THREAD_KEY_PREFIX + sourceId);
             if (runningCrawlThreadId != null) {
                 for (Long ThreadId : runningCrawlThreadId) {
                     Thread thread = ThreadUtil.findThread(ThreadId);
@@ -157,11 +161,12 @@ public class CrawlServiceImpl implements CrawlService {

     @Override
     public CrawlSource queryCrawlSource(Integer sourceId) {
-        SelectStatementProvider render = select(CrawlSourceDynamicSqlSupport.sourceStatus, CrawlSourceDynamicSqlSupport.crawlRule)
-                .from(crawlSource)
-                .where(id, isEqualTo(sourceId))
-                .build()
-                .render(RenderingStrategies.MYBATIS3);
+        SelectStatementProvider render = select(CrawlSourceDynamicSqlSupport.sourceStatus,
+            CrawlSourceDynamicSqlSupport.crawlRule)
+            .from(crawlSource)
+            .where(id, isEqualTo(sourceId))
+            .build()
+            .render(RenderingStrategies.MYBATIS3);
         return crawlSourceMapper.selectMany(render).get(0);
     }
@@ -182,10 +187,10 @@ public class CrawlServiceImpl implements CrawlService {
     public PageBean<CrawlSingleTask> listCrawlSingleTaskByPage(int page, int pageSize) {
         PageHelper.startPage(page, pageSize);
         SelectStatementProvider render = select(CrawlSingleTaskDynamicSqlSupport.crawlSingleTask.allColumns())
-                .from(CrawlSingleTaskDynamicSqlSupport.crawlSingleTask)
-                .orderBy(CrawlSingleTaskDynamicSqlSupport.createTime.descending())
-                .build()
-                .render(RenderingStrategies.MYBATIS3);
+            .from(CrawlSingleTaskDynamicSqlSupport.crawlSingleTask)
+            .orderBy(CrawlSingleTaskDynamicSqlSupport.createTime.descending())
+            .build()
+            .render(RenderingStrategies.MYBATIS3);
         List<CrawlSingleTask> crawlSingleTasks = crawlSingleTaskMapper.selectMany(render);
         PageBean<CrawlSingleTask> pageBean = PageBuilder.build(crawlSingleTasks);
         pageBean.setList(BeanUtil.copyList(crawlSingleTasks, CrawlSingleTaskVO.class));
@@ -200,7 +205,8 @@ public class CrawlServiceImpl implements CrawlService {
     @Override
     public CrawlSingleTask getCrawlSingleTask() {

-        List<CrawlSingleTask> list = crawlSingleTaskMapper.selectMany(select(CrawlSingleTaskDynamicSqlSupport.crawlSingleTask.allColumns())
+        List<CrawlSingleTask> list = crawlSingleTaskMapper.selectMany(
+            select(CrawlSingleTaskDynamicSqlSupport.crawlSingleTask.allColumns())
                 .from(CrawlSingleTaskDynamicSqlSupport.crawlSingleTask)
                 .where(CrawlSingleTaskDynamicSqlSupport.taskStatus, isEqualTo((byte) 2))
                 .orderBy(CrawlSingleTaskDynamicSqlSupport.createTime)
@@ -226,12 +232,12 @@ public class CrawlServiceImpl implements CrawlService {

     @Override
     public CrawlSource getCrawlSource(Integer id) {
-        Optional<CrawlSource> opt=crawlSourceMapper.selectByPrimaryKey(id);
-        if(opt.isPresent()) {
-            CrawlSource crawlSource =opt.get();
-            return crawlSource;
-        }
-        return null;
+        Optional<CrawlSource> opt = crawlSourceMapper.selectByPrimaryKey(id);
+        if (opt.isPresent()) {
+            CrawlSource crawlSource = opt.get();
+            return crawlSource;
+        }
+        return null;
     }

     /**
@@ -251,8 +257,8 @@ public class CrawlServiceImpl implements CrawlService {
         if (StringUtils.isNotBlank(ruleBean.getCatIdRule().get("catId" + catId))) {
             // build the category list URL
             String catBookListUrl = ruleBean.getBookListUrl()
-                    .replace("{catId}", ruleBean.getCatIdRule().get("catId" + catId))
-                    .replace("{page}", page + "");
+                .replace("{catId}", ruleBean.getCatIdRule().get("catId" + catId))
+                .replace("{page}", page + "");

             String bookListHtml = getByHttpClientWithChrome(catBookListUrl);
             if (bookListHtml != null) {
@@ -268,14 +274,12 @@ public class CrawlServiceImpl implements CrawlService {
                             return;
                         }

-
                         String bookId = bookIdMatcher.group(1);
                         parseBookAndSave(catId, ruleBean, sourceId, bookId);
                     } catch (Exception e) {
                         log.error(e.getMessage(), e);
                     }

-
                     isFindBookId = bookIdMatcher.find();
                 }
@@ -306,7 +310,7 @@ public class CrawlServiceImpl implements CrawlService {

         final AtomicBoolean parseResult = new AtomicBoolean(false);

-        CrawlParser.parseBook(ruleBean, bookId, book -> {
+        crawlParser.parseBook(ruleBean, bookId, book -> {
             if (book.getBookName() == null || book.getAuthorName() == null) {
                 return;
             }
@@ -330,9 +334,11 @@ public class CrawlServiceImpl implements CrawlService {
             book.setCrawlLastTime(new Date());
             book.setId(idWorker.nextId());
             // parse the chapter index
-            boolean parseIndexContentResult = CrawlParser.parseBookIndexAndContent(bookId, book, ruleBean, new HashMap<>(0), chapter -> {
-                bookService.saveBookAndIndexAndContent(book, chapter.getBookIndexList(), chapter.getBookContentList());
-            });
+            boolean parseIndexContentResult = crawlParser.parseBookIndexAndContent(bookId, book, ruleBean,
+                new HashMap<>(0), chapter -> {
+                    bookService.saveBookAndIndexAndContent(book, chapter.getBookIndexList(),
+                        chapter.getBookContentList());
+                });
             parseResult.set(parseIndexContentResult);

         } else {
@@ -356,11 +362,12 @@ public class CrawlServiceImpl implements CrawlService {

     @Override
     public List<CrawlSource> queryCrawlSourceByStatus(Byte sourceStatus) {
-        SelectStatementProvider render = select(CrawlSourceDynamicSqlSupport.id, CrawlSourceDynamicSqlSupport.sourceStatus, CrawlSourceDynamicSqlSupport.crawlRule)
-                .from(crawlSource)
-                .where(CrawlSourceDynamicSqlSupport.sourceStatus, isEqualTo(sourceStatus))
-                .build()
-                .render(RenderingStrategies.MYBATIS3);
+        SelectStatementProvider render = select(CrawlSourceDynamicSqlSupport.id,
+            CrawlSourceDynamicSqlSupport.sourceStatus, CrawlSourceDynamicSqlSupport.crawlRule)
+            .from(crawlSource)
+            .where(CrawlSourceDynamicSqlSupport.sourceStatus, isEqualTo(sourceStatus))
+            .build()
+            .render(RenderingStrategies.MYBATIS3);
         return crawlSourceMapper.selectMany(render);
     }
 }
Constants.java

@@ -7,7 +7,7 @@ public class Constants {

     /**
      * prefix for locally saved images
-     * */
+     */
     public static final String LOCAL_PIC_PREFIX = "/localPic/";

     /**
@@ -23,5 +23,5 @@ public class Constants {
     /**
      * number of retries for failed crawl HTTP requests
      */
-    public static final Integer HTTP_FAIL_RETRY_COUNT = 5;
+    public static final Integer HTTP_FAIL_RETRY_COUNT = 3;
 }
CrawlHttpClient.java (new file)

@@ -0,0 +1,57 @@
+package com.java2nb.novel.utils;
+
+import com.java2nb.novel.core.utils.HttpUtil;
+import lombok.extern.slf4j.Slf4j;
+import org.springframework.beans.factory.annotation.Value;
+import org.springframework.stereotype.Component;
+
+import java.util.Objects;
+import java.util.Random;
+
+/**
+ * @author Administrator
+ */
+@Slf4j
+@Component
+public class CrawlHttpClient {
+
+    @Value("${crawl.interval.min}")
+    private Integer intervalMin;
+
+    @Value("${crawl.interval.max}")
+    private Integer intervalMax;
+
+    private final Random random = new Random();
+
+    private static final ThreadLocal<Integer> RETRY_COUNT = new ThreadLocal<>();
+
+    public String get(String url) {
+        if (Objects.nonNull(intervalMin) && Objects.nonNull(intervalMax) && intervalMax > intervalMin) {
+            try {
+                Thread.sleep(random.nextInt(intervalMax - intervalMin + 1) + intervalMin);
+            } catch (InterruptedException e) {
+                log.error(e.getMessage(), e);
+            }
+        }
+        String body = HttpUtil.getByHttpClientWithChrome(url);
+        if (Objects.isNull(body) || body.length() < Constants.INVALID_HTML_LENGTH) {
+            return processErrorHttpResult(url);
+        }
+        // successfully fetched the HTML content
+        return body;
+    }
+
+    private String processErrorHttpResult(String url) {
+        Integer count = RETRY_COUNT.get();
+        if (count == null) {
+            count = 0;
+        }
+        if (count < Constants.HTTP_FAIL_RETRY_COUNT) {
+            RETRY_COUNT.set(++count);
+            return get(url);
+        }
+        RETRY_COUNT.remove();
+        return null;
+    }
+
+}
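CrawlHttpClient now owns both throttling (a uniform random delay of crawl.interval.min to crawl.interval.max milliseconds before each request) and retrying (up to Constants.HTTP_FAIL_RETRY_COUNT attempts, tracked per thread and cleared once retries are exhausted). A minimal consumption sketch mirroring the constructor injection used in CrawlParser above; the class and method names here are illustrative, not part of the commit:

```java
import com.java2nb.novel.utils.CrawlHttpClient;
import lombok.RequiredArgsConstructor;
import org.springframework.stereotype.Component;

// Illustrative consumer only; any crawler-side component can take the
// client the same way CrawlParser does in this commit.
@Component
@RequiredArgsConstructor
public class DetailPageFetcher {

    private final CrawlHttpClient crawlHttpClient; // injected by Spring

    public String fetchDetail(String bookDetailUrl) {
        // get() sleeps a random crawl.interval.{min,max} delay first, then
        // retries short/invalid responses up to HTTP_FAIL_RETRY_COUNT times.
        return crawlHttpClient.get(bookDetailUrl);
    }
}
```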
application.yml

@@ -14,12 +14,18 @@ admin:
   username: admin
   password: admin

-# number of threads for automatic crawler updates
-# 1 is enough when there are few novels or while the new-book crawler is running
-# increase gradually as the number of novels grows, but do not exceed the CPU thread count
 crawl:
   update:
+    # number of threads for automatic crawler updates
+    # 1 is enough when there are few novels or while the new-book crawler is running
+    # increase gradually as the number of novels grows, but do not exceed the CPU thread count
     thread: 1
+  # interval between crawl requests, in milliseconds
+  interval:
+    min: 300
+    max: 500
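With the values above, each crawl request is preceded by a uniform random delay in [300, 500] ms. A standalone sketch of the computation CrawlHttpClient performs, with the config values hard-coded for illustration:

```java
import java.util.Random;

// Standalone illustration of the interval delay in CrawlHttpClient.get().
public class IntervalDelaySketch {

    public static void main(String[] args) throws InterruptedException {
        int intervalMin = 300; // crawl.interval.min, milliseconds
        int intervalMax = 500; // crawl.interval.max, milliseconds
        Random random = new Random();
        // Uniform delay in [min, max]; applied only when both bounds are set
        // and max > min, matching the guard in CrawlHttpClient.
        long delay = random.nextInt(intervalMax - intervalMin + 1) + intervalMin;
        Thread.sleep(delay);
        System.out.println("waited " + delay + " ms before the next request");
    }
}
```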