1. Fix the bug where the crawler thread failed to stop; 2. Add the 新笔趣阁 (Xinbiquge) source and improve compatibility with more source sites

This commit is contained in:
xiongxiaoyang
2020-05-24 00:54:27 +08:00
parent 80b933db8d
commit a0fb8e481a
5 changed files with 19 additions and 4 deletions


@@ -94,6 +94,8 @@ public class CrawlParser {
String desc = bookDetailHtml.substring(bookDetailHtml.indexOf(ruleBean.getDescStart()) + ruleBean.getDescStart().length());
desc = desc.substring(0, desc.indexOf(ruleBean.getDescEnd()));
//filter out <a> tags embedded in the description
desc = desc.replaceAll("<a[^<]+</a>","");
//set the book description
book.setBookDesc(desc);
if (StringUtils.isNotBlank(ruleBean.getStatusPatten())) {
@@ -173,6 +175,7 @@ public class CrawlParser {
String lastIndexName = null;
while (isFindIndex) {
BookIndex hasIndex = hasIndexs.get(indexNum);
String indexName = indexNameMatch.group(1);
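
For clarity, a minimal standalone sketch of the description extraction and the newly added link filtering shown in the first CrawlParser hunk above. The sample HTML, the marker strings, and the DescExtractDemo class are illustrative assumptions; the substring and replaceAll steps mirror the committed code.

public class DescExtractDemo {
    public static void main(String[] args) {
        // Illustrative page fragment and markers; in the real code the start/end
        // strings come from the crawl rule (ruleBean.getDescStart()/getDescEnd()).
        String bookDetailHtml = "<div id=\"intro\">A tale of cultivation. <a href=\"/book/1\">more</a></div>";
        String descStart = "<div id=\"intro\">";
        String descEnd = "</div>";
        // Cut the text between the configured markers, as CrawlParser does.
        String desc = bookDetailHtml.substring(bookDetailHtml.indexOf(descStart) + descStart.length());
        desc = desc.substring(0, desc.indexOf(descEnd));
        // The newly added replaceAll drops embedded <a ...>...</a> links from the description.
        desc = desc.replaceAll("<a[^<]+</a>", "");
        System.out.println(desc); // prints: A tale of cultivation.
    }
}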


@@ -167,6 +167,11 @@ public class CrawlServiceImpl implements CrawlService {
boolean isFindBookId = bookIdMatcher.find();
while (isFindBookId) {
try {
if(Thread.currentThread().isInterrupted()){
return;
}
String bookId = bookIdMatcher.group(1);
Book book = CrawlParser.parseBook(ruleBean, bookId);
//only new books are stored here; first check whether this book already exists
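
For the thread-stop fix, a minimal sketch of the cooperative-interruption pattern that the added Thread.currentThread().isInterrupted() check relies on. The InterruptDemo class, the sleep interval, and the printed placeholder work are assumptions for illustration, not code from this repository.

public class InterruptDemo {
    public static void main(String[] args) throws InterruptedException {
        // Worker loop mirroring the pattern added in CrawlServiceImpl:
        // test the interrupt flag on each iteration and return promptly when it is set.
        Thread crawler = new Thread(() -> {
            while (true) {
                if (Thread.currentThread().isInterrupted()) {
                    return; // stop the crawl loop cooperatively
                }
                System.out.println("crawling one book..."); // placeholder for parse/save work
                try {
                    Thread.sleep(200);
                } catch (InterruptedException e) {
                    // sleep() clears the interrupt flag when it throws; restore it so the
                    // check at the top of the loop can see the stop request.
                    Thread.currentThread().interrupt();
                }
            }
        });
        crawler.start();
        Thread.sleep(1000);
        crawler.interrupt(); // request stop; without the flag check the loop would never exit
        crawler.join();
    }
}

Without such a check the worker never observes the stop request, which matches the "crawler thread fails to stop" behaviour this commit fixes.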