Compare commits

..

8 Commits

14 changed files with 116 additions and 156 deletions

View File

@ -1,5 +1,4 @@
<p align="center"> <p align="center">
<a href="https://www.swiftproxy.net/?code=T2WV1VT50"><img src="https://xxyopen.com/images/ad1.png" alt="AD" ></a>
<a href="https://cloud.tencent.com/act/cps/redirect?redirect=2446&cps_key=736e609d66e0ac4e57813316cec6fd0b&from=console"><img src="https://youdoc.github.io/img/tencent.jpg" alt="AD" ></a> <a href="https://cloud.tencent.com/act/cps/redirect?redirect=2446&cps_key=736e609d66e0ac4e57813316cec6fd0b&from=console"><img src="https://youdoc.github.io/img/tencent.jpg" alt="AD" ></a>
</p> </p>
<p align="center"> <p align="center">

View File

@ -5,7 +5,7 @@
<groupId>com.java2nb</groupId> <groupId>com.java2nb</groupId>
<artifactId>novel-admin</artifactId> <artifactId>novel-admin</artifactId>
<version>5.1.1</version> <version>5.1.3</version>
<packaging>jar</packaging> <packaging>jar</packaging>
<name>novel-admin</name> <name>novel-admin</name>

View File

@ -5,7 +5,7 @@
<parent> <parent>
<artifactId>novel</artifactId> <artifactId>novel</artifactId>
<groupId>com.java2nb</groupId> <groupId>com.java2nb</groupId>
<version>5.1.1</version> <version>5.1.3</version>
</parent> </parent>
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>

View File

@ -41,11 +41,6 @@ public interface CacheKey {
* */ * */
String TEMPLATE_DIR_KEY = "templateDirKey";; String TEMPLATE_DIR_KEY = "templateDirKey";;
/**
* 正在运行的爬虫线程存储KEY前缀
* */
String RUNNING_CRAWL_THREAD_KEY_PREFIX = "runningCrawlTreadDataKeyPrefix";
/** /**
* 上一次搜索引擎更新的时间 * 上一次搜索引擎更新的时间
* */ * */

View File

@ -5,7 +5,7 @@
<parent> <parent>
<artifactId>novel</artifactId> <artifactId>novel</artifactId>
<groupId>com.java2nb</groupId> <groupId>com.java2nb</groupId>
<version>5.1.1</version> <version>5.1.3</version>
</parent> </parent>
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>

View File

@ -7,6 +7,6 @@ import com.java2nb.novel.entity.Book;
* */ * */
public interface CrawlBookHandler { public interface CrawlBookHandler {
void handle(Book book); void handle(Book book) throws InterruptedException;
} }

View File

@ -10,9 +10,11 @@ import com.java2nb.novel.utils.CrawlHttpClient;
import io.github.xxyopen.util.IdWorker; import io.github.xxyopen.util.IdWorker;
import lombok.RequiredArgsConstructor; import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows; import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.springframework.stereotype.Component; import org.springframework.stereotype.Component;
import java.text.ParseException;
import java.text.SimpleDateFormat; import java.text.SimpleDateFormat;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Date; import java.util.Date;
@ -26,6 +28,7 @@ import java.util.regex.Pattern;
* *
* @author Administrator * @author Administrator
*/ */
@Slf4j
@Component @Component
@RequiredArgsConstructor @RequiredArgsConstructor
public class CrawlParser { public class CrawlParser {
@ -34,8 +37,8 @@ public class CrawlParser {
private final CrawlHttpClient crawlHttpClient; private final CrawlHttpClient crawlHttpClient;
@SneakyThrows public void parseBook(RuleBean ruleBean, String bookId, CrawlBookHandler handler)
public void parseBook(RuleBean ruleBean, String bookId, CrawlBookHandler handler) { throws InterruptedException {
Book book = new Book(); Book book = new Book();
String bookDetailUrl = ruleBean.getBookDetailUrl().replace("{bookId}", bookId); String bookDetailUrl = ruleBean.getBookDetailUrl().replace("{bookId}", bookId);
String bookDetailHtml = crawlHttpClient.get(bookDetailUrl, ruleBean.getCharset()); String bookDetailHtml = crawlHttpClient.get(bookDetailUrl, ruleBean.getCharset());
@ -120,8 +123,12 @@ public class CrawlParser {
if (isFindUpdateTime) { if (isFindUpdateTime) {
String updateTime = updateTimeMatch.group(1); String updateTime = updateTimeMatch.group(1);
//设置更新时间 //设置更新时间
try {
book.setLastIndexUpdateTime( book.setLastIndexUpdateTime(
new SimpleDateFormat(ruleBean.getUpadateTimeFormatPatten()).parse(updateTime)); new SimpleDateFormat(ruleBean.getUpadateTimeFormatPatten()).parse(updateTime));
} catch (ParseException e) {
log.error("解析最新章节更新时间出错", e);
}
} }
} }
@ -144,7 +151,7 @@ public class CrawlParser {
} }
public boolean parseBookIndexAndContent(String sourceBookId, Book book, RuleBean ruleBean, public boolean parseBookIndexAndContent(String sourceBookId, Book book, RuleBean ruleBean,
Map<Integer, BookIndex> existBookIndexMap, CrawlBookChapterHandler handler) { Map<Integer, BookIndex> existBookIndexMap, CrawlBookChapterHandler handler) throws InterruptedException{
Date currentDate = new Date(); Date currentDate = new Date();

View File

@ -74,10 +74,8 @@ public class StarterListener implements ServletContextInitializer {
needUpdateBook.getId()); needUpdateBook.getId());
//解析章节目录 //解析章节目录
crawlParser.parseBookIndexAndContent(needUpdateBook.getCrawlBookId(), book, crawlParser.parseBookIndexAndContent(needUpdateBook.getCrawlBookId(), book,
ruleBean, existBookIndexMap, chapter -> { ruleBean, existBookIndexMap, chapter -> bookService.updateBookAndIndexAndContent(book, chapter.getBookIndexList(),
bookService.updateBookAndIndexAndContent(book, chapter.getBookIndexList(), chapter.getBookContentList(), existBookIndexMap));
chapter.getBookContentList(), existBookIndexMap);
});
}); });
} catch (Exception e) { } catch (Exception e) {
log.error(e.getMessage(), e); log.error(e.getMessage(), e);

View File

@ -1,61 +0,0 @@
package com.java2nb.novel.core.schedule;
import com.java2nb.novel.core.cache.CacheKey;
import com.java2nb.novel.core.cache.CacheService;
import com.java2nb.novel.entity.CrawlSource;
import com.java2nb.novel.service.CrawlService;
import io.github.xxyopen.util.ThreadUtil;
import lombok.RequiredArgsConstructor;
import lombok.extern.slf4j.Slf4j;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Service;
import java.util.List;
import java.util.Set;
/**
 * Crawler thread monitor: watches crawl sources whose crawler threads have
 * finished and updates their stored status accordingly.
 *
 * @author Administrator
 */
@Service
@RequiredArgsConstructor
@Slf4j
public class CrawlThreadMonitor {

    private final CacheService cacheService;

    private final CrawlService crawlService;

    /**
     * Scheduled at a fixed rate of five minutes. For every crawl source returned by
     * {@code queryCrawlSourceByStatus((byte) 1)} it loads the cached set of crawler
     * thread ids and, when none of those threads is still alive (or no set is cached),
     * sets the source status to {@code 0}.
     */
    @Scheduled(fixedRate = 1000 * 60 * 5)
    public void monitor() {
        // Crawl sources that are recorded as running and therefore need monitoring
        List<CrawlSource> runningSources = crawlService.queryCrawlSourceByStatus((byte) 1);
        for (CrawlSource crawlSource : runningSources) {
            Set<Long> cachedThreadIds = (Set<Long>) cacheService.getObject(CacheKey.RUNNING_CRAWL_THREAD_KEY_PREFIX + crawlSource.getId());
            boolean anyThreadAlive = false;
            if (cachedThreadIds != null) {
                for (Long cachedThreadId : cachedThreadIds) {
                    Thread crawlThread = ThreadUtil.findThread(cachedThreadId);
                    if (crawlThread == null || !crawlThread.isAlive()) {
                        continue;
                    }
                    // A live thread means the source really is running; the stored status is correct
                    anyThreadAlive = true;
                }
            }
            if (anyThreadAlive) {
                continue;
            }
            // No live crawler thread remains for this source: mark it as stopped
            crawlService.updateCrawlSourceStatus(crawlSource.getId(), (byte) 0);
        }
    }
}

View File

@ -53,7 +53,7 @@ public interface CrawlService {
* @param ruleBean 采集规则\ * @param ruleBean 采集规则\
* @return true:成功false:失败 * @return true:成功false:失败
* */ * */
boolean parseBookAndSave(int catId, RuleBean ruleBean, Integer sourceId, String bookId); boolean parseBookAndSave(int catId, RuleBean ruleBean, Integer sourceId, String bookId) throws InterruptedException;
/** /**
* 根据爬虫状态查询爬虫源集合 * 根据爬虫状态查询爬虫源集合

View File

@ -34,6 +34,7 @@ import org.mybatis.dynamic.sql.render.RenderingStrategies;
import org.mybatis.dynamic.sql.select.render.SelectStatementProvider; import org.mybatis.dynamic.sql.select.render.SelectStatementProvider;
import org.springframework.stereotype.Service; import org.springframework.stereotype.Service;
import java.time.Duration;
import java.util.*; import java.util.*;
import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicBoolean;
import java.util.regex.Matcher; import java.util.regex.Matcher;
@ -65,6 +66,10 @@ public class CrawlServiceImpl implements CrawlService {
private final CrawlHttpClient crawlHttpClient; private final CrawlHttpClient crawlHttpClient;
private final Map<Integer, Byte> crawlSourceStatusMap = new HashMap<>();
private final Map<Integer, Set<Long>> runningCrawlThread = new HashMap<>();
@Override @Override
public void addCrawlSource(CrawlSource source) { public void addCrawlSource(CrawlSource source) {
@ -103,6 +108,8 @@ public class CrawlServiceImpl implements CrawlService {
.build() .build()
.render(RenderingStrategies.MYBATIS3); .render(RenderingStrategies.MYBATIS3);
List<CrawlSource> crawlSources = crawlSourceMapper.selectMany(render); List<CrawlSource> crawlSources = crawlSourceMapper.selectMany(render);
crawlSources.forEach(crawlSource -> crawlSource.setSourceStatus(
Optional.ofNullable(crawlSourceStatusMap.get(crawlSource.getId())).orElse((byte) 0)));
PageBean<CrawlSource> pageBean = PageBuilder.build(crawlSources); PageBean<CrawlSource> pageBean = PageBuilder.build(crawlSources);
pageBean.setList(BeanUtil.copyList(crawlSources, CrawlSourceVO.class)); pageBean.setList(BeanUtil.copyList(crawlSources, CrawlSourceVO.class));
return pageBean; return pageBean;
@ -112,14 +119,13 @@ public class CrawlServiceImpl implements CrawlService {
@Override @Override
public void openOrCloseCrawl(Integer sourceId, Byte sourceStatus) { public void openOrCloseCrawl(Integer sourceId, Byte sourceStatus) {
//判断是开启还是关闭,如果是关闭,则修改数据库状态后获取该爬虫正在运行的线程集合并全部停止 // 判断是开启还是关闭,如果是关闭,则获取该爬虫正在运行的线程集合并全部中断
//如果是开启,先查询数据库中状态,判断该爬虫源是否还在运行,如果在运行,则忽略, // 如果是开启,先判断该爬虫源是否还在运行,如果在运行,则忽略,如果没有运行则启动线程爬取小说数据并加入到runningCrawlThread中
// 如果没有则修改数据库状态并启动线程爬取小说数据加入到runningCrawlThread中 // 最后,保存爬虫源状态
if (sourceStatus == (byte) 0) { if (sourceStatus == (byte) 0) {
//关闭,直接修改数据库状态,并直接修改数据库状态后获取该爬虫正在运行的线程集合全部停止 // 关闭
SpringUtil.getBean(CrawlService.class).updateCrawlSourceStatus(sourceId, sourceStatus); // 将该爬虫源正在运行的线程集合全部停止
Set<Long> runningCrawlThreadId = (Set<Long>) cacheService.getObject( Set<Long> runningCrawlThreadId = runningCrawlThread.get(sourceId);
CacheKey.RUNNING_CRAWL_THREAD_KEY_PREFIX + sourceId);
if (runningCrawlThreadId != null) { if (runningCrawlThreadId != null) {
for (Long ThreadId : runningCrawlThreadId) { for (Long ThreadId : runningCrawlThreadId) {
Thread thread = ThreadUtil.findThread(ThreadId); Thread thread = ThreadUtil.findThread(ThreadId);
@ -131,16 +137,13 @@ public class CrawlServiceImpl implements CrawlService {
} else { } else {
//开启 // 开启
//查询爬虫源状态和规则 Byte realSourceStatus = Optional.ofNullable(crawlSourceStatusMap.get(sourceId)).orElse((byte) 0);
CrawlSource source = queryCrawlSource(sourceId);
Byte realSourceStatus = source.getSourceStatus();
if (realSourceStatus == (byte) 0) { if (realSourceStatus == (byte) 0) {
//该爬虫源已经停止运行了,修改数据库状态并启动线程爬取小说数据加入到runningCrawlThread中 // 查询爬虫源规则
SpringUtil.getBean(CrawlService.class).updateCrawlSourceStatus(sourceId, sourceStatus); CrawlSource source = queryCrawlSource(sourceId);
//该爬虫源已经停止运行了,启动线程爬取小说数据并将线程加入到runningCrawlThread中
RuleBean ruleBean = new ObjectMapper().readValue(source.getCrawlRule(), RuleBean.class); RuleBean ruleBean = new ObjectMapper().readValue(source.getCrawlRule(), RuleBean.class);
Set<Long> threadIds = new HashSet<>(); Set<Long> threadIds = new HashSet<>();
//按分类开始爬虫解析任务 //按分类开始爬虫解析任务
for (int i = 1; i < 8; i++) { for (int i = 1; i < 8; i++) {
@ -149,15 +152,14 @@ public class CrawlServiceImpl implements CrawlService {
thread.start(); thread.start();
//thread加入到监控缓存中 //thread加入到监控缓存中
threadIds.add(thread.getId()); threadIds.add(thread.getId());
} }
cacheService.setObject(CacheKey.RUNNING_CRAWL_THREAD_KEY_PREFIX + sourceId, threadIds); runningCrawlThread.put(sourceId, threadIds);
}
} }
// 保存爬虫源状态
} crawlSourceStatusMap.put(sourceId, sourceStatus);
} }
@ -248,6 +250,11 @@ public class CrawlServiceImpl implements CrawlService {
@Override @Override
public void parseBookList(int catId, RuleBean ruleBean, Integer sourceId) { public void parseBookList(int catId, RuleBean ruleBean, Integer sourceId) {
String catIdRule = ruleBean.getCatIdRule().get("catId" + catId);
if (StringUtils.isBlank(catIdRule)) {
return;
}
//当前页码1 //当前页码1
int page = 1; int page = 1;
int totalPage = page; int totalPage = page;
@ -255,9 +262,7 @@ public class CrawlServiceImpl implements CrawlService {
while (page <= totalPage) { while (page <= totalPage) {
try { try {
String catIdRule = ruleBean.getCatIdRule().get("catId" + catId); String catBookListUrl;
if (StringUtils.isNotBlank(catIdRule)) {
String catBookListUrl = "";
if (StringUtils.isNotBlank(ruleBean.getBookListUrl())) { if (StringUtils.isNotBlank(ruleBean.getBookListUrl())) {
// 兼容老规则 // 兼容老规则
// 拼接分类URL // 拼接分类URL
@ -287,6 +292,12 @@ public class CrawlServiceImpl implements CrawlService {
String bookId = bookIdMatcher.group(1); String bookId = bookIdMatcher.group(1);
parseBookAndSave(catId, ruleBean, sourceId, bookId); parseBookAndSave(catId, ruleBean, sourceId, bookId);
} catch (InterruptedException e) {
log.error(e.getMessage(), e);
//1.阻塞过程(使用了 sleep,同步锁的 wait,socket 中的 receiver,accept 等方法时)
//捕获中断异常InterruptedException来退出线程。
//2.非阻塞过程中通过判断中断标志来退出线程。
return;
} catch (Exception e) { } catch (Exception e) {
log.error(e.getMessage(), e); log.error(e.getMessage(), e);
} }
@ -302,26 +313,40 @@ public class CrawlServiceImpl implements CrawlService {
totalPage = Integer.parseInt(totalPageMatcher.group(1)); totalPage = Integer.parseInt(totalPageMatcher.group(1));
} }
}
} }
} catch (InterruptedException e) {
log.error(e.getMessage(), e);
//1.阻塞过程(使用了 sleep,同步锁的 wait,socket 中的 receiver,accept 等方法时)
//捕获中断异常InterruptedException来退出线程。
//2.非阻塞过程中通过判断中断标志来退出线程。
return;
} catch (Exception e) { } catch (Exception e) {
log.error(e.getMessage(), e); log.error(e.getMessage(), e);
} }
if (page == totalPage) { if (page >= totalPage) {
// 第一遍采集完成,翻到第一页,继续第二次采集,适用于分页数比较少的最近更新列表 // 第一遍采集完成,翻到第一页,继续第二次采集,适用于分页数比较少的最近更新列表
page = 0; page = 1;
try {
// 第一遍采集完成休眠1分钟
Thread.sleep(Duration.ofMinutes(1));
} catch (InterruptedException e) {
log.error(e.getMessage(), e);
//1.阻塞过程(使用了 sleep,同步锁的 wait,socket 中的 receiver,accept 等方法时)
//捕获中断异常InterruptedException来退出线程。
//2.非阻塞过程中通过判断中断标志来退出线程。
return;
} }
} else {
page += 1; page += 1;
} }
}
} }
@Override @Override
public boolean parseBookAndSave(int catId, RuleBean ruleBean, Integer sourceId, String bookId) { public boolean parseBookAndSave(int catId, RuleBean ruleBean, Integer sourceId, String bookId)
throws InterruptedException {
final AtomicBoolean parseResult = new AtomicBoolean(false); final AtomicBoolean parseResult = new AtomicBoolean(false);
@ -385,4 +410,5 @@ public class CrawlServiceImpl implements CrawlService {
.render(RenderingStrategies.MYBATIS3); .render(RenderingStrategies.MYBATIS3);
return crawlSourceMapper.selectMany(render); return crawlSourceMapper.selectMany(render);
} }
} }

View File

@ -25,13 +25,9 @@ public class CrawlHttpClient {
private static final ThreadLocal<Integer> RETRY_COUNT = new ThreadLocal<>(); private static final ThreadLocal<Integer> RETRY_COUNT = new ThreadLocal<>();
public String get(String url, String charset) { public String get(String url, String charset) throws InterruptedException {
if (Objects.nonNull(intervalMin) && Objects.nonNull(intervalMax) && intervalMax > intervalMin) { if (Objects.nonNull(intervalMin) && Objects.nonNull(intervalMax) && intervalMax > intervalMin) {
try {
Thread.sleep(random.nextInt(intervalMax - intervalMin + 1) + intervalMin); Thread.sleep(random.nextInt(intervalMax - intervalMin + 1) + intervalMin);
} catch (InterruptedException e) {
log.error(e.getMessage(), e);
}
} }
String body = HttpUtil.getByHttpClientWithChrome(url, charset); String body = HttpUtil.getByHttpClientWithChrome(url, charset);
if (Objects.isNull(body) || body.length() < Constants.INVALID_HTML_LENGTH) { if (Objects.isNull(body) || body.length() < Constants.INVALID_HTML_LENGTH) {
@ -41,7 +37,7 @@ public class CrawlHttpClient {
return body; return body;
} }
private String processErrorHttpResult(String url, String charset) { private String processErrorHttpResult(String url, String charset) throws InterruptedException{
Integer count = RETRY_COUNT.get(); Integer count = RETRY_COUNT.get();
if (count == null) { if (count == null) {
count = 0; count = 0;

View File

@ -5,7 +5,7 @@
<parent> <parent>
<artifactId>novel</artifactId> <artifactId>novel</artifactId>
<groupId>com.java2nb</groupId> <groupId>com.java2nb</groupId>
<version>5.1.1</version> <version>5.1.3</version>
</parent> </parent>
<modelVersion>4.0.0</modelVersion> <modelVersion>4.0.0</modelVersion>

View File

@ -5,7 +5,7 @@
<groupId>com.java2nb</groupId> <groupId>com.java2nb</groupId>
<artifactId>novel</artifactId> <artifactId>novel</artifactId>
<version>5.1.1</version> <version>5.1.3</version>
<modules> <modules>
<module>novel-common</module> <module>novel-common</module>
<module>novel-front</module> <module>novel-front</module>