fix(novel-crawl): 调整线程终止逻辑

This commit is contained in:
xiongxiaoyang 2025-05-13 09:58:47 +08:00
parent 2c86cb9a7d
commit 1f53b56bd6
6 changed files with 45 additions and 25 deletions

View File

@@ -7,6 +7,6 @@ import com.java2nb.novel.entity.Book;
* */ * */
public interface CrawlBookHandler { public interface CrawlBookHandler {
void handle(Book book); void handle(Book book) throws InterruptedException;
} }

View File

@@ -10,9 +10,11 @@ import com.java2nb.novel.utils.CrawlHttpClient;
import io.github.xxyopen.util.IdWorker; import io.github.xxyopen.util.IdWorker;
import lombok.RequiredArgsConstructor; import lombok.RequiredArgsConstructor;
import lombok.SneakyThrows; import lombok.SneakyThrows;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.springframework.stereotype.Component; import org.springframework.stereotype.Component;
import java.text.ParseException;
import java.text.SimpleDateFormat; import java.text.SimpleDateFormat;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Date; import java.util.Date;
@@ -26,6 +28,7 @@ import java.util.regex.Pattern;
* *
* @author Administrator * @author Administrator
*/ */
@Slf4j
@Component @Component
@RequiredArgsConstructor @RequiredArgsConstructor
public class CrawlParser { public class CrawlParser {
@@ -34,8 +37,8 @@ public class CrawlParser {
private final CrawlHttpClient crawlHttpClient; private final CrawlHttpClient crawlHttpClient;
@SneakyThrows public void parseBook(RuleBean ruleBean, String bookId, CrawlBookHandler handler)
public void parseBook(RuleBean ruleBean, String bookId, CrawlBookHandler handler) { throws InterruptedException {
Book book = new Book(); Book book = new Book();
String bookDetailUrl = ruleBean.getBookDetailUrl().replace("{bookId}", bookId); String bookDetailUrl = ruleBean.getBookDetailUrl().replace("{bookId}", bookId);
String bookDetailHtml = crawlHttpClient.get(bookDetailUrl, ruleBean.getCharset()); String bookDetailHtml = crawlHttpClient.get(bookDetailUrl, ruleBean.getCharset());
@@ -120,8 +123,12 @@ public class CrawlParser {
if (isFindUpdateTime) { if (isFindUpdateTime) {
String updateTime = updateTimeMatch.group(1); String updateTime = updateTimeMatch.group(1);
//设置更新时间 //设置更新时间
book.setLastIndexUpdateTime( try {
new SimpleDateFormat(ruleBean.getUpadateTimeFormatPatten()).parse(updateTime)); book.setLastIndexUpdateTime(
new SimpleDateFormat(ruleBean.getUpadateTimeFormatPatten()).parse(updateTime));
} catch (ParseException e) {
log.error("解析最新章节更新时间出错", e);
}
} }
} }
@@ -144,7 +151,7 @@ public class CrawlParser {
} }
public boolean parseBookIndexAndContent(String sourceBookId, Book book, RuleBean ruleBean, public boolean parseBookIndexAndContent(String sourceBookId, Book book, RuleBean ruleBean,
Map<Integer, BookIndex> existBookIndexMap, CrawlBookChapterHandler handler) { Map<Integer, BookIndex> existBookIndexMap, CrawlBookChapterHandler handler) throws InterruptedException{
Date currentDate = new Date(); Date currentDate = new Date();

View File

@@ -74,10 +74,8 @@ public class StarterListener implements ServletContextInitializer {
needUpdateBook.getId()); needUpdateBook.getId());
//解析章节目录 //解析章节目录
crawlParser.parseBookIndexAndContent(needUpdateBook.getCrawlBookId(), book, crawlParser.parseBookIndexAndContent(needUpdateBook.getCrawlBookId(), book,
ruleBean, existBookIndexMap, chapter -> { ruleBean, existBookIndexMap, chapter -> bookService.updateBookAndIndexAndContent(book, chapter.getBookIndexList(),
bookService.updateBookAndIndexAndContent(book, chapter.getBookIndexList(), chapter.getBookContentList(), existBookIndexMap));
chapter.getBookContentList(), existBookIndexMap);
});
}); });
} catch (Exception e) { } catch (Exception e) {
log.error(e.getMessage(), e); log.error(e.getMessage(), e);

View File

@@ -53,7 +53,7 @@ public interface CrawlService {
* @param ruleBean 采集规则\ * @param ruleBean 采集规则\
* @return true:成功false:失败 * @return true:成功false:失败
* */ * */
boolean parseBookAndSave(int catId, RuleBean ruleBean, Integer sourceId, String bookId); boolean parseBookAndSave(int catId, RuleBean ruleBean, Integer sourceId, String bookId) throws InterruptedException;
/** /**
* 根据爬虫状态查询爬虫源集合 * 根据爬虫状态查询爬虫源集合

View File

@@ -249,6 +249,11 @@ public class CrawlServiceImpl implements CrawlService {
@Override @Override
public void parseBookList(int catId, RuleBean ruleBean, Integer sourceId) { public void parseBookList(int catId, RuleBean ruleBean, Integer sourceId) {
String catIdRule = ruleBean.getCatIdRule().get("catId" + catId);
if (StringUtils.isBlank(catIdRule)) {
return;
}
//当前页码1 //当前页码1
int page = 1; int page = 1;
int totalPage = page; int totalPage = page;
@@ -256,11 +261,7 @@ public class CrawlServiceImpl implements CrawlService {
while (page <= totalPage) { while (page <= totalPage) {
try { try {
String catIdRule = ruleBean.getCatIdRule().get("catId" + catId); String catBookListUrl;
if (StringUtils.isBlank(catIdRule) || Thread.currentThread().isInterrupted()) {
return;
}
String catBookListUrl = "";
if (StringUtils.isNotBlank(ruleBean.getBookListUrl())) { if (StringUtils.isNotBlank(ruleBean.getBookListUrl())) {
// 兼容老规则 // 兼容老规则
// 拼接分类URL // 拼接分类URL
@@ -290,6 +291,12 @@ public class CrawlServiceImpl implements CrawlService {
String bookId = bookIdMatcher.group(1); String bookId = bookIdMatcher.group(1);
parseBookAndSave(catId, ruleBean, sourceId, bookId); parseBookAndSave(catId, ruleBean, sourceId, bookId);
} catch (InterruptedException e) {
log.error(e.getMessage(), e);
//1.阻塞过程使用了 sleep,同步锁的 wait,socket 中的 receiver,accept 等方法时
//捕获中断异常InterruptedException来退出线程
//2.非阻塞过程中通过判断中断标志来退出线程
return;
} catch (Exception e) { } catch (Exception e) {
log.error(e.getMessage(), e); log.error(e.getMessage(), e);
} }
@@ -306,6 +313,12 @@ public class CrawlServiceImpl implements CrawlService {
} }
} }
} catch (InterruptedException e) {
log.error(e.getMessage(), e);
//1.阻塞过程使用了 sleep,同步锁的 wait,socket 中的 receiver,accept 等方法时
//捕获中断异常InterruptedException来退出线程
//2.非阻塞过程中通过判断中断标志来退出线程
return;
} catch (Exception e) { } catch (Exception e) {
log.error(e.getMessage(), e); log.error(e.getMessage(), e);
} }
@@ -317,8 +330,12 @@ public class CrawlServiceImpl implements CrawlService {
Thread.sleep(Duration.ofMinutes(1)); Thread.sleep(Duration.ofMinutes(1));
} catch (InterruptedException e) { } catch (InterruptedException e) {
log.error(e.getMessage(), e); log.error(e.getMessage(), e);
//1.阻塞过程使用了 sleep,同步锁的 wait,socket 中的 receiver,accept 等方法时
//捕获中断异常InterruptedException来退出线程
//2.非阻塞过程中通过判断中断标志来退出线程
return;
} }
}else{ } else {
page += 1; page += 1;
} }
} }
@@ -327,7 +344,8 @@ public class CrawlServiceImpl implements CrawlService {
} }
@Override @Override
public boolean parseBookAndSave(int catId, RuleBean ruleBean, Integer sourceId, String bookId) { public boolean parseBookAndSave(int catId, RuleBean ruleBean, Integer sourceId, String bookId)
throws InterruptedException {
final AtomicBoolean parseResult = new AtomicBoolean(false); final AtomicBoolean parseResult = new AtomicBoolean(false);
@@ -391,4 +409,5 @@ public class CrawlServiceImpl implements CrawlService {
.render(RenderingStrategies.MYBATIS3); .render(RenderingStrategies.MYBATIS3);
return crawlSourceMapper.selectMany(render); return crawlSourceMapper.selectMany(render);
} }
} }

View File

@@ -25,13 +25,9 @@ public class CrawlHttpClient {
private static final ThreadLocal<Integer> RETRY_COUNT = new ThreadLocal<>(); private static final ThreadLocal<Integer> RETRY_COUNT = new ThreadLocal<>();
public String get(String url, String charset) { public String get(String url, String charset) throws InterruptedException {
if (Objects.nonNull(intervalMin) && Objects.nonNull(intervalMax) && intervalMax > intervalMin) { if (Objects.nonNull(intervalMin) && Objects.nonNull(intervalMax) && intervalMax > intervalMin) {
try { Thread.sleep(random.nextInt(intervalMax - intervalMin + 1) + intervalMin);
Thread.sleep(random.nextInt(intervalMax - intervalMin + 1) + intervalMin);
} catch (InterruptedException e) {
log.error(e.getMessage(), e);
}
} }
String body = HttpUtil.getByHttpClientWithChrome(url, charset); String body = HttpUtil.getByHttpClientWithChrome(url, charset);
if (Objects.isNull(body) || body.length() < Constants.INVALID_HTML_LENGTH) { if (Objects.isNull(body) || body.length() < Constants.INVALID_HTML_LENGTH) {
@@ -41,7 +37,7 @@ public class CrawlHttpClient {
return body; return body;
} }
private String processErrorHttpResult(String url, String charset) { private String processErrorHttpResult(String url, String charset) throws InterruptedException{
Integer count = RETRY_COUNT.get(); Integer count = RETRY_COUNT.get();
if (count == null) { if (count == null) {
count = 0; count = 0;